From 89ad9186e831a6ae765583242ca065da6ce3330e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 04:35:00 +0000 Subject: [PATCH 001/106] torch wip --- python/ark/data_type.py | 33 +++++++++++++++++++++++---------- python/ark/torch_mock.py | 11 +++++++++++ 2 files changed, 34 insertions(+), 10 deletions(-) create mode 100644 python/ark/torch_mock.py diff --git a/python/ark/data_type.py b/python/ark/data_type.py index fe95d0d88..de64c1d7d 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -3,26 +3,29 @@ import numpy from . import _ark_core - +try: + import torch +except ImportError: + from . import torch_mock as torch _REGISTRY_DATA_TYPE = { - "fp32": {"np": numpy.float32}, - "fp16": {"np": numpy.float16}, - "bf16": {"np": None}, - "int32": {"np": numpy.int32}, - "uint32": {"np": numpy.uint32}, - "int8": {"np": numpy.int8}, - "uint8": {"np": numpy.uint8}, - "byte": {"np": numpy.ubyte}, + "fp32": {"np": numpy.float32, "torch": torch.float32}, + "fp16": {"np": numpy.float16, "torch": torch.float16}, + "bf16": {"np": None, "torch": torch.bfloat16}, + "int32": {"np": numpy.int32, "torch": torch.int32}, + "uint32": {"np": numpy.uint32, "torch": None}, + "int8": {"np": numpy.int8, "torch": torch.int8}, + "uint8": {"np": numpy.uint8, "torch": torch.uint8}, + "byte": {"np": numpy.ubyte, "torch": torch.uint8}, } - class MetaDataType(type): def __new__(cls, name, bases, attrs): new_class = super().__new__(cls, name, bases, attrs) if name in _REGISTRY_DATA_TYPE: reg = _REGISTRY_DATA_TYPE[name] new_class.to_numpy = staticmethod(lambda: reg["np"]) + new_class.to_torch = staticmethod(lambda: reg["torch"]) new_class.ctype = staticmethod( lambda: getattr(_ark_core, name.upper()) ) @@ -104,6 +107,16 @@ def to_numpy() -> numpy.dtype: """ ... + @staticmethod + def to_torch() -> torch.dtype: + """ + Return the corresponding torch data type. + + Returns: + torch.dtype: The corresponding torch data type. + """ + ... + @staticmethod def ctype() -> _ark_core._DataType: """ diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py new file mode 100644 index 000000000..e58a3eda8 --- /dev/null +++ b/python/ark/torch_mock.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +class dtype: ... +class float32: ... +class float16: ... +class bfloat16: ... +class int32: ... +class int8: ... +class uint8: ... +class ubyte: ... 
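The patch above registers a torch dtype next to the numpy dtype for every ARK data type, falling back to the stub classes in torch_mock when torch is not installed. A minimal sketch of the conversions this enables — illustrative only, assuming an ark build that contains this patch and that the dtype classes (fp16, bf16, uint32, ...) are re-exported from the package root:

    import numpy
    import torch
    import ark

    # Registry lookups added by this patch: each ARK dtype knows its
    # numpy and torch equivalents.
    assert ark.fp16.to_numpy() is numpy.float16
    assert ark.fp16.to_torch() is torch.float16

    # Entries with no counterpart are registered as None.
    assert ark.bf16.to_numpy() is None    # numpy has no bfloat16
    assert ark.uint32.to_torch() is None  # no torch uint32 in this registry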
From ab1998ecef18116bd92f4ea91b14c69becc66655 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 26 May 2024 21:43:10 -0700 Subject: [PATCH 002/106] Update ut-cuda.yml --- .github/workflows/ut-cuda.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index e938ca877..5a78818ff 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -7,6 +7,8 @@ on: pull_request: branches: - main + types: + - ready_for_review jobs: UnitTest: From ece4f553f62dc2da591321be3f7d5e34bff2c80d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 07:24:41 +0000 Subject: [PATCH 003/106] torch wip --- python/ark/data_type.py | 2 ++ python/ark/module.py | 33 ++++++++++++++++++++++++++++----- python/ark/tensor.py | 35 +++++++++++++++++++++++++++++++++++ python/ark/torch_mock.py | 18 ++++++++++++++++++ 4 files changed, 83 insertions(+), 5 deletions(-) diff --git a/python/ark/data_type.py b/python/ark/data_type.py index de64c1d7d..f5ccd9e5b 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -3,6 +3,7 @@ import numpy from . import _ark_core + try: import torch except ImportError: @@ -19,6 +20,7 @@ "byte": {"np": numpy.ubyte, "torch": torch.uint8}, } + class MetaDataType(type): def __new__(cls, name, bases, attrs): new_class = super().__new__(cls, name, bases, attrs) diff --git a/python/ark/module.py b/python/ark/module.py index 62b941281..459beeda6 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -3,9 +3,14 @@ import logging import numpy as np -from typing import Any, Dict +from typing import Any, Dict, Union from .tensor import Parameter +try: + import torch +except ImportError: + from . import torch_mock as torch + class Module: """ @@ -57,7 +62,9 @@ def params_dict(self, prefix="") -> Dict[str, Parameter]: return params_dict def load_state_dict( - self, state_dict: Dict[str, np.ndarray], prefix: str = "" + self, + state_dict: Dict[str, Union[np.ndarray, torch.Tensor]], + prefix: str = "", ): """ Loads a model from a state_dict and copy the parameters to the device GPU. @@ -68,20 +75,36 @@ def load_state_dict( all_keys = set(state_dict.keys()) pd = self.params_dict(prefix) for name, param in pd.items(): - param.from_numpy(state_dict[name]) + data = state_dict.get(name, None) + if isinstance(data, np.ndarray): + param.from_numpy(data) + elif isinstance(data, torch.Tensor): + param.from_torch(data) + else: + continue all_keys.remove(name) if all_keys: logging.warning( f"{len(all_keys)} unused parameter(s) in state_dict" ) - def state_dict(self, prefix: str = "") -> Dict[str, np.ndarray]: + def state_dict( + self, prefix: str = "", mode: str = "numpy" + ) -> Dict[str, Union[np.ndarray, torch.Tensor]]: """ Copies the parameters from the device GPU to the host and saves the model to a state_dict. Must be called after the executor is launched. """ - return {k: v.to_numpy() for k, v in self.params_dict(prefix).items()} + if mode == "numpy": + return { + k: v.to_numpy() for k, v in self.params_dict(prefix).items() + } + elif mode == "torch": + return { + k: v.to_torch() for k, v in self.params_dict(prefix).items() + } + raise ValueError(f"Unsupported mode: {mode}") def forward(self, *args: Any, **kwargs: Any) -> Any: ... 
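With the module.py changes above, load_state_dict accepts numpy arrays and torch tensors interchangeably, and state_dict can emit either via the mode argument. A hedged usage sketch — the module instance m and the parameter name "weight" are hypothetical, and a launched Runtime is required before parameters can be copied to the device (see the tensor.py diff below for the underlying from_torch/to_torch plumbing):

    import numpy as np
    import torch

    # torch tensors and numpy arrays go through the same loading path.
    m.load_state_dict({"weight": torch.ones(64, dtype=torch.float16)})
    m.load_state_dict({"weight": np.ones(64, dtype=np.float16)})

    # Read parameters back in either representation.
    torch_state = m.state_dict(mode="torch")
    numpy_state = m.state_dict(mode="numpy")  # the default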
diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 316d18566..625f82bce 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -8,6 +8,15 @@ from .data_type import DataType from .runtime import Runtime +try: + import torch + + _no_torch = False +except ImportError: + from . import torch_mock as torch + + _no_torch = True + NullTensor = _NullTensor @@ -89,6 +98,32 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor": rt.executor.tensor_write(self._tensor, ndarray) return self + def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: + """ """ + if _no_torch: + raise ImportError("torch is not available") + torch_type = self.dtype().to_torch() + if tensor is None: + return torch.from_numpy(self.to_numpy()) + elif tensor.shape != self.shape(): + raise ValueError("torch tensor shape does not match the tensor") + elif tensor.dtype != torch_type: + raise ValueError("torch tensor dtype does not match the tensor") + elif not tensor.is_contiguous(): + raise ValueError("torch tensor is not contiguous in memory") + elif tensor.numel() != self.nelems(): + raise ValueError("torch tensor size does not match the tensor") + tensor.copy_(torch.from_numpy(self.to_numpy())) + return tensor + + def from_torch(self, tensor: torch.Tensor) -> "Tensor": + """ """ + if _no_torch: + raise ImportError("torch is not available") + if tensor.is_cuda: + tensor = tensor.cpu() + return self.from_numpy(tensor.numpy()) + class Parameter(Tensor): """ diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py index e58a3eda8..68333e431 100644 --- a/python/ark/torch_mock.py +++ b/python/ark/torch_mock.py @@ -1,11 +1,29 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. + class dtype: ... + + class float32: ... + + class float16: ... + + class bfloat16: ... + + class int32: ... + + class int8: ... + + class uint8: ... + + class ubyte: ... + + +class Tensor: ... From 952b7610c31288cc8851aa6466461f2ba7a2393f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 23:14:40 +0000 Subject: [PATCH 004/106] runtime module --- ark/api/planner.cpp | 4 +- examples/tutorial/torch_tutorial.py | 23 ++++++++ python/ark/__init__.py | 2 +- python/ark/data_type.py | 22 +++++++ python/ark/module.py | 71 +++++++++++++++++++++- python/ark/tensor.py | 91 ++++++++++++++++++++--------- 6 files changed, 181 insertions(+), 32 deletions(-) create mode 100644 examples/tutorial/torch_tutorial.py diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index ad5048c0e..5c9d09f2e 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -56,8 +56,8 @@ static void check_config_field(const ModelOpRef op, const Json &config, std::string DefaultPlanner::Impl::plan(bool pretty) const { const auto gpu_info = GpuManager::get_instance(gpu_id_)->info(); size_t num_sm = gpu_info.num_sm; - Json task_infos; - Json processor_groups; + Json task_infos = Json::array(); + Json processor_groups = Json::array(); size_t max_num_warps = 1; size_t max_num_processors = 1; size_t next_node_id = 0; diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py new file mode 100644 index 000000000..5677d41cd --- /dev/null +++ b/examples/tutorial/torch_tutorial.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import ark +import torch + + +class ArkAddModule(ark.RuntimeModule): + def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor: + return ark.add(x, y) + +# ARK module for addition +module = ArkAddModule() + +# Define two torch arrays +x = torch.ones(64) * 2 +y = torch.ones(64) * 3 + +# Run the ARK module +z = module(x, y) + +# Print the result +print(z) diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 92e9c39c3..2a4d164e4 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -38,7 +38,7 @@ def set_world_size(world_size): from .init import init from .tensor import Dims, Tensor, Parameter -from .module import Module +from .module import Module, RuntimeModule from .runtime import Runtime, DefaultPlanner from .serialize import save, load from .data_type import ( diff --git a/python/ark/data_type.py b/python/ark/data_type.py index f5ccd9e5b..8ab982106 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -64,6 +64,28 @@ def from_numpy(np_type: numpy.dtype) -> "DataType": f" to ark data type." ) + @staticmethod + def from_torch(torch_type: torch.dtype) -> "DataType": + """ + Return the corresponding ark data type. + + Parameters: + torch_type (torch.dtype): The torch data type. + + Returns: + DataType: The corresponding ark data type. + + Raises: + ValueError: If there is no defined conversion from torch data type to ark data type. + """ + for type_name, reg in _REGISTRY_DATA_TYPE.items(): + if reg["torch"] == torch_type: + return DataType.from_name(type_name) + raise ValueError( + f"Undefined conversion from torch data type {torch_type}" + f" to ark data type." + ) + @staticmethod def from_name(type_name: str) -> "DataType": """ diff --git a/python/ark/module.py b/python/ark/module.py index 459beeda6..b7919d2cd 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -3,14 +3,19 @@ import logging import numpy as np -from typing import Any, Dict, Union -from .tensor import Parameter +from typing import Any, Dict, List, Union +from .tensor import Tensor, Parameter +from .runtime import Runtime, DefaultPlanner try: import torch + + _no_torch = False except ImportError: from . import torch_mock as torch + _no_torch = True + class Module: """ @@ -109,3 +114,65 @@ def state_dict( def forward(self, *args: Any, **kwargs: Any) -> Any: ... def backward(self, *args: Any, **kwargs: Any) -> Any: ... + + def initialize(self): + for param in self.parameters.values(): + param.initialize() + for module in self.sub_modules.values(): + module.initialize() + + +def _recursive_ark_to_torch(object): + if isinstance(object, Tensor): + return object.to_torch() + if isinstance(object, dict): + return {k: _recursive_ark_to_torch(v) for k, v in object.items()} + if isinstance(object, list): + return [_recursive_ark_to_torch(v) for v in object] + return object + + +class RuntimeModule(Module): + def __init__(self): + if _no_torch: + raise ImportError("torch is not available") + super().__init__() + self.built_forward = False + self.built_backward = False + self.forward_input_tensor_args: List[Tensor] = [] + self.forward_input_tensor_kwargs: Dict[str, Tensor] = {} + self.forward_output = None + self.backward_tensor_args = [] + self.backward_tensor_kwargs = {} + + def build_forward(self, *args: Any, **kwargs: Any) -> Any: ... + + def build_backward(self, *args: Any, **kwargs: Any) -> Any: ... 
+ + def forward(self, *args: Any, **kwargs: Any) -> Any: + if not self.built_forward: + for arg in args: + if isinstance(arg, torch.Tensor): + self.forward_input_tensor_args.append( + Tensor.from_torch(arg) + ) + for key, value in kwargs.items(): + if isinstance(value, torch.Tensor): + self.forward_input_tensor_kwargs[key] = Tensor.from_torch( + value + ) + self.forward_output = self.build_forward( + *self.forward_input_tensor_args, + **self.forward_input_tensor_kwargs, + ) + self.built_forward = True + + with Runtime.get_runtime() as rt: + rt.launch(plan=DefaultPlanner().plan()) + for arg in self.forward_input_tensor_args: + arg.initialize() + for value in self.forward_input_tensor_kwargs.values(): + value.initialize() + + rt.run() + return _recursive_ark_to_torch(self.forward_output) diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 625f82bce..f264bb440 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -2,11 +2,12 @@ # Licensed under the MIT license. import numpy as np -from typing import List +from typing import Callable, List, Union, Type from _ark_core import _Dims, _Tensor, _NullTensor from .data_type import DataType from .runtime import Runtime +from .model import Model try: import torch @@ -24,14 +25,19 @@ class Dims(_Dims): pass +Initializer = Type[Callable[[], Union[torch.Tensor, np.ndarray]]] + + class Tensor: - def __init__(self, _tensor: _Tensor): + def __init__(self, _tensor: _Tensor, initializer: Initializer = None): """ Initializes a new instance of the Tensor class. Args: _tensor (_ark_core._Tensor): The underlying _Tensor object. """ self._tensor = _tensor + self.initializer: Initializer = initializer + Model.get_model().add_tensor(self) def shape(self) -> List[int]: """ @@ -80,24 +86,6 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: rt.executor.tensor_read(self._tensor, ndarray) return ndarray - def from_numpy(self, ndarray: np.ndarray) -> "Tensor": - """ - Copies the tensor from a host numpy array to the device. - """ - rt = Runtime.get_runtime() - if not rt.launched(): - raise RuntimeError( - "Tensor is not allocated yet. `Tensor.from_numpy()` is " - "usable only after you call `Runtime.launch()`." 
- ) - ndarray = ndarray.astype(self.dtype().to_numpy()) - if not ndarray.flags["C_CONTIGUOUS"]: - ndarray = np.ascontiguousarray(ndarray) - if ndarray.nbytes != self.nelems() * self.dtype().element_size(): - raise ValueError("ndarray size does not match the tensor") - rt.executor.tensor_write(self._tensor, ndarray) - return self - def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: """ """ if _no_torch: @@ -116,13 +104,62 @@ def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: tensor.copy_(torch.from_numpy(self.to_numpy())) return tensor - def from_torch(self, tensor: torch.Tensor) -> "Tensor": - """ """ - if _no_torch: - raise ImportError("torch is not available") - if tensor.is_cuda: - tensor = tensor.cpu() - return self.from_numpy(tensor.numpy()) + @staticmethod + def from_numpy(ndarray: np.ndarray): + return Tensor( + Model.get_model().tensor( + Dims(list(ndarray.shape)), + DataType.from_numpy(ndarray.dtype).ctype(), + Dims(), + Dims(), + Dims(), + "", + ), + lambda: ndarray, + ) + + @staticmethod + def from_torch(tensor: torch.Tensor): + return Tensor( + Model.get_model().tensor( + Dims(list(tensor.shape)), + DataType.from_torch(tensor.dtype).ctype(), + Dims(), + Dims(), + Dims(), + "", + ), + lambda: tensor, + ) + + def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": + """ + Copies the tensor from a host numpy array to the device. + """ + rt = Runtime.get_runtime() + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.from_numpy()` is " + "usable only after you call `Runtime.launch()`." + ) + if isinstance(data, torch.Tensor): + data = data.cpu().numpy() + data = data.astype(self.dtype().to_numpy()) + if not data.flags["C_CONTIGUOUS"]: + data = np.ascontiguousarray(data) + if data.nbytes != self.nelems() * self.dtype().element_size(): + raise ValueError("data size does not match the tensor") + rt.executor.tensor_write(self._tensor, data) + return self + + def initialize(self) -> "Tensor": + """ + Initializes the tensor. 
+ """ + if self.initializer is not None: + data = self.initializer() + self.copy(data) + return self class Parameter(Tensor): From a40926812f7b02f02e1e48a981c65e21c4dadfaa Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 23:20:44 +0000 Subject: [PATCH 005/106] fix --- python/ark/tensor.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/ark/tensor.py b/python/ark/tensor.py index f264bb440..5168791a8 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -37,7 +37,6 @@ def __init__(self, _tensor: _Tensor, initializer: Initializer = None): """ self._tensor = _tensor self.initializer: Initializer = initializer - Model.get_model().add_tensor(self) def shape(self) -> List[int]: """ From 8e4622707b34cd4a71579bd65d7ba484e2424969 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 27 May 2024 23:52:16 +0000 Subject: [PATCH 006/106] fix --- ark/include/kernels/kernel_template.in | 5 ++++- examples/tutorial/torch_tutorial.py | 6 +++++- python/ark/module.py | 20 +++++++++++++------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in index bc842ea4a..5bba320a5 100644 --- a/ark/include/kernels/kernel_template.in +++ b/ark/include/kernels/kernel_template.in @@ -59,9 +59,12 @@ void @NAME@(int *_iter) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); ark_loop_body(_buf, _i); } + if (threadIdx.x == 0) { + __threadfence_system(); + } + sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); if (threadIdx.x == 0 && blockIdx.x == 0) { atomicStoreRelaxed(_iter, 0); } - sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); } } diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py index 5677d41cd..e9482a7cc 100644 --- a/examples/tutorial/torch_tutorial.py +++ b/examples/tutorial/torch_tutorial.py @@ -9,6 +9,7 @@ class ArkAddModule(ark.RuntimeModule): def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor: return ark.add(x, y) + # ARK module for addition module = ArkAddModule() @@ -19,5 +20,8 @@ def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor: # Run the ARK module z = module(x, y) +w = module(x, z) + # Print the result -print(z) +print(z) # 5 +print(w) # 7 diff --git a/python/ark/module.py b/python/ark/module.py index b7919d2cd..a266f522d 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -6,6 +6,8 @@ from typing import Any, Dict, List, Union from .tensor import Tensor, Parameter from .runtime import Runtime, DefaultPlanner +from .ops import tensor +from .data_type import DataType try: import torch @@ -154,12 +156,16 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: for arg in args: if isinstance(arg, torch.Tensor): self.forward_input_tensor_args.append( - Tensor.from_torch(arg) + tensor( + list(arg.shape), + DataType.from_torch(arg.dtype), + ) ) for key, value in kwargs.items(): if isinstance(value, torch.Tensor): - self.forward_input_tensor_kwargs[key] = Tensor.from_torch( - value + self.forward_input_tensor_kwargs[key] = tensor( + list(value.shape), + DataType.from_torch(value.dtype), ) self.forward_output = self.build_forward( *self.forward_input_tensor_args, @@ -169,10 +175,10 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: with Runtime.get_runtime() as rt: rt.launch(plan=DefaultPlanner().plan()) - for arg in self.forward_input_tensor_args: - arg.initialize() - for value in self.forward_input_tensor_kwargs.values(): - value.initialize() + for tns, arg in zip(self.forward_input_tensor_args, args): + 
tns.copy(arg) + for key, value in self.forward_input_tensor_kwargs.items(): + value.copy(kwargs[key]) rt.run() return _recursive_ark_to_torch(self.forward_output) From eee7ec2b4bb1cde335e99d780657c70e497542c9 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 28 May 2024 19:00:09 +0000 Subject: [PATCH 007/106] some fixes --- python/ark/module.py | 23 ++++++++++++++++------- python/ark/tensor.py | 28 +++++++++++++++++++++------- python/executor_py.cpp | 15 ++++++++++++++- 3 files changed, 51 insertions(+), 15 deletions(-) diff --git a/python/ark/module.py b/python/ark/module.py index a266f522d..faeeea40d 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -83,12 +83,9 @@ def load_state_dict( pd = self.params_dict(prefix) for name, param in pd.items(): data = state_dict.get(name, None) - if isinstance(data, np.ndarray): - param.from_numpy(data) - elif isinstance(data, torch.Tensor): - param.from_torch(data) - else: + if data is None: continue + param.copy(data) all_keys.remove(name) if all_keys: logging.warning( @@ -143,6 +140,8 @@ def __init__(self): self.built_backward = False self.forward_input_tensor_args: List[Tensor] = [] self.forward_input_tensor_kwargs: Dict[str, Tensor] = {} + self.forward_input_args = [] + self.forward_input_kwargs = {} self.forward_output = None self.backward_tensor_args = [] self.backward_tensor_kwargs = {} @@ -161,15 +160,25 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: DataType.from_torch(arg.dtype), ) ) + self.forward_input_args.append( + self.forward_input_tensor_args[-1] + ) + else: + self.forward_input_args.append(arg) for key, value in kwargs.items(): if isinstance(value, torch.Tensor): self.forward_input_tensor_kwargs[key] = tensor( list(value.shape), DataType.from_torch(value.dtype), ) + self.forward_input_kwargs[key] = ( + self.forward_input_tensor_kwargs[key] + ) + else: + self.forward_input_kwargs[key] = value self.forward_output = self.build_forward( - *self.forward_input_tensor_args, - **self.forward_input_tensor_kwargs, + *self.forward_input_args, + **self.forward_input_kwargs, ) self.built_forward = True diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 5168791a8..a567264d5 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -142,13 +142,27 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": "usable only after you call `Runtime.launch()`." 
            )
         if isinstance(data, torch.Tensor):
-            data = data.cpu().numpy()
-        data = data.astype(self.dtype().to_numpy())
-        if not data.flags["C_CONTIGUOUS"]:
-            data = np.ascontiguousarray(data)
-        if data.nbytes != self.nelems() * self.dtype().element_size():
-            raise ValueError("data size does not match the tensor")
-        rt.executor.tensor_write(self._tensor, data)
+            if data.dtype != self.dtype().to_torch():
+                raise ValueError("data dtype does not match the tensor")
+            if not data.is_contiguous():
+                data = data.contiguous()
+            if data.numel() != self.nelems():
+                raise ValueError("data size does not match the tensor")
+            rt.executor.tensor_write(
+                self._tensor,
+                data.data_ptr(),
+                data.numel() * data.element_size(),
+            )
+        elif isinstance(data, np.ndarray):
+            if data.dtype != self.dtype().to_numpy():
+                raise ValueError("data dtype does not match the tensor")
+            if not data.flags["C_CONTIGUOUS"]:
+                data = np.ascontiguousarray(data)
+            if data.nbytes != self.nelems() * self.dtype().element_size():
+                raise ValueError("data size does not match the tensor")
+            rt.executor.tensor_write(self._tensor, data)
+        else:
+            raise ValueError("data must be a numpy array or a torch tensor")
         return self
 
     def initialize(self) -> "Tensor":
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index dc2840329..13a81608e 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -17,6 +17,11 @@ static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
                          info.size * info.itemsize);
 }
 
+static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor,
+                         size_t host_address, size_t bytes) {
+    exe->tensor_write(tensor, reinterpret_cast<void *>(host_address), bytes);
+}
+
 static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor,
                         py::buffer host_buffer) {
     py::buffer_info info = host_buffer.request();
@@ -39,5 +44,13 @@ void register_executor(py::module &m) {
         .def("destroy", &ark::Executor::destroy)
         .def("destroyed", &ark::Executor::destroyed)
         .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data"))
-        .def("tensor_write", &tensor_write, py::arg("tensor"), py::arg("data"));
+        .def(
+            "tensor_write",
+            py::overload_cast<ark::Executor *, const ark::Tensor &,
+                              py::buffer>(&tensor_write),
+            py::arg("tensor"), py::arg("data"))
+        .def("tensor_write",
+             py::overload_cast<ark::Executor *, const ark::Tensor &, size_t,
+                               size_t>(&tensor_write),
+             py::arg("tensor"), py::arg("address"), py::arg("bytes"));
 }

From 87b9b0127de668f810847d04d4c2a08178439ee0 Mon Sep 17 00:00:00 2001
From: Noli Gerawork <86308445+naturalcandy@users.noreply.github.com>
Date: Tue, 18 Jun 2024 11:20:45 -0400
Subject: [PATCH 008/106] Python API Multiple Runtime Support (#216)

- Introduced support for multiple Runtime instances
- Added utility functions for multi-runtime management
- Ensured backward compatibility with existing usage patterns of Runtime
- Added unit tests for multi-runtime functionality

---------

Co-authored-by: noli
---
 ark/api/executor.cpp               | 101 +++++++++++++++++++++
 ark/include/ark/executor.hpp       |   6 ++
 python/ark/init.py                 |   5 +-
 python/ark/ops.py                  | 138 ++++++++++++++++++++++------
 python/ark/runtime.py              | 139 +++++++++++++++++++++++------
 python/ark/tensor.py               |  69 ++++++++++----
 python/executor_py.cpp             |  30 ++++++-
 python/unittest/test.py            |   1 +
 python/unittest/test_conversion.py |  93 +++++++++++++++++++
 python/unittest/test_runtime.py    | 121 ++++++++++++++++++++++---
 10 files changed, 610 insertions(+), 93 deletions(-)
 create mode 100644 python/unittest/test_conversion.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 198d22e51..a0711bfe8 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -3,12 +3,15 @@
 
 #include "ark/executor.hpp"
 
+#include <dlpack/dlpack.h>
+
 #include
 #include
 #include
 #include
 #include
 
+#include "ark/data_type.hpp"
 #include "ark/model.hpp"
 #include "ark/planner.hpp"
 #include "codegen.hpp"
@@ -154,6 +157,8 @@ class Executor::Impl {
     void tensor_read(const Tensor tensor, void *data, size_t bytes) const;
     void tensor_write(const Tensor tensor, const void *data,
                       size_t bytes) const;
+    DLDeviceType get_device_type() const;
+    DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
 
    private:
     void init_communicator();
@@ -783,6 +788,94 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
     copy_stream_->sync();
 }
 
+DLDeviceType Executor::Impl::get_device_type() const {
+#if defined(ARK_CUDA)
+    return kDLCUDA;
+#elif defined(ARK_ROCM)
+    return kDLROCM;
+#else
+    return kDLCPU;
+#endif
+}
+
+DLDataType get_dl_dtype(const DataType &ark_data_type) {
+    DLDataType dl_data_type;
+    dl_data_type.lanes = 1;
+    if (ark_data_type == FP32) {
+        dl_data_type.code = kDLFloat;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == FP16) {
+        dl_data_type.code = kDLFloat;
+        dl_data_type.bits = 16;
+    } else if (ark_data_type == BF16) {
+        dl_data_type.code = kDLBfloat;
+        dl_data_type.bits = 16;
+    } else if (ark_data_type == INT32) {
+        dl_data_type.code = kDLInt;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == UINT32) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 32;
+    } else if (ark_data_type == INT8) {
+        dl_data_type.code = kDLInt;
+        dl_data_type.bits = 8;
+    } else if (ark_data_type == UINT8) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 8;
+    } else if (ark_data_type == BYTE) {
+        dl_data_type.code = kDLUInt;
+        dl_data_type.bits = 8;
+    } else {
+        ERR(InvalidUsageError, "Unsupported data type");
+    }
+    return dl_data_type;
+}
+
+DLManagedTensor *Executor::Impl::get_dl_tensor(const Tensor &tensor) const {
+    DLTensor dl_tensor;
+    dl_tensor.data =
+        buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id()));
+    size_t offset_in_elements =
+        tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0];
+    dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes();
+    dl_tensor.device.device_type = get_device_type();
+    dl_tensor.device.device_id = static_cast<int32_t>(gpu_id_);
+    dl_tensor.ndim = static_cast<int32_t>(tensor.shape().ndims());
+    dl_tensor.dtype = get_dl_dtype(tensor.data_type());
+
+    dl_tensor.shape =
+        tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
+    dl_tensor.strides =
+        tensor.strides().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim];
+    auto shape = tensor.shape();
+    if (dl_tensor.shape) {
+        for (int i = 0; i < dl_tensor.ndim; ++i) {
+            dl_tensor.shape[i] = shape[i];
+        }
+    }
+    if (dl_tensor.strides) {
+        dl_tensor.strides[dl_tensor.ndim - 1] = 1;
+        for (int i = dl_tensor.ndim - 2; i >= 0; --i) {
+            dl_tensor.strides[i] =
+                dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1];
+        }
+    }
+    DLManagedTensor *dl_managed_tensor = new DLManagedTensor();
+    dl_managed_tensor->dl_tensor = dl_tensor;
+    dl_managed_tensor->manager_ctx = nullptr;
+    dl_managed_tensor->deleter = [](DLManagedTensor *self) {
+        if (self->dl_tensor.shape) {
+            delete[] self->dl_tensor.shape;
+            self->dl_tensor.shape = nullptr;
+        }
+        if (self->dl_tensor.strides) {
+            delete[] self->dl_tensor.strides;
+            self->dl_tensor.strides = nullptr;
+        }
+    };
+    return dl_managed_tensor;
+}
+
 Executor::Executor(int rank, int world_size, int gpu_id,
                    const std::string &name, const std::string &plan)
     : impl_(std::make_unique<Impl>(rank, world_size, gpu_id, name,
@@ -818,6 +911,14 @@ void Executor::tensor_write(const Tensor tensor, const void *data,
     impl_->tensor_write(tensor, data, bytes);
 }
 
+DLDeviceType Executor::get_device_type() const {
+    return impl_->get_device_type();
+}
+
+DLManagedTensor *Executor::get_dl_tensor(const Tensor &tensor) const {
+    return impl_->get_dl_tensor(tensor);
+}
+
 DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id,
                                  const std::string &name)
     : Executor(
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 4682af7d0..54c49cd29 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -4,6 +4,8 @@
 #ifndef ARK_EXECUTOR_HPP
 #define ARK_EXECUTOR_HPP
 
+#include <dlpack/dlpack.h>
+
 #include
 #include
 #include
@@ -62,6 +64,10 @@ class Executor {
     void tensor_write(const Tensor tensor, const void *data,
                       size_t bytes) const;
 
+    DLManagedTensor *get_dl_tensor(const Tensor &tensor) const;
+
+    DLDeviceType get_device_type() const;
+
    private:
     class Impl;
     std::unique_ptr<Impl> impl_;
diff --git a/python/ark/init.py b/python/ark/init.py
index be71e8e02..dbf7c1569 100644
--- a/python/ark/init.py
+++ b/python/ark/init.py
@@ -9,7 +9,6 @@ def init():
     """Initializes ARK."""
     Model.reset()
-    if _RuntimeState.executor is not None:
-        if not _RuntimeState.executor.destroyed():
-            _RuntimeState.executor.destroy()
+    if _RuntimeState.runtime:
+        _RuntimeState.delete_all()
     _ark_core.init()
diff --git a/python/ark/ops.py b/python/ark/ops.py
index bc1c3ed13..86b021aef 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -59,6 +59,8 @@ def add(
         tensor_add = ark.add(tensor1, tensor2)
     """
     if isinstance(input, Tensor) and isinstance(other, Tensor):
+        if input.runtime_id != other.runtime_id:
+            raise ValueError("Tensors must be on the same runtime")
         a = input._tensor
         b = other._tensor
     elif isinstance(input, Tensor):
@@ -75,7 +77,9 @@
     )
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().add(a, b, output, name))
+    return Tensor(
+        Model.get_model().add(a, b, output, name), runtime_id=input.runtime_id
+    )
 
 
 def cast(
@@ -88,7 +92,8 @@
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().cast(input._tensor, dtype.ctype(), output, name)
+        Model.get_model().cast(input._tensor, dtype.ctype(), output, name),
+        runtime_id=input.runtime_id,
     )
 
 
 def constant(
@@ -97,10 +102,12 @@
     shape: Iterable[int],
     dtype: DataType = fp32,
     name: str = "constant",
+    runtime_id: int = -1,
 ) -> Tensor:
     """Constant."""
     return Tensor(
-        Model.get_model().constant(value, Dims(shape), dtype.ctype(), name)
+        Model.get_model().constant(value, Dims(shape), dtype.ctype(), name),
+        runtime_id=runtime_id,
     )
 
 
@@ -112,7 +119,10 @@ def copy(
         output = output._tensor
     if isinstance(input, Tensor):
         input = input._tensor
-    return Tensor(Model.get_model().copy(input, output, name))
+    return Tensor(
+        Model.get_model().copy(input, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def div(
@@ -130,8 +140,13 @@ def div(
     if output is not NullTensor:
         output = output._tensor
     if isinstance(other, Tensor):
+        if input.runtime_id != other.runtime_id:
+            raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
-    return Tensor(Model.get_model().div(input._tensor, other, output, name))
+    return Tensor(
+        Model.get_model().div(input._tensor, other, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def embedding(
@@ -141,10 +156,15 @@ def embedding(
     name: str = "embedding",
 ) -> Tensor:
     """Embedding layer."""
+    if input.runtime_id != weight.runtime_id:
+        raise ValueError("Tensors must be on the same runtime")
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
-        Model.get_model().embedding(input._tensor, weight._tensor, output, name)
+        Model.get_model().embedding(
+            input._tensor, weight._tensor, output, name
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
 def exp(
@@ -158,7 +178,10 @@ def exp(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().exp(input._tensor, output, name))
+    return Tensor(
+        Model.get_model().exp(input._tensor, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def gelu(
@@ -174,7 +197,10 @@ def gelu(
     """
     if output is not NullTensor:
         output = output._tensor
-    return Tensor(Model.get_model().gelu(input._tensor, output, name))
+    return Tensor(
+        Model.get_model().gelu(input._tensor, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def identity(
@@ -189,8 +215,13 @@ def identity(
     for dep in deps:
         if not isinstance(dep, Tensor):
             raise TypeError("All dependencies should be a tensor")
+        if input.runtime_id != dep.runtime_id:
+            raise ValueError("All tensors must be on the same runtime")
         dep_tensors.append(dep._tensor)
-    return Tensor(Model.get_model().identity(input._tensor, dep_tensors, name))
+    return Tensor(
+        Model.get_model().identity(input._tensor, dep_tensors, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def matmul(
@@ -210,6 +241,8 @@ def matmul(
     Usage:
         tensor_matmul = ark.matmul(tensor1, tensor2)
     """
+    if input.runtime_id != other.runtime_id:
+        raise ValueError("Tensors must be on the same runtime")
     if output is not NullTensor:
         output = output._tensor
     return Tensor(
@@ -220,7 +253,8 @@ def matmul(
             transpose_input,
             transpose_other,
             name,
-        )
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
 def mul(
@@ -239,8 +273,13 @@ def mul(
     if output is not NullTensor:
         output = output._tensor
     if isinstance(other, Tensor):
+        if input.runtime_id != other.runtime_id:
+            raise ValueError("Tensors must be on the same runtime")
         other = other._tensor
-    return Tensor(Model.get_model().mul(input._tensor, other, output, name))
+    return Tensor(
+        Model.get_model().mul(input._tensor, other, output, name),
+        runtime_id=input.runtime_id,
+    )
 
 
 def noop(input: Tensor, name: str = "noop"):
@@ -268,7 +307,8 @@ def reduce_max(
     return Tensor(
         Model.get_model().reduce_max(
             input._tensor, axis, keepdims, output, name
-        )
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -290,7 +330,8 @@ def reduce_mean(
     return Tensor(
         Model.get_model().reduce_mean(
             input._tensor, axis, keepdims, output, name
-        )
+        ),
+        runtime_id=input.runtime_id,
     )
 
 
@@ -314,7 +355,8 @@ def reduce_sum(
return Tensor( Model.get_model().reduce_sum( input._tensor, axis, keepdims, output, name - ) + ), + runtime_id=input.runtime_id, ) @@ -329,7 +371,10 @@ def relu( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().relu(input._tensor, output, name)) + return Tensor( + Model.get_model().relu(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def reshape( @@ -357,7 +402,8 @@ def reshape( if len(shape) > 4: raise ValueError("Only support tensors with up to 4 dimensions") return Tensor( - Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name) + Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name), + runtime_id=input.runtime_id, ) @@ -374,8 +420,11 @@ def rope( """ if output is not NullTensor: output = output._tensor + if input.runtime_id != other.runtime_id: + raise ValueError("Tensors must be on the same runtime") return Tensor( - Model.get_model().rope(input._tensor, other._tensor, output, name) + Model.get_model().rope(input._tensor, other._tensor, output, name), + runtime_id=input.runtime_id, ) @@ -389,7 +438,10 @@ def rsqrt( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().rsqrt(input._tensor, output, name)) + return Tensor( + Model.get_model().rsqrt(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def sharding( @@ -407,7 +459,9 @@ def sharding( _tensor_list = Model.get_model().sharding( input._tensor, axis, dim_per_shard, name ) - return [Tensor(_tensor) for _tensor in _tensor_list] + return [ + Tensor(_tensor, runtime_id=input.runtime_id) for _tensor in _tensor_list + ] def sigmoid( @@ -421,7 +475,10 @@ def sigmoid( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().sigmoid(input._tensor, output, name)) + return Tensor( + Model.get_model().sigmoid(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def sqrt( @@ -434,7 +491,10 @@ def sqrt( """ if output is not NullTensor: output = output._tensor - return Tensor(Model.get_model().sqrt(input._tensor, output, name)) + return Tensor( + Model.get_model().sqrt(input._tensor, output, name), + runtime_id=input.runtime_id, + ) def sub( @@ -452,8 +512,13 @@ def sub( if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): + if input.runtime_id != other.runtime_id: + raise ValueError("Tensors must be on the same runtime") other = other._tensor - return Tensor(Model.get_model().sub(input._tensor, other, output, name)) + return Tensor( + Model.get_model().sub(input._tensor, other, output, name), + runtime_id=input.runtime_id, + ) def tensor( @@ -463,6 +528,7 @@ def tensor( offsets: Iterable[int] = [], padded_shape: Iterable[int] = [], name: str = "", + runtime_id: int = -1, ) -> Tensor: """ Construct a tensor with given shape and data type. 
@@ -470,7 +536,10 @@ def tensor( tensor = ark.tensor([1, 2, 3, 4], dtype=ark.fp32) tensor = ark.tensor([1, 2], dtype=ark.fp16) """ - return Tensor(_tensor(shape, dtype, strides, offsets, padded_shape, name)) + return Tensor( + _tensor(shape, dtype, strides, offsets, padded_shape, name), + runtime_id=runtime_id, + ) def transpose( @@ -496,7 +565,8 @@ def transpose( if len(perm) > 4: raise ValueError("Only support perm up to 4 dimensions") return Tensor( - Model.get_model().transpose(input._tensor, perm, output, name) + Model.get_model().transpose(input._tensor, perm, output, name), + runtime_id=input.runtime_id, ) @@ -515,11 +585,15 @@ def mean( def ones( - shape: Iterable[int], dtype: DataType = fp32, name: str = "ones" + shape: Iterable[int], + dtype: DataType = fp32, + name: str = "ones", + runtime_id: int = -1, ) -> Tensor: """Ones.""" return Tensor( - Model.get_model().constant(1, Dims(shape), dtype.ctype(), name) + Model.get_model().constant(1, Dims(shape), dtype.ctype(), name), + runtime_id=runtime_id, ) @@ -530,12 +604,14 @@ def parameter( offsets: Iterable[int] = [], padded_shape: Iterable[int] = [], name: str = "", + runtime_id: int = -1, ) -> Parameter: """ Construct a parameter with given shape and data type. """ return Parameter( - _tensor(shape, dtype, strides, offsets, padded_shape, name) + _tensor(shape, dtype, strides, offsets, padded_shape, name), + runtime_id=runtime_id, ) @@ -569,11 +645,15 @@ def layernorm( def zeros( - shape: Iterable[int], dtype: DataType = fp32, name: str = "zeros" + shape: Iterable[int], + dtype: DataType = fp32, + name: str = "zeros", + runtime_id: int = -1, ) -> Tensor: """Zeros.""" return Tensor( - Model.get_model().constant(0, Dims(shape), dtype.ctype(), name) + Model.get_model().constant(0, Dims(shape), dtype.ctype(), name), + runtime_id=runtime_id, ) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 7480ce7da..798eaf9d5 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -3,7 +3,7 @@ import logging from enum import Enum -from typing import Callable +from typing import Callable, Dict, List from _ark_core import _Executor, _DefaultPlanner from .model import Model @@ -14,8 +14,36 @@ class _RuntimeState: The _RuntimeState class is used to store the state of the model. """ - runtime = None - executor = None + runtime: Dict[int, "Runtime"] = {} + + @staticmethod + def reset_all(): + """ + Resets all runtimes. + """ + runtime_ids = list(_RuntimeState.runtime.keys()) + for runtime_id in runtime_ids: + _RuntimeState.runtime[runtime_id].reset() + + @staticmethod + def delete_all(): + """ + Deletes all runtimes. + """ + runtime_ids = list(_RuntimeState.runtime.keys()) + for runtime_id in runtime_ids: + _RuntimeState.runtime[runtime_id].reset(delete=True) + + @staticmethod + def print_runtime_states(): + """ + Print runtimes and their corresponding states. + """ + print(f"{'Runtime ID':<12} | {'Status':<20}") + print(f"{'-'*12} | {'-'*20}") + for runtime_id, runtime in _RuntimeState.runtime.items(): + runtime_id = "-1(Default)" if runtime_id == -1 else runtime_id + print(f"{runtime_id:<12} | {runtime.state:<20}") class DefaultPlanner(_DefaultPlanner): @@ -61,22 +89,48 @@ class State(Enum): LaunchedNotRunning = 1 Running = 2 + def __init__(self, runtime_id: int = -1): + self.runtime_id = runtime_id + self.executor: Executor = None + self.state: Runtime.State = Runtime.State.Init + _RuntimeState.runtime[runtime_id] = self + + def get_state(self) -> "Runtime.State": + """ + Get the runtime state. 
+ """ + return self.state + @staticmethod - def get_runtime() -> "Runtime": + def exists(runtime_id: int) -> bool: """ - Get the runtime. + Check if a runtime exists with the given ID. """ - if _RuntimeState.runtime is None: - _RuntimeState.runtime = Runtime() - return _RuntimeState.runtime + return runtime_id in _RuntimeState.runtime - def __init__(self): - self.executor: Executor = None - self.state: Runtime.State = Runtime.State.Init - _RuntimeState.runtime = self + @staticmethod + def get_all_ids() -> List[int]: + """ + Get a list of all existing runtime IDs. + """ + return list(_RuntimeState.runtime.keys()) - def __del__(self): - self.reset() + @staticmethod + def get_runtime(runtime_id=-1) -> "Runtime": + """ + Get the runtime by ID. If runtime_id is not provided, use a default ID of -1. + If the runtime does not exist, create a new runtime with the given ID. + """ + if runtime_id not in _RuntimeState.runtime: + _RuntimeState.runtime[runtime_id] = Runtime(runtime_id) + return _RuntimeState.runtime[runtime_id] + + @staticmethod + def see_runtime_statuses() -> "Dict[int, Runtime]": + """ + Returns the runtime dictionary containing all of the runtimes. + """ + return _RuntimeState.runtime def __enter__(self): return self @@ -113,7 +167,9 @@ def launch( initialized. The executor will compile the cuda kernels and launch the ARK runtime. """ if self.launched(): - logging.warn("Runtime is already launched, skip launching") + logging.warn( + f"Runtime {self.runtime_id} is already launched, skip launching" + ) return if not plan: if not plan_path: @@ -124,19 +180,19 @@ def launch( # If the RuntimeState is init, we need to create a new executor and # compile the kernels if self.state == Runtime.State.Init: - if _RuntimeState.executor is not None: - if not _RuntimeState.executor.destroyed(): - logging.warn("Destroying an old executor") - _RuntimeState.executor.destroy() - - _RuntimeState.executor = Executor( + if self.executor is not None: + if not self.executor.destroyed(): + logging.warn( + f"Runtime {self.runtime_id}, has already been launched. Destroying the old executor" + ) + self.executor.destroy() + self.executor = Executor( rank, world_size, gpu_id, "ArkRuntime", plan, ) - self.executor = _RuntimeState.executor self.executor.compile() self.executor.launch() self.state = Runtime.State.LaunchedNotRunning @@ -146,8 +202,8 @@ def run(self, iter=1, non_blocking=False): Run the ARK program for iter iterations and wait for the kernel to finish. """ if self.state != Runtime.State.LaunchedNotRunning: - logging.error("ARK runtime is not launched") - raise RuntimeError("ARK runtime is not launched") + logging.error(f"ARK runtime {self.runtime_id} is not launched") + raise RuntimeError(f"ARK runtime {self.runtime_id} is not launched") self.state = Runtime.State.Running self.executor.run(iter) if not non_blocking: @@ -158,7 +214,9 @@ def wait(self): Wait for the kernel to finish. """ if self.state != Runtime.State.Running: - logging.warn("ARK runtime is not running, skip waiting") + logging.warn( + f"ARK runtime {self.runtime_id} is not running, skip waiting" + ) return self.executor.wait() self.state = Runtime.State.LaunchedNotRunning @@ -169,15 +227,17 @@ def stop(self) -> float: Once this is called, we need to call `launch()` again to run the model again. 
""" if not self.launched(): - logging.warn("ARK runtime is never launched, skip stopping") + logging.warn( + f"ARK runtime {self.runtime_id} is never launched, skip stopping" + ) return elapsed = self.executor.stop() self.state = Runtime.State.LaunchedNotRunning return elapsed - def reset(self): + def reset(self, delete=False): """ - Reset the runtime. + Reset the runtime. If delete is True, delete the runtime associated with the runtime_id. """ if self.launched(): self.stop() @@ -186,3 +246,26 @@ def reset(self): self.executor.destroy() self.executor = None self.state = Runtime.State.Init + if delete: + del _RuntimeState.runtime[self.runtime_id] + + @staticmethod + def reset_all_runtimes(): + """ + Reset all runtimes. + """ + _RuntimeState.reset_all() + + @staticmethod + def delete_all_runtimes(): + """ + Delete all runtimes. + """ + _RuntimeState.delete_all() + + @staticmethod + def print_runtime_states(): + """ + Print runtimes and their corresponding states. + """ + _RuntimeState.print_runtime_states() diff --git a/python/ark/tensor.py b/python/ark/tensor.py index a567264d5..00e266929 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -29,14 +29,22 @@ class Dims(_Dims): class Tensor: - def __init__(self, _tensor: _Tensor, initializer: Initializer = None): + def __init__( + self, + _tensor: _Tensor, + initializer: Initializer = None, + runtime_id: int = -1, + ): """ Initializes a new instance of the Tensor class. Args: _tensor (_ark_core._Tensor): The underlying _Tensor object. + intializer (Initializer): The initializer for the Tensor. + runtime_id (int): The ID of the Runtime to use. Defaults to -1, which is the default Runtime. """ self._tensor = _tensor self.initializer: Initializer = initializer + self.runtime_id = runtime_id def shape(self) -> List[int]: """ @@ -69,7 +77,7 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: an empty numpy array without the data buffer will be returned. """ np_type = self.dtype().to_numpy() - rt = Runtime.get_runtime() + rt = Runtime.get_runtime(self.runtime_id) if not rt.launched(): return np.ndarray(self.shape(), dtype=np_type, buffer=None) if ndarray is None: @@ -85,7 +93,9 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: rt.executor.tensor_read(self._tensor, ndarray) return ndarray - def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: + def to_torch( + self, tensor: torch.Tensor = None, runtime_id: int = -1 + ) -> torch.Tensor: """ """ if _no_torch: raise ImportError("torch is not available") @@ -100,22 +110,42 @@ def to_torch(self, tensor: torch.Tensor = None) -> torch.Tensor: raise ValueError("torch tensor is not contiguous in memory") elif tensor.numel() != self.nelems(): raise ValueError("torch tensor size does not match the tensor") - tensor.copy_(torch.from_numpy(self.to_numpy())) + tensor.copy_(torch.from_numpy(self.to_numpy(self.runtime_id))) return tensor - @staticmethod - def from_numpy(ndarray: np.ndarray): - return Tensor( - Model.get_model().tensor( - Dims(list(ndarray.shape)), - DataType.from_numpy(ndarray.dtype).ctype(), - Dims(), - Dims(), - Dims(), - "", - ), - lambda: ndarray, - ) + def get_torch_view(self) -> torch.Tensor: + """ + Returns a torch tensor that shares the same memory with the device tensor. + """ + if _no_torch: + raise ImportError("torch is not available") + rt = Runtime.get_runtime(self.runtime_id) + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.get_torch_view()` is " + "usable only after you call `Runtime.launch()`." 
+ ) + dl_tensor = rt.executor.get_dl_tensor(self._tensor) + torch_view = torch.utils.dlpack.from_dlpack(dl_tensor) + return torch_view + + def from_numpy(self, ndarray: np.ndarray) -> "Tensor": + """ + Copies the tensor from a host numpy array to the device. + """ + rt = Runtime.get_runtime(self.runtime_id) + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.from_numpy()` is " + "usable only after you call `Runtime.launch()`." + ) + ndarray = ndarray.astype(self.dtype().to_numpy()) + if not ndarray.flags["C_CONTIGUOUS"]: + ndarray = np.ascontiguousarray(ndarray) + if ndarray.nbytes != self.nelems() * self.dtype().element_size(): + raise ValueError("ndarray size does not match the tensor") + rt.executor.tensor_write(self._tensor, ndarray) + return self @staticmethod def from_torch(tensor: torch.Tensor): @@ -135,7 +165,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": """ Copies the tensor from a host numpy array to the device. """ - rt = Runtime.get_runtime() + rt = Runtime.get_runtime(self.runtime_id) if not rt.launched(): raise RuntimeError( "Tensor is not allocated yet. `Tensor.from_numpy()` is " @@ -180,8 +210,9 @@ class Parameter(Tensor): A tensor as a parameter. """ - def __init__(self, _tensor: _Tensor): + def __init__(self, _tensor: _Tensor, runtime_id: int = -1): """ Initializes a new instance of the Parameter class. """ super().__init__(_tensor) + self.runtime_id = runtime_id diff --git a/python/executor_py.cpp b/python/executor_py.cpp index 13a81608e..59bee5a9b 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -1,13 +1,14 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. +#include #include #include #include #include #include - +#include namespace py = pybind11; static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, @@ -29,6 +30,29 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, info.size * info.itemsize); } +DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) { + DLManagedTensor *dl_tensor = exe.get_dl_tensor(tensor); + return dl_tensor; +} + +void free_capsule(PyObject *capsule) { + const char *name = PyCapsule_GetName(capsule); + auto *dl_managed_tensor = + static_cast(PyCapsule_GetPointer(capsule, name)); + if (dl_managed_tensor) { + dl_managed_tensor->deleter(dl_managed_tensor); + dl_managed_tensor = nullptr; + } +} + +py::capsule to_dlpack_capsule(ark::Executor &self, const ark::Tensor &tensor) { + DLManagedTensor *dl_managed_tensor = to_dlpack(self, tensor); + const char *capsule_name = "dltensor"; + PyObject *dl_capsule = PyCapsule_New(static_cast(dl_managed_tensor), + capsule_name, free_capsule); + return py::reinterpret_steal(dl_capsule); +} + void register_executor(py::module &m) { py::class_(m, "_Executor") .def( @@ -52,5 +76,7 @@ void register_executor(py::module &m) { .def("tensor_write", py::overload_cast(&tensor_write), - py::arg("tensor"), py::arg("address"), py::arg("bytes")); + py::arg("tensor"), py::arg("address"), py::arg("bytes")) + .def("get_dl_tensor", &to_dlpack_capsule), + py::arg("tensor"); } diff --git a/python/unittest/test.py b/python/unittest/test.py index f6f9b97af..e43ff11e2 100644 --- a/python/unittest/test.py +++ b/python/unittest/test.py @@ -9,3 +9,4 @@ from test_model import * from test_runtime import * +from test_conversion import * diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py new file mode 100644 index 000000000..8f941a033 --- /dev/null 
+++ b/python/unittest/test_conversion.py @@ -0,0 +1,93 @@ +import torch +import numpy as np +import ark + + +def initialize_tensor(dimensions, dtype): + tensor = ark.tensor(dimensions, dtype) + tensor_host = np.random.rand(*dimensions).astype(dtype.to_numpy()) + return tensor, tensor_host + + +# Test function to validate the integrity of the PyTorch view of the ARK tensor, +# including its data and attributes such as shape and data type. +def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType): + ark.init() + dimensions = [size] * num_dims + + input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype) + other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype) + output_tensor = ark.add(input_tensor, other_tensor) + + runtime = ark.Runtime() + runtime.launch() + + input_tensor.from_numpy(input_tensor_host) + other_tensor.from_numpy(other_tensor_host) + + input_view = input_tensor.get_torch_view() + other_view = other_tensor.get_torch_view() + output_view = output_tensor.get_torch_view() + + runtime.run() + + input_view_numpy = input_view.cpu().numpy() + other_view_numpy = other_view.cpu().numpy() + output_view_numpy = output_view.cpu().numpy() + + output_tensor_host = output_tensor.to_numpy() + + runtime.stop() + runtime.delete_all_runtimes() + + assert np.allclose(input_tensor_host, input_view_numpy) + assert np.allclose(other_tensor_host, other_view_numpy) + assert np.allclose(output_tensor_host, output_view_numpy) + + +# Function to check if there is a difference between two arrays at a specific index +def check_diff(input_tensor_host, input_view_numpy, value, index): + mask = np.ones(input_tensor_host.shape, dtype=bool) + mask[index] = False + if not np.allclose(input_tensor_host[mask], input_view_numpy[mask]): + print("Difference found at index: ", index) + return False + if input_view_numpy[index] != value: + print(input_view_numpy[index], value) + return False + return True + + +# Test function to check if changes to the torch views are reflected in the original tensors +def test_aliasing(dtype: ark.DataType): + ark.init() + dimensions = [4, 4] + input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype) + other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype) + output_tensor = ark.mul(input_tensor, other_tensor) + runtime = ark.Runtime() + runtime.launch() + input_tensor.from_numpy(input_tensor_host) + other_tensor.from_numpy(other_tensor_host) + + input_view = input_tensor.get_torch_view() + other_view = other_tensor.get_torch_view() + output_view = output_tensor.get_torch_view() + # make changes to the views + input_view[1, 1] = 20 + other_view[0, 0] = 30 + runtime.run() + output_view[3, 0] = 40 + + output_tensor_host = output_tensor.to_numpy() + input_view_numpy = input_view.cpu().numpy() + other_view_numpy = other_view.cpu().numpy() + output_view_numpy = output_view.cpu().numpy() + # Check if changes to the views are reflected in the original tensors + print(input_view_numpy) + assert check_diff(input_tensor_host, input_view_numpy, 20, (1, 1)) + assert check_diff(other_tensor_host, other_view_numpy, 30, (0, 0)) + assert check_diff(output_tensor_host, output_view_numpy, 40, (3, 0)) + + runtime.stop() + runtime.reset() diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index bd9098fe8..fd34bb96b 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -4,21 +4,20 @@ import ark import json +empty_plan = json.dumps( + { + "Rank": 0, + "WorldSize": 1, + 
"NumProcessors": 1, + "NumWarpsPerProcessor": 1, + "TaskInfos": [], + "ProcessorGroups": [], + } +) + def test_runtime_relaunch(): ark.init() - - empty_plan = json.dumps( - { - "Rank": 0, - "WorldSize": 1, - "NumProcessors": 1, - "NumWarpsPerProcessor": 1, - "TaskInfos": [], - "ProcessorGroups": [], - } - ) - with ark.Runtime.get_runtime() as rt: assert rt.launched() == False rt.launch(plan=empty_plan) @@ -28,3 +27,101 @@ def test_runtime_relaunch(): assert rt.launched() == False rt.launch(plan=empty_plan) assert rt.launched() == True + + +def test_multiple_runtime_launch(): + ark.init() + num_runtimes = 5 + for i in range(num_runtimes): + rt = ark.Runtime.get_runtime(i) + assert rt.launched() == False + rt.launch(gpu_id=i, plan=empty_plan) + assert rt.launched() == True + for i in range(num_runtimes): + rt = ark.Runtime.get_runtime(i) + assert rt.launched() == True + ark.Runtime.delete_all_runtimes() + + +def test_stop_runtime(): + ark.init() + rt1 = ark.Runtime.get_runtime(1) + rt1.launch(plan=empty_plan, gpu_id=1) + rt2 = ark.Runtime.get_runtime(2) + rt2.launch(plan=empty_plan, gpu_id=2) + rt1.stop() + rt1.reset() + assert rt1.state == ark.Runtime.State.Init + assert rt2.state == ark.Runtime.State.LaunchedNotRunning + ark.Runtime.delete_all_runtimes() + + +def test_reset_runtime(): + ark.init() + rt1 = ark.Runtime.get_runtime(0) + rt1.launch(plan=empty_plan, gpu_id=1) + rt2 = ark.Runtime.get_runtime(1) + rt2.launch(plan=empty_plan, gpu_id=2) + rt1.reset() + assert rt1.launched() == False + assert rt2.launched() == True + rt1.launch(plan=empty_plan) + assert rt1.launched() == True + ark.Runtime.delete_all_runtimes() + + +def test_multiple_runtimes_complex(): + ark.init() + num_runtimes = 3 + runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] + default_runtime = ark.Runtime.get_runtime() + runtime_list.append(default_runtime) + for i, rt in enumerate(runtime_list): + rt.launch(plan=empty_plan, gpu_id=i) + assert rt.launched() == True + runtime_list[0].stop() + assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning + for rt in runtime_list[1:]: + assert rt.launched() == True + runtime_list[1].reset() + assert runtime_list[1].state == ark.Runtime.State.Init + assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning + assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning + runtime_list[1].launch(plan=empty_plan, gpu_id=1) + for rt in runtime_list: + assert rt.launched() == True + ark.Runtime.delete_all_runtimes() + + +def test_runtime_state_after_reset(): + ark.init() + rt = ark.Runtime.get_runtime() + rt.launch(plan=empty_plan) + rt.reset() + assert rt.launched() == False + assert rt.running() == False + ark.Runtime.delete_all_runtimes() + + +def test_see_runtime_statuses(): + ark.init() + num_runtimes = 3 + runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] + runtime_statuses = ark.Runtime.see_runtime_statuses() + assert len(runtime_statuses) == num_runtimes + for i in range(num_runtimes): + assert i in runtime_statuses + for i, rt in enumerate(runtimes): + assert runtime_statuses[i] == rt + ark.Runtime.delete_all_runtimes() + + +def test_multiple_runtimes_init(): + ark.init() + runtimes = [ark.Runtime.get_runtime(i) for i in range(3)] + for rt in runtimes: + assert rt.state == ark.Runtime.State.Init + ark.init() + runtimes = ark.Runtime.see_runtime_statuses() + assert len(runtimes) == 0 + ark.Runtime.delete_all_runtimes() From 9a0556bde84a4dd6a76f39155d60957c9165ad52 Mon Sep 17 00:00:00 2001 From: Changho 
Hwang Date: Tue, 18 Jun 2024 21:30:02 +0000 Subject: [PATCH 009/106] cmake dlpack --- .gitmodules | 4 ++++ ark/CMakeLists.txt | 1 + third_party/CMakeLists.txt | 13 +++++++++++++ third_party/dlpack | 1 + 4 files changed, 19 insertions(+) create mode 160000 third_party/dlpack diff --git a/.gitmodules b/.gitmodules index ced5dcf94..ec484eb61 100644 --- a/.gitmodules +++ b/.gitmodules @@ -17,3 +17,7 @@ [submodule "third_party/json"] path = third_party/json url = https://github.com/nlohmann/json + +[submodule "third_party/dlpack"] + path = third_party/dlpack + url = https://github.com/dmlc/dlpack diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt index 4457d3c0b..ce03b65ed 100644 --- a/ark/CMakeLists.txt +++ b/ark/CMakeLists.txt @@ -17,6 +17,7 @@ set(COMMON_LIBS ARK::numa ARK::ibverbs pthread rt) target_include_directories(ark_obj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include) target_include_directories(ark_obj PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) target_include_directories(ark_obj SYSTEM PRIVATE + ${DLPACK_INCLUDE_DIRS} ${JSON_INCLUDE_DIRS} ${MSCCLPP_INCLUDE_DIRS} ${IBVERBS_INCLUDE_DIRS} diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 75916d962..cc4b5eb5c 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -35,6 +35,19 @@ if (NOT json_POPULATED) endif() set(JSON_INCLUDE_DIRS ${json_SOURCE_DIR}/include PARENT_SCOPE) +# DLPack +FetchContent_Declare( + dlpack + GIT_REPOSITORY https://github.com/dmlc/dlpack + GIT_TAG v0.8 + SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/dlpack +) +FetchContent_GetProperties(dlpack) +if (NOT dlpack_POPULATED) + FetchContent_Populate(dlpack) +endif() +set(DLPACK_INCLUDE_DIRS ${dlpack_SOURCE_DIR}/include PARENT_SCOPE) + if(USE_CUDA) # Configure CUTLASS FetchContent_Declare( diff --git a/third_party/dlpack b/third_party/dlpack new file mode 160000 index 000000000..365b823ce --- /dev/null +++ b/third_party/dlpack @@ -0,0 +1 @@ +Subproject commit 365b823cedb281cd0240ca601aba9b78771f91a3 From 75f7831b700783e899beaa15f950f125a7520d6c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 18 Jun 2024 22:38:35 +0000 Subject: [PATCH 010/106] include dlpack for pybind --- python/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index efb9aea3e..bd25d01e6 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -20,3 +20,4 @@ file(GLOB_RECURSE BIND_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.c pybind11_add_module(ark_py ${BIND_SOURCES}) set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core) target_link_libraries(ark_py PRIVATE ark_static) +target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS}) From 94b44f20a15c892d5a47e1597d838891ca600553 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 24 Jun 2024 23:51:22 +0000 Subject: [PATCH 011/106] support d2d copy --- ark/api/executor.cpp | 99 ++++++++++++++++++++---------- ark/include/ark/executor.hpp | 10 ++- python/ark/tensor.py | 42 +++++++++---- python/executor_py.cpp | 33 +++++++--- python/unittest/test_conversion.py | 37 ++++++++++- 5 files changed, 162 insertions(+), 59 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index a0711bfe8..96e53c8cf 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -147,6 +147,8 @@ class Executor::Impl { const std::string &plan); ~Impl() = default; + int gpu_id() const { return gpu_id_; } + void compile(); void launch(int64_t max_spin_count); void run(int iter); @@ -154,9 +156,10 @@ class Executor::Impl { float 
stop(int64_t max_spin_count); void barrier(); - void tensor_read(const Tensor tensor, void *data, size_t bytes) const; + void tensor_read(const Tensor tensor, void *data, size_t bytes, + bool is_d2d) const; void tensor_write(const Tensor tensor, const void *data, - size_t bytes) const; + size_t bytes, bool is_d2d) const; DLDeviceType get_device_type() const; DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; @@ -731,57 +734,83 @@ void Executor::Impl::barrier() { } void Executor::Impl::tensor_read(const Tensor tensor, void *data, - size_t bytes) const { + size_t bytes, bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); - if (bytes < tensor_data_bytes) { - ERR(InvalidUsageError, "Data buffer (", bytes, - ") is smaller than the tensor data (", tensor_data_bytes, ")."); + if (bytes != tensor_data_bytes) { + ERR(InvalidUsageError, "Destination bytes (", bytes, + ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } - size_t tensor_bytes = - tensor.strides().nelems() * tensor.data_type().bytes(); - void *src = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + size_t buffer_id = tensor.ref()->buffer()->id(); + if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); + } + size_t offset = buffer_id_to_offset_.at(buffer_id); + auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost; + void *src = buffer_->ref(offset); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(data, src, bytes, gpuMemcpyDeviceToHost, - copy_stream_->get())); - copy_stream_->sync(); + GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_->get())); } else { + size_t tensor_bytes = + tensor.strides().nelems() * tensor.data_type().bytes(); std::vector tensor_host(tensor_bytes); GLOG(gpuMemcpyAsync(tensor_host.data(), src, tensor_bytes, gpuMemcpyDeviceToHost, copy_stream_->get())); copy_stream_->sync(); - tensor_to_data(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + if (!is_d2d) { + tensor_to_data(tensor_host.data(), static_cast(data), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + return; + } + // TODO: convert data layout on the device directly + std::vector data_host(bytes); + tensor_to_data(tensor_host.data(), data_host.data(), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + GLOG(gpuMemcpyAsync(data, data_host.data(), bytes, + gpuMemcpyHostToDevice, copy_stream_->get())); } + copy_stream_->sync(); } void Executor::Impl::tensor_write(const Tensor tensor, const void *data, - size_t bytes) const { + size_t bytes, bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); - if (bytes < tensor_data_bytes) { - ERR(InvalidUsageError, "Data buffer (", bytes, - ") is smaller than the tensor data (", tensor_data_bytes, ")."); + if (bytes != tensor_data_bytes) { + ERR(InvalidUsageError, "Source bytes (", bytes, + ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); + } + size_t buffer_id = tensor.ref()->buffer()->id(); + if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); } + size_t offset = buffer_id_to_offset_.at(buffer_id); size_t tensor_bytes = tensor.strides().nelems() * 
tensor.data_type().bytes(); - void *dst = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; + void *dst = buffer_->ref(offset); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, gpuMemcpyHostToDevice, - copy_stream_->get())); + GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get())); } else { std::vector tensor_host(tensor_bytes); - GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes, - gpuMemcpyDeviceToHost, copy_stream_->get())); - copy_stream_->sync(); - data_to_tensor(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + if (!is_d2d) { + data_to_tensor(tensor_host.data(), static_cast(data), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + } else { + // TODO: convert data layout on the device directly + std::vector tmp(bytes); + GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, + gpuMemcpyDeviceToHost, copy_stream_->get())); + copy_stream_->sync(); + data_to_tensor(tensor_host.data(), tmp.data(), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + } GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes, gpuMemcpyHostToDevice, copy_stream_->get())); } @@ -883,6 +912,8 @@ Executor::Executor(int rank, int world_size, int gpu_id, Executor::~Executor() = default; +int Executor::gpu_id() const { return impl_->gpu_id(); } + void Executor::compile() { impl_->compile(); } void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); } @@ -902,13 +933,13 @@ void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } void Executor::tensor_read(const Tensor tensor, void *data, - size_t bytes) const { - impl_->tensor_read(tensor, data, bytes); + size_t bytes, bool is_d2d) const { + impl_->tensor_read(tensor, data, bytes, is_d2d); } void Executor::tensor_write(const Tensor tensor, const void *data, - size_t bytes) const { - impl_->tensor_write(tensor, data, bytes); + size_t bytes, bool is_d2d) const { + impl_->tensor_write(tensor, data, bytes, is_d2d); } DLDeviceType Executor::get_device_type() const { diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 54c49cd29..a5d6f0273 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -23,6 +23,9 @@ class Executor { ~Executor(); + /// Return the GPU ID. + int gpu_id() const; + /// Compile the model. This must be called before `launch()`. void compile(); @@ -59,10 +62,11 @@ class Executor { data.size() * sizeof(T)); } - void tensor_read(const Tensor tensor, void *data, size_t bytes) const; + void tensor_read(const Tensor tensor, void *data, size_t bytes, + bool is_d2d = false) const; - void tensor_write(const Tensor tensor, const void *data, - size_t bytes) const; + void tensor_write(const Tensor tensor, const void *data, size_t bytes, + bool is_d2d = false) const; DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 00e266929..eff1bf20e 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -77,10 +77,17 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray: an empty numpy array without the data buffer will be returned. 
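(Note: with this change, a RuntimeError is raised instead if the runtime has not been launched yet.)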
""" np_type = self.dtype().to_numpy() + if np_type is None: + raise ValueError( + f"Tensor data type {self.dtype().__name__} is not supported by numpy." + ) rt = Runtime.get_runtime(self.runtime_id) if not rt.launched(): - return np.ndarray(self.shape(), dtype=np_type, buffer=None) - if ndarray is None: + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.to_numpy()` is " + "usable only after you call `Runtime.launch()`." + ) + elif ndarray is None: ndarray = np.zeros(self.shape(), dtype=np_type) elif not ndarray.flags["C_CONTIGUOUS"]: raise ValueError("ndarray is not contiguous in memory") @@ -99,9 +106,18 @@ def to_torch( """ """ if _no_torch: raise ImportError("torch is not available") + rt = Runtime.get_runtime(self.runtime_id) + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.to_torch()` is " + "usable only after you call `Runtime.launch()`." + ) torch_type = self.dtype().to_torch() if tensor is None: - return torch.from_numpy(self.to_numpy()) + dev_name = f"cuda:{rt.executor.gpu_id()}" + tensor = torch.zeros( + self.shape(), dtype=torch_type, device=torch.device(dev_name) + ) elif tensor.shape != self.shape(): raise ValueError("torch tensor shape does not match the tensor") elif tensor.dtype != torch_type: @@ -110,7 +126,10 @@ def to_torch( raise ValueError("torch tensor is not contiguous in memory") elif tensor.numel() != self.nelems(): raise ValueError("torch tensor size does not match the tensor") - tensor.copy_(torch.from_numpy(self.to_numpy(self.runtime_id))) + tensor_bytes = self.nelems() * self.dtype().element_size() + rt.executor.tensor_read( + self._tensor, tensor.data_ptr(), tensor_bytes, True + ) return tensor def get_torch_view(self) -> torch.Tensor: @@ -163,7 +182,8 @@ def from_torch(tensor: torch.Tensor): def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": """ - Copies the tensor from a host numpy array to the device. + Copies data into this tensor. The data type may differ, + but the size must match. """ rt = Runtime.get_runtime(self.runtime_id) if not rt.launched(): @@ -171,24 +191,22 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor": "Tensor is not allocated yet. `Tensor.from_numpy()` is " "usable only after you call `Runtime.launch()`." 
) + tensor_bytes = self.nelems() * self.dtype().element_size() if isinstance(data, torch.Tensor): - if data.dtype != self.dtype().to_torch(): - raise ValueError("data dtype does not match the tensor") if not data.is_contiguous(): data = data.contiguous() - if data.numel() != self.nelems(): + if data.numel() * data.element_size() != tensor_bytes: raise ValueError("data size does not match the tensor") rt.executor.tensor_write( self._tensor, data.data_ptr(), - data.numel() * data.element_size(), + tensor_bytes, + data.device.type == "cuda", ) elif isinstance(data, np.ndarray): - if data.dtype != self.dtype().to_numpy(): - raise ValueError("data dtype does not match the tensor") if not data.flags["C_CONTIGUOUS"]: data = np.ascontiguousarray(data) - if data.nbytes != self.nelems() * self.dtype().element_size(): + if data.nbytes != tensor_bytes: raise ValueError("data size does not match the tensor") rt.executor.tensor_write(self._tensor, data) else: diff --git a/python/executor_py.cpp b/python/executor_py.cpp index 59bee5a9b..b6cf8a7a8 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -15,19 +15,24 @@ static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, py::buffer host_buffer) { py::buffer_info info = host_buffer.request(); exe->tensor_write(tensor, reinterpret_cast(info.ptr), - info.size * info.itemsize); + info.size * info.itemsize, false); } static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, - size_t host_address, size_t bytes) { - exe->tensor_write(tensor, reinterpret_cast(host_address), bytes); + size_t address, size_t bytes, bool is_d2d) { + exe->tensor_write(tensor, reinterpret_cast(address), bytes, is_d2d); } static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, py::buffer host_buffer) { py::buffer_info info = host_buffer.request(); exe->tensor_read(tensor, reinterpret_cast(info.ptr), - info.size * info.itemsize); + info.size * info.itemsize, false); +} + +static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, + size_t address, size_t bytes, bool is_d2d) { + exe->tensor_read(tensor, reinterpret_cast(address), bytes, is_d2d); } DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) { @@ -59,6 +64,7 @@ void register_executor(py::module &m) { py::init(), py::arg("rank"), py::arg("world_size"), py::arg("gpu_id"), py::arg("name"), py::arg("plan")) + .def("gpu_id", &ark::Executor::gpu_id) .def("compile", &ark::Executor::compile) .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1) .def("run", &ark::Executor::run, py::arg("iter")) @@ -67,7 +73,16 @@ void register_executor(py::module &m) { .def("barrier", &ark::Executor::barrier) .def("destroy", &ark::Executor::destroy) .def("destroyed", &ark::Executor::destroyed) - .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data")) + .def( + "tensor_read", + py::overload_cast( + &tensor_read), + py::arg("tensor"), py::arg("data")) + .def("tensor_read", + py::overload_cast(&tensor_read), + py::arg("tensor"), py::arg("address"), py::arg("bytes"), + py::arg("is_d2d")) .def( "tensor_write", py::overload_cast( @@ -75,8 +90,8 @@ void register_executor(py::module &m) { py::arg("tensor"), py::arg("data")) .def("tensor_write", py::overload_cast(&tensor_write), - py::arg("tensor"), py::arg("address"), py::arg("bytes")) - .def("get_dl_tensor", &to_dlpack_capsule), - py::arg("tensor"); + size_t, bool>(&tensor_write), + py::arg("tensor"), py::arg("address"), py::arg("bytes"), + py::arg("is_d2d")) + .def("get_dl_tensor", 
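// to_dlpack_capsule wraps the DLManagedTensor in a PyCapsule, which Python consumers can import zero-copy (e.g. via torch.utils.dlpack.from_dlpack()).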
&to_dlpack_capsule); } diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py index 8f941a033..5befa1c34 100644 --- a/python/unittest/test_conversion.py +++ b/python/unittest/test_conversion.py @@ -1,7 +1,14 @@ -import torch +import pytest import numpy as np import ark +try: + import torch + + _no_torch = False +except ImportError: + _no_torch = True + def initialize_tensor(dimensions, dtype): tensor = ark.tensor(dimensions, dtype) @@ -11,6 +18,8 @@ def initialize_tensor(dimensions, dtype): # Test function to validate the integrity of the PyTorch view of the ARK tensor, # including its data and attributes such as shape and data type. +@pytest.mark.parametrize("num_dims,size", [(1, 5), (1, 1024), (2, 5), (2, 32)]) +@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32]) def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType): ark.init() dimensions = [size] * num_dims @@ -59,6 +68,7 @@ def check_diff(input_tensor_host, input_view_numpy, value, index): # Test function to check if changes to the torch views are reflected in the original tensors +@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32]) def test_aliasing(dtype: ark.DataType): ark.init() dimensions = [4, 4] @@ -91,3 +101,28 @@ def test_aliasing(dtype: ark.DataType): runtime.stop() runtime.reset() + + +def test_conversion_torch(): + if _no_torch: + pytest.skip("PyTorch not available") + + dimensions = [4, 4] + + ark.init() + t = ark.constant(7, dimensions) + + with ark.Runtime() as rt: + rt.launch() + + torch_tensor = t.to_torch() + + assert torch_tensor.shape == (4, 4) + assert torch_tensor.dtype == torch.float32 + assert torch_tensor.device.type == "cuda" + assert torch.all(torch_tensor == 0) + + rt.run() + + torch_tensor = t.to_torch() + assert torch.all(torch_tensor == 7) From 20c23f34b17ecfa24d96ffa8799c3c173b468c53 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 24 Jun 2024 23:58:59 +0000 Subject: [PATCH 012/106] lint --- ark/api/executor.cpp | 46 +++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 96e53c8cf..ae3e5f499 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -158,8 +158,8 @@ class Executor::Impl { void tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const; - void tensor_write(const Tensor tensor, const void *data, - size_t bytes, bool is_d2d) const; + void tensor_write(const Tensor tensor, const void *data, size_t bytes, + bool is_d2d) const; DLDeviceType get_device_type() const; DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; @@ -733,8 +733,8 @@ void Executor::Impl::barrier() { } } -void Executor::Impl::tensor_read(const Tensor tensor, void *data, - size_t bytes, bool is_d2d) const { +void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, + bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); @@ -760,15 +760,15 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data, copy_stream_->sync(); if (!is_d2d) { tensor_to_data(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); return; } // TODO: convert data layout on the device directly std::vector data_host(bytes); - tensor_to_data(tensor_host.data(), data_host.data(), - tensor.shape(), 
tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + tensor_to_data(tensor_host.data(), data_host.data(), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); GLOG(gpuMemcpyAsync(data, data_host.data(), bytes, gpuMemcpyHostToDevice, copy_stream_->get())); } @@ -794,22 +794,24 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; void *dst = buffer_->ref(offset); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get())); + GLOG( + gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get())); } else { std::vector tensor_host(tensor_bytes); if (!is_d2d) { - data_to_tensor(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + data_to_tensor(tensor_host.data(), + static_cast(data), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); } else { // TODO: convert data layout on the device directly std::vector tmp(bytes); - GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, - gpuMemcpyDeviceToHost, copy_stream_->get())); + GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, gpuMemcpyDeviceToHost, + copy_stream_->get())); copy_stream_->sync(); - data_to_tensor(tensor_host.data(), tmp.data(), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + data_to_tensor(tensor_host.data(), tmp.data(), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); } GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes, gpuMemcpyHostToDevice, copy_stream_->get())); @@ -932,13 +934,13 @@ void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } -void Executor::tensor_read(const Tensor tensor, void *data, - size_t bytes, bool is_d2d) const { +void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes, + bool is_d2d) const { impl_->tensor_read(tensor, data, bytes, is_d2d); } -void Executor::tensor_write(const Tensor tensor, const void *data, - size_t bytes, bool is_d2d) const { +void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, + bool is_d2d) const { impl_->tensor_write(tensor, data, bytes, is_d2d); } From ebe85604cb7249b4e0d7d6c3eed69758c4c6825f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Jun 2024 01:21:42 +0000 Subject: [PATCH 013/106] Separate DLPack from C++ interfaces --- ark/api/executor.cpp | 127 +++++------------------------- ark/include/ark/executor.hpp | 8 +-- python/executor_py.cpp | 90 ++++++++++++++++++++++++- 3 files changed, 106 insertions(+), 119 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index ae3e5f499..ebfa7016d 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -156,12 +156,12 @@ class Executor::Impl { float stop(int64_t max_spin_count); void barrier(); + uintptr_t tensor_address(const Tensor tensor) const; + void tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const; void tensor_write(const Tensor tensor, const void *data, size_t bytes, bool is_d2d) const; - DLDeviceType get_device_type() const; - DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; private: void init_communicator(); @@ -733,6 +733,15 @@ void Executor::Impl::barrier() { } } +uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const { + size_t buffer_id =
tensor.ref()->buffer()->id(); + if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); + } + size_t offset = buffer_id_to_offset_.at(buffer_id); + return reinterpret_cast(buffer_->ref(offset)); +} + void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const { GLOG(gpuSetDevice(gpu_id_)); @@ -742,13 +751,8 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, ERR(InvalidUsageError, "Destination bytes (", bytes, ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } - size_t buffer_id = tensor.ref()->buffer()->id(); - if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { - ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); - } - size_t offset = buffer_id_to_offset_.at(buffer_id); auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost; - void *src = buffer_->ref(offset); + void *src = reinterpret_cast(tensor_address(tensor)); if (tensor.strides() == tensor.shape()) { GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_->get())); } else { @@ -784,15 +788,10 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, ERR(InvalidUsageError, "Source bytes (", bytes, ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } - size_t buffer_id = tensor.ref()->buffer()->id(); - if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { - ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); - } - size_t offset = buffer_id_to_offset_.at(buffer_id); size_t tensor_bytes = tensor.strides().nelems() * tensor.data_type().bytes(); auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; - void *dst = buffer_->ref(offset); + void *dst = reinterpret_cast(tensor_address(tensor)); if (tensor.strides() == tensor.shape()) { GLOG( gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_->get())); @@ -819,94 +818,6 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, copy_stream_->sync(); } -DLDeviceType Executor::Impl::get_device_type() const { -#if defined(ARK_CUDA) - return kDLCUDA; -#elif defined(ARK_ROCM) - return kDLROCM; -#else - return kDLCPU; -#endif -} - -DLDataType get_dl_dtype(const DataType &ark_data_type) { - DLDataType dl_data_type; - dl_data_type.lanes = 1; - if (ark_data_type == FP32) { - dl_data_type.code = kDLFloat; - dl_data_type.bits = 32; - } else if (ark_data_type == FP16) { - dl_data_type.code = kDLFloat; - dl_data_type.bits = 16; - } else if (ark_data_type == BF16) { - dl_data_type.code = kDLBfloat; - dl_data_type.bits = 16; - } else if (ark_data_type == INT32) { - dl_data_type.code = kDLInt; - dl_data_type.bits = 32; - } else if (ark_data_type == UINT32) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 32; - } else if (ark_data_type == INT8) { - dl_data_type.code = kDLInt; - dl_data_type.bits = 8; - } else if (ark_data_type == UINT8) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 8; - } else if (ark_data_type == BYTE) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 8; - } else { - ERR(InvalidUsageError, "Unsupported data type"); - } - return dl_data_type; -} - -DLManagedTensor *Executor::Impl::get_dl_tensor(const Tensor &tensor) const { - DLTensor dl_tensor; - dl_tensor.data = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); - size_t offset_in_elements = - tensor.offsets().is_no_dim() ? 
0 : tensor.offsets().vector()[0]; - dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes(); - dl_tensor.device.device_type = get_device_type(); - dl_tensor.device.device_id = static_cast(gpu_id_); - dl_tensor.ndim = static_cast(tensor.shape().ndims()); - dl_tensor.dtype = get_dl_dtype(tensor.data_type()); - - dl_tensor.shape = - tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; - dl_tensor.strides = - tensor.strides().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; - auto shape = tensor.shape(); - if (dl_tensor.shape) { - for (int i = 0; i < dl_tensor.ndim; ++i) { - dl_tensor.shape[i] = shape[i]; - } - } - if (dl_tensor.strides) { - dl_tensor.strides[dl_tensor.ndim - 1] = 1; - for (int i = dl_tensor.ndim - 2; i >= 0; --i) { - dl_tensor.strides[i] = - dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1]; - } - } - DLManagedTensor *dl_managed_tensor = new DLManagedTensor(); - dl_managed_tensor->dl_tensor = dl_tensor; - dl_managed_tensor->manager_ctx = nullptr; - dl_managed_tensor->deleter = [](DLManagedTensor *self) { - if (self->dl_tensor.shape) { - delete[] self->dl_tensor.shape; - self->dl_tensor.shape = nullptr; - } - if (self->dl_tensor.strides) { - delete[] self->dl_tensor.strides; - self->dl_tensor.strides = nullptr; - } - }; - return dl_managed_tensor; -} - Executor::Executor(int rank, int world_size, int gpu_id, const std::string &name, const std::string &plan) : impl_(std::make_unique(rank, world_size, gpu_id, name, @@ -934,6 +845,10 @@ void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } +uintptr_t Executor::tensor_address(const Tensor tensor) const { + return impl_->tensor_address(tensor); +} + void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes, bool is_d2d) const { impl_->tensor_read(tensor, data, bytes, is_d2d); @@ -944,14 +859,6 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, impl_->tensor_write(tensor, data, bytes, is_d2d); } -DLDeviceType Executor::get_device_type() const { - return impl_->get_device_type(); -} - -DLManagedTensor *Executor::get_dl_tensor(const Tensor &tensor) const { - return impl_->get_dl_tensor(tensor); -} - DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id, const std::string &name) : Executor( diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index a5d6f0273..b8cdaf273 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -4,8 +4,6 @@ #ifndef ARK_EXECUTOR_HPP #define ARK_EXECUTOR_HPP -#include - #include #include #include @@ -50,6 +48,8 @@ class Executor { bool destroyed() const; + uintptr_t tensor_address(const Tensor tensor) const; + template void tensor_read(const Tensor tensor, std::vector &data) const { tensor_read(tensor, reinterpret_cast(data.data()), @@ -68,10 +68,6 @@ class Executor { void tensor_write(const Tensor tensor, const void *data, size_t bytes, bool is_d2d = false) const; - DLManagedTensor *get_dl_tensor(const Tensor &tensor) const; - - DLDeviceType get_device_type() const; - private: class Impl; std::unique_ptr impl_; diff --git a/python/executor_py.cpp b/python/executor_py.cpp index b6cf8a7a8..e5ab4f964 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace py = pybind11; static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, @@ -35,9 +36,92 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, 
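// "address" is a raw pointer to the destination buffer; when is_d2d is true it is treated as device memory, so the copy can stay on the GPU (gpuMemcpyDeviceToDevice) instead of going through the host.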
exe->tensor_read(tensor, reinterpret_cast(address), bytes, is_d2d); } -DLManagedTensor *to_dlpack(ark::Executor &exe, const ark::Tensor &tensor) { - DLManagedTensor *dl_tensor = exe.get_dl_tensor(tensor); - return dl_tensor; +static DLDataType get_dl_dtype(const ark::DataType &ark_data_type) { + DLDataType dl_data_type; + dl_data_type.lanes = 1; + if (ark_data_type == ark::FP32) { + dl_data_type.code = kDLFloat; + dl_data_type.bits = 32; + } else if (ark_data_type == ark::FP16) { + dl_data_type.code = kDLFloat; + dl_data_type.bits = 16; + } else if (ark_data_type == ark::BF16) { + dl_data_type.code = kDLBfloat; + dl_data_type.bits = 16; + } else if (ark_data_type == ark::INT32) { + dl_data_type.code = kDLInt; + dl_data_type.bits = 32; + } else if (ark_data_type == ark::UINT32) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 32; + } else if (ark_data_type == ark::INT8) { + dl_data_type.code = kDLInt; + dl_data_type.bits = 8; + } else if (ark_data_type == ark::UINT8) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 8; + } else if (ark_data_type == ark::BYTE) { + dl_data_type.code = kDLUInt; + dl_data_type.bits = 8; + } else { + throw std::runtime_error("unexpected error"); + } + return dl_data_type; +} + +static DLDeviceType get_device_type() { +#if defined(ARK_CUDA) + return kDLCUDA; +#elif defined(ARK_ROCM) + return kDLROCM; +#else + return kDLCPU; +#endif +} + +static DLManagedTensor *to_dlpack(ark::Executor &exe, + const ark::Tensor &tensor) { + DLTensor dl_tensor; + dl_tensor.data = reinterpret_cast(exe.tensor_address(tensor)); + size_t offset_in_elements = + tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0]; + dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes(); + dl_tensor.device.device_type = get_device_type(); + dl_tensor.device.device_id = static_cast(exe.gpu_id()); + dl_tensor.ndim = static_cast(tensor.shape().ndims()); + dl_tensor.dtype = get_dl_dtype(tensor.data_type()); + + dl_tensor.shape = + tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; + dl_tensor.strides = + tensor.strides().is_no_dim() ? 
nullptr : new int64_t[dl_tensor.ndim]; + auto shape = tensor.shape(); + if (dl_tensor.shape) { + for (int i = 0; i < dl_tensor.ndim; ++i) { + dl_tensor.shape[i] = shape[i]; + } + } + if (dl_tensor.strides) { + dl_tensor.strides[dl_tensor.ndim - 1] = 1; + for (int i = dl_tensor.ndim - 2; i >= 0; --i) { + dl_tensor.strides[i] = + dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1]; + } + } + DLManagedTensor *dl_managed_tensor = new DLManagedTensor(); + dl_managed_tensor->dl_tensor = dl_tensor; + dl_managed_tensor->manager_ctx = nullptr; + dl_managed_tensor->deleter = [](DLManagedTensor *self) { + if (self->dl_tensor.shape) { + delete[] self->dl_tensor.shape; + self->dl_tensor.shape = nullptr; + } + if (self->dl_tensor.strides) { + delete[] self->dl_tensor.strides; + self->dl_tensor.strides = nullptr; + } + }; + return dl_managed_tensor; } void free_capsule(PyObject *capsule) { From 08c9b899c22b759a6f4f194b7932f48d08eeb8f4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Jun 2024 01:30:50 +0000 Subject: [PATCH 014/106] Update workflow trigger --- .github/workflows/ut-cuda.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index 5a78818ff..918c1a4a8 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -7,8 +7,7 @@ on: pull_request: branches: - main - types: - - ready_for_review + types: [opened, synchronize, reopened, ready_for_review] jobs: UnitTest: From 1fa08afa36010116cdcd6d89e64db104f3fa23d1 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 25 Jun 2024 20:53:29 +0000 Subject: [PATCH 015/106] expose exceptions --- ark/api/dims.cpp | 1 - ark/include/ark.hpp | 1 + ark/{ => include/ark}/error.hpp | 15 ++++++++++----- ark/logging.h | 2 +- python/ark/__init__.py | 12 ++++++++++++ python/ark/error.py | 12 ++++++++++++ python/ark_py.cpp | 2 ++ python/error_py.cpp | 25 +++++++++++++++++++++++++ python/unittest/test_error.py | 12 ++++++++++++ 9 files changed, 75 insertions(+), 7 deletions(-) rename ark/{ => include/ark}/error.hpp (70%) create mode 100644 python/ark/error.py create mode 100644 python/error_py.cpp create mode 100644 python/unittest/test_error.py diff --git a/ark/api/dims.cpp b/ark/api/dims.cpp index a2830a060..a1f03b426 100644 --- a/ark/api/dims.cpp +++ b/ark/api/dims.cpp @@ -5,7 +5,6 @@ #include -#include "error.hpp" #include "logging.h" namespace ark { diff --git a/ark/include/ark.hpp b/ark/include/ark.hpp index a7b2f7f70..2ca796172 100644 --- a/ark/include/ark.hpp +++ b/ark/include/ark.hpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include diff --git a/ark/error.hpp b/ark/include/ark/error.hpp similarity index 70% rename from ark/error.hpp rename to ark/include/ark/error.hpp index e08acd975..78d02cab3 100644 --- a/ark/error.hpp +++ b/ark/include/ark/error.hpp @@ -1,17 +1,21 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
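// Note: BaseError below switches its base from std::runtime_error to std::exception with its own message storage, presumably so that each subclass maps cleanly onto a distinct Python exception via pybind11.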
-#ifndef ARK_ERROR_HPP_ -#define ARK_ERROR_HPP_ +#ifndef ARK_ERROR_HPP +#define ARK_ERROR_HPP #include #include namespace ark { -class BaseError : public std::runtime_error { +class BaseError : public std::exception { + private: + std::string msg_; + public: - BaseError(const std::string &msg) : std::runtime_error(msg) {} + BaseError(const std::string &msg) : msg_(msg) {} + const char *what() const noexcept override { return msg_.c_str(); } }; #define REGISTER_ERROR_TYPE(_name) \ @@ -20,6 +24,7 @@ class BaseError : public std::runtime_error { _name(const std::string &msg) : BaseError(msg) {} \ }; +REGISTER_ERROR_TYPE(InternalError) REGISTER_ERROR_TYPE(InvalidUsageError) REGISTER_ERROR_TYPE(NotFoundError) REGISTER_ERROR_TYPE(ModelError) @@ -32,4 +37,4 @@ REGISTER_ERROR_TYPE(UnitTestError) } // namespace ark -#endif // ARK_ERROR_HPP_ +#endif // ARK_ERROR_HPP diff --git a/ark/logging.h b/ark/logging.h index d29793ff7..6eb8aaf91 100644 --- a/ark/logging.h +++ b/ark/logging.h @@ -8,7 +8,7 @@ #include #include -#include "error.hpp" +#include "ark/error.hpp" namespace ark { diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 2a4d164e4..3d162c3e4 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -91,3 +91,15 @@ def set_world_size(world_size): ones, zeros, ) +from .error import ( + InternalError, + InvalidUsageError, + NotFoundError, + ModelError, + SchedulerError, + ExecutorError, + SystemError, + GpuError, + RuntimeError, +) + diff --git a/python/ark/error.py b/python/ark/error.py new file mode 100644 index 000000000..d3ac3aee8 --- /dev/null +++ b/python/ark/error.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from _ark_core import _InternalError as InternalError +from _ark_core import _InvalidUsageError as InvalidUsageError +from _ark_core import _NotFoundError as NotFoundError +from _ark_core import _ModelError as ModelError +from _ark_core import _SchedulerError as SchedulerError +from _ark_core import _ExecutorError as ExecutorError +from _ark_core import _SystemError as SystemError +from _ark_core import _GpuError as GpuError +from _ark_core import _RuntimeError as RuntimeError diff --git a/python/ark_py.cpp b/python/ark_py.cpp index 35c3b21c3..1bc4255d6 100644 --- a/python/ark_py.cpp +++ b/python/ark_py.cpp @@ -9,6 +9,7 @@ namespace py = pybind11; extern void register_data_type(py::module &m); extern void register_dims(py::module &m); +extern void register_error(py::module &m); extern void register_executor(py::module &m); extern void register_init(py::module &m); extern void register_model_graph(py::module &m); @@ -23,6 +24,7 @@ PYBIND11_MODULE(_ark_core, m) { register_data_type(m); register_dims(m); + register_error(m); register_executor(m); register_init(m); register_model_graph(m); diff --git a/python/error_py.cpp b/python/error_py.cpp new file mode 100644 index 000000000..863d8423d --- /dev/null +++ b/python/error_py.cpp @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
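// Binds each ARK C++ exception type as a Python exception under an underscore-prefixed name (e.g. ark::ModelError -> _ModelError); python/ark/error.py re-exports them under their public names.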
+ +#include +#include +#include + +#include + +namespace py = pybind11; + +#define REGISTER_ERROR_PY(_name) \ + py::register_exception(m, "_" #_name) + +void register_error(py::module &m) { + REGISTER_ERROR_PY(InternalError); + REGISTER_ERROR_PY(InvalidUsageError); + REGISTER_ERROR_PY(NotFoundError); + REGISTER_ERROR_PY(ModelError); + REGISTER_ERROR_PY(SchedulerError); + REGISTER_ERROR_PY(ExecutorError); + REGISTER_ERROR_PY(SystemError); + REGISTER_ERROR_PY(GpuError); + REGISTER_ERROR_PY(RuntimeError); +} diff --git a/python/unittest/test_error.py b/python/unittest/test_error.py new file mode 100644 index 000000000..c063c05c5 --- /dev/null +++ b/python/unittest/test_error.py @@ -0,0 +1,12 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import ark + + +def test_error(): + ark.init() + try: + ark.tensor([0]) + except Exception as e: + assert isinstance(e, ark.InvalidUsageError) From 59caff1eddb0a01c4f7bdf6e082b96d22e10ad6e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 26 Jun 2024 23:25:35 +0000 Subject: [PATCH 016/106] Build python module by default --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ee1e3566e..9ba2f2c55 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ option(USE_CUDA "Use NVIDIA/CUDA." OFF) option(USE_ROCM "Use AMD/ROCm." OFF) option(BYPASS_GPU_CHECK "Bypass GPU check." OFF) option(BUILD_TESTS "Build unit tests." ON) +option(BUILD_PYTHON "Build Python module." ON) if(BYPASS_GPU_CHECK) if(USE_CUDA) From efb2c78145cab0832971205911320264bbe74870 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 29 Jun 2024 03:51:19 +0000 Subject: [PATCH 017/106] revert --- ark/include/kernels/kernel_template.in | 1 + 1 file changed, 1 insertion(+) diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in index 876e6a1b4..ea1862920 100644 --- a/ark/include/kernels/kernel_template.in +++ b/ark/include/kernels/kernel_template.in @@ -64,5 +64,6 @@ void @NAME@(char *_buf, int *_iter) { if (threadIdx.x == 0 && blockIdx.x == 0) { atomicStoreRelaxed(_iter, 0); } + sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); } } From 8975f9d4a0574f0421e79f6dd49e7443e7244606 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 29 Jun 2024 04:03:20 +0000 Subject: [PATCH 018/106] Do not use `sys.path` for importing `_ark_core` --- python/ark/__init__.py | 5 +---- python/ark/error.py | 18 +++++++++--------- python/ark/init.py | 2 +- python/ark/model.py | 2 +- python/ark/runtime.py | 2 +- python/ark/tensor.py | 2 +- 6 files changed, 14 insertions(+), 17 deletions(-) diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 3d162c3e4..031afc7ba 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -7,9 +7,7 @@ if os.environ.get("ARK_ROOT", None) is None: os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__)) -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -import _ark_core +from . import _ark_core from .model import Model @@ -102,4 +100,3 @@ def set_world_size(world_size): GpuError, RuntimeError, ) - diff --git a/python/ark/error.py b/python/ark/error.py index d3ac3aee8..40f7391ac 100644 --- a/python/ark/error.py +++ b/python/ark/error.py @@ -1,12 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
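# Switch to package-relative imports so the extension module is found without modifying sys.path.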
-from _ark_core import _InternalError as InternalError -from _ark_core import _InvalidUsageError as InvalidUsageError -from _ark_core import _NotFoundError as NotFoundError -from _ark_core import _ModelError as ModelError -from _ark_core import _SchedulerError as SchedulerError -from _ark_core import _ExecutorError as ExecutorError -from _ark_core import _SystemError as SystemError -from _ark_core import _GpuError as GpuError -from _ark_core import _RuntimeError as RuntimeError +from ._ark_core import _InternalError as InternalError +from ._ark_core import _InvalidUsageError as InvalidUsageError +from ._ark_core import _NotFoundError as NotFoundError +from ._ark_core import _ModelError as ModelError +from ._ark_core import _SchedulerError as SchedulerError +from ._ark_core import _ExecutorError as ExecutorError +from ._ark_core import _SystemError as SystemError +from ._ark_core import _GpuError as GpuError +from ._ark_core import _RuntimeError as RuntimeError diff --git a/python/ark/init.py b/python/ark/init.py index dbf7c1569..32f530791 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import _ark_core +from . import _ark_core from .model import Model from .runtime import _RuntimeState diff --git a/python/ark/model.py b/python/ark/model.py index e6208fc16..87af88f49 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. from typing import NewType -from _ark_core import _Model +from ._ark_core import _Model _ModelState = NewType("_ModelState", None) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 798eaf9d5..efae6ab3c 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -5,7 +5,7 @@ from enum import Enum from typing import Callable, Dict, List -from _ark_core import _Executor, _DefaultPlanner +from ._ark_core import _Executor, _DefaultPlanner from .model import Model diff --git a/python/ark/tensor.py b/python/ark/tensor.py index eff1bf20e..ac2886960 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -4,7 +4,7 @@ import numpy as np from typing import Callable, List, Union, Type -from _ark_core import _Dims, _Tensor, _NullTensor +from ._ark_core import _Dims, _Tensor, _NullTensor from .data_type import DataType from .runtime import Runtime from .model import Model From 153837ba60497413d70c90fed945eaa037c84a29 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 2 Jul 2024 04:09:10 +0000 Subject: [PATCH 019/106] wip --- ark/api/executor.cpp | 51 +- ark/codegen.cpp | 3 +- ark/include/ark/executor.hpp | 7 +- ark/include/kernels/common/broadcast.h | 4 +- ark/model/model_json.cpp | 11 +- ark/model/model_json.hpp | 2 +- ark/model/model_op.cpp | 5 +- ark/ops/ops_all_reduce_test.cpp | 2 +- ark/ops/ops_arithmetic_test.cpp | 48 +- ark/ops/ops_embedding_test.cpp | 2 +- ark/ops/ops_matmul.cpp | 30 +- ark/ops/ops_test_common.cpp | 10 +- ark/ops/ops_test_common.hpp | 6 +- examples/llama/README.md | 4 +- examples/llama/model_test.py | 88 +- plan_gpu0.json | 2504 ++++++++++++++++++++++++ python/ark/__init__.py | 1 + python/ark/profiler.py | 30 + python/executor_py.cpp | 1 + 19 files changed, 2706 insertions(+), 103 deletions(-) create mode 100644 plan_gpu0.json create mode 100644 python/ark/profiler.py diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index b052040ef..4af9df7c0 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -143,11 +143,13 @@ static size_t tensor_stride_bytes(const Json 
&tensor) { class Executor::Impl { public: - Impl(int rank, int world_size, int gpu_id, const std::string &name, - const std::string &plan); + Impl(int rank, int world_size, int gpu_id, const std::string &name); ~Impl() = default; + void init(const std::string &plan); + int gpu_id() const { return gpu_id_; } + std::string plan() const { return plan_json_.dump_pretty(); } void compile(); void launch(int64_t max_spin_count); @@ -173,11 +175,13 @@ class Executor::Impl { const int rank_; const int world_size_; int gpu_id_; + std::string name_; bool is_launched_ = false; bool is_recording_ = false; float elapsed_msec_ = -1; + PlanJson plan_json_; std::map buffer_id_to_offset_; size_t total_bytes_; std::shared_ptr codegen_; @@ -199,8 +203,8 @@ class Executor::Impl { }; Executor::Impl::Impl(int rank, int world_size, int gpu_id, - const std::string &name, const std::string &plan) - : rank_(rank), world_size_(world_size), gpu_id_(gpu_id) { + const std::string &name) + : rank_(rank), world_size_(world_size), gpu_id_(gpu_id), name_(name) { if (rank < 0 || rank >= world_size) { ERR(InvalidUsageError, "Invalid rank ", rank, " with world size ", world_size); @@ -211,17 +215,18 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, if (world_size_ > 1) { init_communicator(); } +} - Json plan_json; +void Executor::Impl::init(const std::string &plan) { auto &plan_path = get_env().enforce_plan_path; if (!plan_path.empty()) { LOG(INFO, "Enforce executor plan path: ", plan_path); - plan_json = Json::parse(read_file(plan_path)); + plan_json_ = Json::parse(read_file(plan_path)); } else { - plan_json = Json::parse(plan); + plan_json_ = Json::parse(plan); } - buffer_id_to_offset_ = init_buffers(plan_json); + buffer_id_to_offset_ = init_buffers(plan_json_); std::string buffer_id_to_offset_str; for (const auto &kv : buffer_id_to_offset_) { @@ -230,7 +235,7 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, } codegen_ = - std::make_shared(plan_json, buffer_id_to_offset_, name); + std::make_shared(plan_json_, buffer_id_to_offset_, name_); auto gpu_manager = GpuManager::get_instance(gpu_id_); timer_begin_ = gpu_manager->create_event(); @@ -249,13 +254,13 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, static_cast(gpu_manager->info().smem_block_total); if (world_size_ > 1) { - auto remote_ranks = init_remote_ranks(plan_json); + auto remote_ranks = init_remote_ranks(plan_json_); init_channels(remote_ranks); } kernel_ = std::shared_ptr(new GpuKernel( gpu_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1}, - std::max(smem_block_total, size_t(4)), name, + std::max(smem_block_total, size_t(4)), name_, {std::pair{buffer_->ref(), sizeof(buffer_->ref())}, std::pair{flag, sizeof(flag)}})); } @@ -812,13 +817,18 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, Executor::Executor(int rank, int world_size, int gpu_id, const std::string &name, const std::string &plan) - : impl_(std::make_unique(rank, world_size, gpu_id, name, - plan)) {} + : impl_(std::make_unique(rank, world_size, gpu_id, name)) { + if (!plan.empty()) { + impl_->init(plan); + } +} Executor::~Executor() = default; int Executor::gpu_id() const { return impl_->gpu_id(); } +std::string Executor::plan() const { return impl_->plan(); } + void Executor::compile() { impl_->compile(); } void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); } @@ -852,14 +862,17 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, } 
DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id, - const std::string &name) + const std::vector& config_rules, + const std::string& name) : Executor( model.rank(), model.world_size(), (gpu_id < 0) ? (model.rank() % get_env().num_ranks_per_host) : gpu_id, - name, - DefaultPlanner(model, (gpu_id < 0) ? (model.rank() % - get_env().num_ranks_per_host) - : gpu_id) - .plan()) {} + name, "") { + DefaultPlanner planner(model, impl_->gpu_id()); + for (const auto &rule : config_rules) { + planner.install_config_rule(rule); + } + impl_->init(planner.plan()); +} } // namespace ark diff --git a/ark/codegen.cpp b/ark/codegen.cpp index cd6206284..09ff28dd3 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -305,7 +305,8 @@ std::string CodeGenerator::Impl::resource_group( n_slots = total_warps / num_warps_per_task; } if (n_slots == 0) { - ERR(SchedulerError, "not enough resources for task group"); + ERR(SchedulerError, "not enough resources for task group: ", + tg.dump()); } size_t task_b = *task_range.begin(); diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index b8cdaf273..2473e1b14 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -5,6 +5,7 @@ #define ARK_EXECUTOR_HPP #include +#include #include #include #include @@ -24,6 +25,9 @@ class Executor { /// Return the GPU ID. int gpu_id() const; + /// Return the plan string. + std::string plan() const; + /// Compile the model. This must be called before `launch()`. void compile(); @@ -68,7 +72,7 @@ class Executor { void tensor_write(const Tensor tensor, const void *data, size_t bytes, bool is_d2d = false) const; - private: + protected: class Impl; std::unique_ptr impl_; }; @@ -78,6 +82,7 @@ class Model; class DefaultExecutor : public Executor { public: DefaultExecutor(const Model &model, int gpu_id = -1, + const std::vector& config_rules = {}, const std::string &name = "DefaultExecutor"); }; diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h index 97b12e004..858938613 100644 --- a/ark/include/kernels/common/broadcast.h +++ b/ark/include/kernels/common/broadcast.h @@ -186,9 +186,9 @@ struct Broadcast2Intrinsic { (BroadcastInput0 && BroadcastInput1) ? OutNelemPerThread : BroadcastInput0 - ? math::gcd::value + ? math::gcd::value : BroadcastInput1 - ? math::gcd::value + ? math::gcd::value : math::gcd::value>::value; diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp index 0057ef0aa..97ce71967 100644 --- a/ark/model/model_json.cpp +++ b/ark/model/model_json.cpp @@ -272,7 +272,16 @@ static void verify_format_plan(const Json &json) { } } -PlanJson::PlanJson(const Json &json) : Json(json) { verify_format_plan(*this); } +PlanJson::PlanJson(const Json &json) + : Json((json != nullptr) ? 
json + : Json{{"Rank", 0}, + {"WorldSize", 1}, + {"NumProcessors", 1}, + {"NumWarpsPerProcessor", 1}, + {"TaskInfos", Json::array()}, + {"ProcessorGroups", Json::array()}}) { + verify_format_plan(*this); +} static std::stringstream &dump_pretty_plan(const Json &json, std::stringstream &ss, int indent, diff --git a/ark/model/model_json.hpp b/ark/model/model_json.hpp index cf5fbbce2..e42640a9a 100644 --- a/ark/model/model_json.hpp +++ b/ark/model/model_json.hpp @@ -18,7 +18,7 @@ class ModelJson : public Json { class PlanJson : public Json { public: - PlanJson(const Json &json); + PlanJson(const Json &json = nullptr); std::string dump_pretty(int indent = 0, int indent_step = 2) const; }; diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp index 6cdba5d02..b5a0645c8 100644 --- a/ark/model/model_op.cpp +++ b/ark/model/model_op.cpp @@ -202,8 +202,11 @@ std::shared_ptr ModelOp::deserialize(const Json &serialized) { } else if (!serialized.contains("Args")) { ERR(InvalidUsageError, "ModelOp deserialization failed: missing Args"); } + // Run `ModelOpT::from_name` before `construct()` to ensure all operators + // are registered. + auto op_type = ModelOpT::from_name(serialized["Type"]); auto ret = model_op_factory()->construct(serialized["Type"]); - ret->type_ = ModelOpT::from_name(serialized["Type"]); + ret->type_ = op_type; ret->name_ = serialized["Name"]; ret->is_virtual_ = serialized["IsVirtual"]; for (const auto &t : serialized["ReadTensors"]) { diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp index 9e2c6f675..54c6426fa 100644 --- a/ark/ops/ops_all_reduce_test.cpp +++ b/ark/ops/ops_all_reduce_test.cpp @@ -94,7 +94,7 @@ void test_all_reduce_internal(ark::DimType nelem) { auto result = ark::op_test("all_reduce", m, {ones}, {output}, baseline_all_reduce, - {ones_vec.data()}, false, gpu_id, NumGpus); + {ones_vec.data()}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp index 3fdc5ac7e..c7c18b603 100644 --- a/ark/ops/ops_arithmetic_test.cpp +++ b/ark/ops/ops_arithmetic_test.cpp @@ -2,6 +2,7 @@ // Licensed under the MIT license. 
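// Note (wip): test_add_fp16 below is enlarged and pinned to an explicit tile configuration via a DefaultPlanner::ConfigRule, while the remaining arithmetic tests are temporarily commented out in main().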
#include "ops_test_common.hpp" +#include "model/model_json.hpp" template void baseline_add(std::vector &outputs, @@ -142,12 +143,25 @@ ark::unittest::State test_add_fp32() { ark::unittest::State test_add_fp16() { ark::Model m; - ark::Tensor t0 = m.tensor({8192}, ark::FP16); - ark::Tensor t1 = m.tensor({8192}, ark::FP16); + ark::Tensor t0 = m.tensor({32, 2048, 2048}, ark::FP16); + ark::Tensor t1 = m.tensor({32, 2048, 2048}, ark::FP16); ark::Tensor out = m.add(t0, t1); auto result = - ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add); + ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add, {}, + { + ark::DefaultPlanner::ConfigRule([](const std::string op_str, const std::string) { + auto op = ark::Json::parse(op_str); + ark::Json config; + if (op.at("Type") == "Add") { + config["NumWarps"] = 4; + config["SramBytes"] = 0; + config["Tile"] = {128, 256}; + config["NumTasks"] = 4096; + } + return config.dump(); + }) + }); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; @@ -416,20 +430,20 @@ ark::unittest::State test_div_invalid() { int main() { ark::init(); - UNITTEST(test_add_fp32); + // UNITTEST(test_add_fp32); UNITTEST(test_add_fp16); - UNITTEST(test_add_bf16); - UNITTEST(test_add_overwrite); - UNITTEST(test_add_broadcast); - UNITTEST(test_add_invalid); - UNITTEST(test_sub_fp32); - UNITTEST(test_sub_invalid); - UNITTEST(test_mul_fp32); - UNITTEST(test_mul_fp16); - UNITTEST(test_mul_overwrite); - UNITTEST(test_mul_broadcast); - UNITTEST(test_mul_invalid); - UNITTEST(test_div_fp32); - UNITTEST(test_div_invalid); + // UNITTEST(test_add_bf16); + // UNITTEST(test_add_overwrite); + // UNITTEST(test_add_broadcast); + // UNITTEST(test_add_invalid); + // UNITTEST(test_sub_fp32); + // UNITTEST(test_sub_invalid); + // UNITTEST(test_mul_fp32); + // UNITTEST(test_mul_fp16); + // UNITTEST(test_mul_overwrite); + // UNITTEST(test_mul_broadcast); + // UNITTEST(test_mul_invalid); + // UNITTEST(test_div_fp32); + // UNITTEST(test_div_invalid); return ark::unittest::SUCCESS; } diff --git a/ark/ops/ops_embedding_test.cpp b/ark/ops/ops_embedding_test.cpp index 822973106..4f9df046a 100644 --- a/ark/ops/ops_embedding_test.cpp +++ b/ark/ops/ops_embedding_test.cpp @@ -80,7 +80,7 @@ ark::unittest::State test_embedding() { } auto result = ark::op_test("embedding_" + type_str, m, {ti, tw}, {to}, baseline_embedding, - {ti_data.data(), tw_data.data()}, true); + {ti_data.data(), tw_data.data()}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index b259f99c8..b4553a4ed 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -189,45 +189,55 @@ std::vector ModelOpMatmul::impl_args([ } static const Json get_default_config(const ArchRef arch, - const ModelDataType &data_type) { + const ModelDataType &data_type, + const Dims &mnk) { + if (data_type != FP32.ref() && data_type != FP16.ref() && + data_type != BF16.ref()) { + ERR(InvalidUsageError, + "Unsupported data type: ", data_type->type_name()); + } + if (!arch->belongs_to(ARCH_CUDA) && !arch->belongs_to(ARCH_ROCM)) { + ERR(InvalidUsageError, "Unsupported architecture: ", arch->name()); + } + DimType tm = (mnk[0] > mnk[1]) ? 256 : 128; + DimType tn = (mnk[0] > mnk[1]) ? 
128 : 256;
     if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP32.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP16.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 64}}};
+                {"TileShapeMNK", {tm, tn, 64}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == BF16.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 64}}};
+                {"TileShapeMNK", {tm, tn, 64}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP32.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 16}}};
+                {"TileShapeMNK", {tm, tn, 16}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP16.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     }
-    ERR(InvalidUsageError, "Unsupported arch and data type: ", arch->name(),
-        " and ", data_type->type_name());
+    ERR(InternalError, "Unexpected error");
     return {};
 }
 
 Json ModelOpMatmul::default_config(const ArchRef arch) const {
     auto result = result_tensors_[0];
-    Json config = get_default_config(arch, result->data_type());
     check_fields_args(args_, {"TransposeInput", "TransposeOther"});
     Dims mnk = calc_problem_size(read_tensors_[0]->padded_shape(),
                                  read_tensors_[1]->padded_shape(),
                                  args_.at("TransposeInput").value(),
                                  args_.at("TransposeOther").value());
+    Json config = get_default_config(arch, result->data_type(), mnk);
     size_t tile_x = config.at("TileShapeMNK")[0];
     size_t tile_y = config.at("TileShapeMNK")[1];
     if (mnk[0] % tile_x != 0 || mnk[1] % tile_y != 0) {
diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp
index 50317fba7..ad2c208b6 100644
--- a/ark/ops/ops_test_common.cpp
+++ b/ark/ops/ops_test_common.cpp
@@ -36,8 +36,9 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
                       const std::vector<Tensor> &outputs,
                       OpsTestBaseline baseline,
                       const std::vector<void *> &inputs_data,
-                      bool print_on_error, int rank, int world_size) {
-    DefaultExecutor exe(model);
+                      const std::vector<DefaultPlanner::ConfigRule> &config_rules,
+                      bool print_on_error) {
+    DefaultExecutor exe(model, -1, config_rules);
     exe.compile();
 
     std::vector<std::shared_ptr<std::vector<char>>> inputs_data_storages;
@@ -133,7 +134,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
     for (auto t : gt) {
         gt_ptrs.push_back(t->data());
     }
-    baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, rank);
+    baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, model.rank());
 
     std::stringstream test_name;
     test_name << test_name_prefix;
@@ -147,6 +148,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
 
     OpsTestResult result;
     result.test_name = test_name.str();
+    result.plan = exe.plan();
 
     // Compare results with the ground truth.
     for (size_t i = 0; i < outputs.size(); i++) {
@@ -187,7 +189,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
     GLOG(gpuDeviceSynchronize());
 
     // Throughput test.
-    if (world_size > 1) {
+    if (model.world_size() > 1) {
         // For multi-GPU, we need to make sure that all GPUs run the same
         // number of iterations. Rather than doing allgather, we just
         // use a magic number here.
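
Note on the `op_test` changes above: `rank` and `world_size` are now taken from the `Model` itself, and the new `config_rules` argument forwards planner overrides to the `DefaultExecutor`. Below is a minimal sketch of the intended call pattern, following the `DefaultPlanner::ConfigRule` usage in ops_arithmetic_test.cpp above; the `test_mul_fp16_tuned` name, tensor shapes, and `baseline_mul<ark::half_t>` baseline are illustrative, not part of this patch.

```cpp
// Sketch only: route a planner config rule through op_test()'s new
// `config_rules` parameter. A rule receives the op and the arch as JSON
// strings and returns a config JSON string; an empty object ("{}") leaves
// the op on the planner's default config.
ark::unittest::State test_mul_fp16_tuned() {
    ark::Model m;
    ark::Tensor t0 = m.tensor({4096, 4096}, ark::FP16);
    ark::Tensor t1 = m.tensor({4096, 4096}, ark::FP16);
    ark::Tensor out = m.mul(t0, t1);

    auto rule = ark::DefaultPlanner::ConfigRule(
        [](const std::string &op_str, const std::string &) -> std::string {
            auto op = ark::Json::parse(op_str);
            ark::Json config;
            if (op.at("Type") == "Mul") {
                config["NumWarps"] = 4;
                config["SramBytes"] = 0;
                config["Tile"] = {128, 256};
                config["NumTasks"] = 512;  // (4096/128) * (4096/256) tiles
            }
            return config.dump();
        });

    auto result = ark::op_test("mul_fp16_tuned", m, {t0, t1}, {out},
                               baseline_mul<ark::half_t>, /*inputs_data=*/{},
                               /*config_rules=*/{rule});
    UNITTEST_LOG(result);
    return ark::unittest::SUCCESS;
}
```
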
diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp
index 01e97dbb1..a32d9b748 100644
--- a/ark/ops/ops_test_common.hpp
+++ b/ark/ops/ops_test_common.hpp
@@ -10,6 +10,7 @@
 
 #include "ark/model.hpp"
 #include "ark/model_ref.hpp"
+#include "ark/planner.hpp"
 #include "ark/random.hpp"
 #include "bfloat16.h"
 #include "half.h"
@@ -133,6 +134,7 @@ TensorCompareResult tensor_compare(T *ground_truth, T *res, Dims shape,
 
 struct OpsTestResult {
     std::string test_name;
+    std::string plan;
     int iter;
     float msec_per_iter;
     std::vector<float> mse;
@@ -170,8 +172,8 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
                       const std::vector<Tensor> &outputs,
                       OpsTestBaseline baseline,
                       const std::vector<void *> &inputs_data = {},
-                      bool print_on_error = false, int rank = 0,
-                      int world_size = 1);
+                      const std::vector<DefaultPlanner::ConfigRule> &config_rules = {},
+                      bool print_on_error = false);
 
 OpsTestGpuMem to_gpu(void *host_ptr, size_t size);
diff --git a/examples/llama/README.md b/examples/llama/README.md
index 090dd1de3..1fe040ae0 100644
--- a/examples/llama/README.md
+++ b/examples/llama/README.md
@@ -29,10 +29,10 @@ Llama2 examples over ARK.
 4. Download Llama2 model weights and tokenizer weights.
     * The model and tokenizer should be compatible with the [official PyTorch implementation](https://github.com/facebookresearch/llama/blob/main/llama).
 
-5. Run the model accuracy test. `--pth_path` is the path to the model weights file (`consolidated.00.pth`).
+5. Run the model accuracy test. `--ckpt_dir` is the directory containing the model weight files (e.g., `consolidated.00.pth`).
 
     ```bash
-    python3 model_test.py --pth_path=/path/to/model/weights.pth
+    python3 model_test.py --ckpt_dir=/directory/of/model/weights
     ```
 
 6. Test text generation. `--pth_path` is the path to the model weights file (`consolidated.00.pth`), `--tok_path` is the path to the tokenizer weights file (`tokenizer.model`), and `--params_path` is the path to the model parameters (`params.json`).
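
The `plan` string added to `OpsTestResult` above is the plan JSON compiled by the `DefaultExecutor`, in the same format as the `plan_gpu0.json` file added later in this patch. A hedged sketch of persisting it after a test run (the `dump_plan` helper and the file name are illustrative, not part of this patch):

```cpp
#include <fstream>
#include <string>

// Sketch only: dump the plan captured by op_test() to a file so it can be
// inspected offline, or replayed via a plan-path launch like the one used
// in examples/llama/model_test.py below.
void dump_plan(const ark::OpsTestResult &result, const std::string &path) {
    std::ofstream ofs(path);  // e.g., "plan_gpu0.json" (illustrative)
    ofs << result.plan;
}
```
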
diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py
index 737d3ec8b..585341640 100644
--- a/examples/llama/model_test.py
+++ b/examples/llama/model_test.py
@@ -58,30 +58,34 @@ def run_ark(
     ]
     output = module(*module_inputs)
 
-    runtime = ark.Runtime()
-    # Prefer num_warps_per_sm = 16 for nvidia and 8 for amd
-    runtime.launch(num_warps_per_sm=8)
+    with ark.Runtime() as rt:
+        rt.launch(plan_path="/mnt/changhohwang/ark/plan_gpu0.json")
 
-    # Load model parameters
-    if state_dict:
-        module.load_state_dict(state_dict)
+        # Load model parameters
+        if state_dict:
+            print("Loading state_dict")
+            module.load_state_dict(state_dict)
+            print("Loading state_dict done")
 
-    # Load input data into tensors
-    tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)]
-    tensor_data = [i for i in inputs if isinstance(i, np.ndarray)]
-    for tensor, ndarray in zip(tensors, tensor_data):
-        tensor.from_numpy(ndarray)
+        # Load input data into tensors
+        tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)]
+        tensor_data = [i for i in inputs if isinstance(i, np.ndarray)]
+        for tensor, ndarray in zip(tensors, tensor_data):
+            tensor.from_numpy(ndarray)
 
-    start_time = time.time()
+        start_time = time.time()
 
-    # Run the model
-    runtime.run(iter=iterations)
+        # Run the model
+        print("Run:", iterations)
 
-    end_time = time.time()
+        rt.run(iter=iterations)
+        print("Run done")
 
-    if isinstance(output, list) or isinstance(output, tuple):
-        outputs = [o.to_numpy() for o in output]
-    outputs = [output.to_numpy()]
+        end_time = time.time()
+
+        if isinstance(output, list) or isinstance(output, tuple):
+            outputs = [o.to_numpy() for o in output]
+        else:
+            outputs = [output.to_numpy()]
 
     return RunResults(outputs=outputs, runtime=end_time - start_time)
 
@@ -160,7 +164,9 @@ def test_module(
     else:
         prefix = module_name_prefix + "."
if module_name_prefix else "" # Load the state_dict from the given path + print("Loading ckpt:", ckpt_path) state_dict_pt = torch.load(ckpt_path) + print("Loading ckpt done") state_dict_pt = { k[len(prefix) :]: v for k, v in state_dict_pt.items() @@ -182,6 +188,7 @@ def test_module( rank=rank, world_size=world_size, ) + print("Run ARK done") if not test_thru_ark_only: # PyTorch module @@ -195,6 +202,7 @@ def test_module( inputs_pt, iterations=test_thru_iterations if test_thru else 1, ) + print("Run PyTorch done") if test_thru: print( @@ -447,26 +455,26 @@ def test_transformer_block( ) output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) - ark.Model.get_model().create_nodes() - print(ark.Model.get_model().serialize()) - - # test_module( - # module_class_ark=model_ark.TransformerBlock, - # module_args_ark=[ - # 0, - # args, - # ark.DataType.from_numpy(dtype), - # rank, - # world_size, - # ], - # inputs_ark=[feature, 0, freqs_cis_ark, None], - # module_class_pt=model_pt.TransformerBlock, - # module_args_pt=[0, args], - # inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], - # module_name_prefix="layers.0", - # rank=rank, - # world_size=world_size, - # ) + # print(ark.Model.get_model().serialize()) + + test_module( + module_class_ark=model_ark.TransformerBlock, + module_args_ark=[ + 0, + args, + ark.DataType.from_numpy(dtype), + rank, + world_size, + ], + inputs_ark=[feature, 0, freqs_cis_ark, None], + module_class_pt=model_pt.TransformerBlock, + module_args_pt=[0, args], + inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], + module_name_prefix="layers.0", + rank=rank, + world_size=world_size, + test_thru=True, + ) def test_transformer( @@ -570,7 +578,7 @@ def worker( # Configurations args = ModelArgs7B() batch_size = 1 - seq_len = 512 + seq_len = 2048 dtype = np.float16 world_size = ngpus @@ -578,7 +586,7 @@ def worker( args.vocab_size = 32000 # Reduce max_seq_len due to OOM from the PyTorch model - args.max_seq_len = 512 + args.max_seq_len = 2048 # Verify the configurations assert batch_size <= args.max_batch_size diff --git a/plan_gpu0.json b/plan_gpu0.json new file mode 100644 index 000000000..49b6bdd98 --- /dev/null +++ b/plan_gpu0.json @@ -0,0 +1,2504 @@ +{ + "Rank": 0, + "WorldSize": 1, + "NumProcessors": 304, + "NumWarpsPerProcessor": 4, + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope", + "IsVirtual": false, + 
"ReadTensors": [ + {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [32,128], + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose", + "IsVirtual": false, + "ReadTensors": [ + {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [32,128], + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,3,1]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 4, + "SramBytes": 24672, + 
"Ops": [ + { + "Type": "Matmul", + "Name": "matmul_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "ScalarMul", + "Name": "mul", + "IsVirtual": false, + "ReadTensors": [ + {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Factor": {"FLOAT":0.0883883461356163} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMax", + "Name": "reduce_max", + "IsVirtual": false, + "ReadTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sub", + "Name": "sub", + "IsVirtual": false, + "ReadTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Exp", + "Name": "exp", + "IsVirtual": false, + "ReadTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceSum", + "Name": "reduce_sum", + "IsVirtual": false, + "ReadTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Div", + "Name": "div", + "IsVirtual": false, + "ReadTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 15, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_4", + "IsVirtual": 
false, + "ReadTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [256,128,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 17, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 18, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast", + "IsVirtual": false, + "ReadTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 19, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 20, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMean", + "Name": "reduce_mean", + "IsVirtual": false, + "ReadTensors": [ + {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":2}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 21, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rsqrt", + "Name": "rsqrt", + "IsVirtual": false, + "ReadTensors": [ + {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64,1], + "NumTasks": 32 + } + } + ] + }, + { + "Id": 22, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_2", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 23, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 24, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 25, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + 
{"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 26, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 27, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 28, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 29, + "NumWarps": 1, + 
"SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 30, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,3,1]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 31, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 32, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 33, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_9", + "IsVirtual": false, + "ReadTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 34, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "ScalarMul", + "Name": "mul_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Factor": {"FLOAT":0.0883883461356163} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 35, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMax", + "Name": "reduce_max_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": 
"WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 36, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sub", + "Name": "sub_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 37, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Exp", + "Name": "exp_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 38, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceSum", + "Name": "reduce_sum_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 39, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Div", + "Name": "div_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 40, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_10", + "IsVirtual": false, + "ReadTensors": [ + {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [256,128,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 41, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,128], + "NumTasks": 8192 + } + } + ] + }, + { + "Id": 42, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_11", + "IsVirtual": false, + "ReadTensors": [ + {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + 
"Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 43, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add", + "IsVirtual": false, + "ReadTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 44, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 45, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 46, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMean", + "Name": "reduce_mean_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + 
{"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "Axis": {"INT":2}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 47, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rsqrt", + "Name": "rsqrt_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64,1], + "NumTasks": 32 + } + } + ] + }, + { + "Id": 48, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 49, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 50, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_3", + 
"IsVirtual": false, + "ReadTensors": [ + {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 51, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_12", + "IsVirtual": false, + "ReadTensors": [ + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 52, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sigmoid", + "Name": "sigmoid", + "IsVirtual": false, + "ReadTensors": [ + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 53, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + 
{"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 54, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_13", + "IsVirtual": false, + "ReadTensors": [ + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 55, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_9", + "IsVirtual": false, + "ReadTensors": [ + {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 56, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_14", + "IsVirtual": false, + "ReadTensors": [ + {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 
24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 57, + "NumWarps": 4, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}, + {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "WriteTensors": [ + {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "ResultTensors": [ + {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,256], + "NumTasks": 256 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,2048],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,8192],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,2048],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,8192],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,8192],"Granularity":4} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":10,"TaskRange":[0,65536],"Granularity":1} + ] + 
} + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":11,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":12,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":13,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":14,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":15,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":16,"TaskRange":[0,8192],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":17,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":18,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":19,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":20,"TaskRange":[0,2048],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,32], + "ResourceGroups": [ + { + "ProcessorRange": [0,32], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":21,"TaskRange":[0,32],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":22,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":23,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":24,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":25,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":26,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + 
"ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":27,"TaskRange":[0,8192],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":28,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":29,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":30,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":31,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":32,"TaskRange":[0,8192],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":33,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":34,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":35,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":36,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":37,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":38,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":39,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":40,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":41,"TaskRange":[0,8192],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":42,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + 
"ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":43,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":44,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":45,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":46,"TaskRange":[0,2048],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,32], + "ResourceGroups": [ + { + "ProcessorRange": [0,32], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":47,"TaskRange":[0,32],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":48,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":49,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":50,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":51,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":52,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":53,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":54,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":55,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":56,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":57,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + } + ] +} diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 031afc7ba..f2f604be9 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -100,3 +100,4 @@ def set_world_size(world_size): GpuError, RuntimeError, ) +from .profiler import 
Profiler diff --git a/python/ark/profiler.py b/python/ark/profiler.py new file mode 100644 index 000000000..b959ceb18 --- /dev/null +++ b/python/ark/profiler.py @@ -0,0 +1,30 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import json +import sys +import time +from .runtime import Runtime + + +class Profiler: + def __init__(self, plan: str): + self.plan = json.loads(plan) + + def run(self): + num_processor_groups = len(self.plan["ProcessorGroups"]) + new_plan = { + "Rank": self.plan["Rank"], "WorldSize": self.plan["WorldSize"], + "NumProcessors": self.plan["NumProcessors"], + "NumWarpsPerProcessor": self.plan["NumWarpsPerProcessor"], + "TaskInfos": self.plan["TaskInfos"], + "ProcessorGroups": [{}]} + for i in range(num_processor_groups): + new_plan["ProcessorGroups"][0] = self.plan["ProcessorGroups"][i] + with Runtime() as rt: + rt.launch(plan=json.dumps(new_plan)) + start_time = time.time() + iter = 1000 + rt.run(iter=iter) + end_time = time.time() + sys.stderr.write(f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n") diff --git a/python/executor_py.cpp b/python/executor_py.cpp index e5ab4f964..a6e5308ee 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -149,6 +149,7 @@ void register_executor(py::module &m) { py::arg("rank"), py::arg("world_size"), py::arg("gpu_id"), py::arg("name"), py::arg("plan")) .def("gpu_id", &ark::Executor::gpu_id) + .def("plan", &ark::Executor::plan) .def("compile", &ark::Executor::compile) .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1) .def("run", &ark::Executor::run, py::arg("iter")) From ff8c4b8fc4ff178befa375ffc8ac546806fa6c4b Mon Sep 17 00:00:00 2001 From: Noli Gerawork <86308445+naturalcandy@users.noreply.github.com> Date: Tue, 2 Jul 2024 21:25:07 -0400 Subject: [PATCH 020/106] torch to ark (#217) - Adds Torch to ARK tensor conversion support - New ModelBufferManager class handles external buffer registration and simplifies buffer access during kernel initialization - Adds test cases for ARK to Torch conversion support --------- Co-authored-by: Changho Hwang --- ark/api/executor.cpp | 53 ++++++++++++++++--- ark/api/tensor.cpp | 18 ++++++- ark/codegen.cpp | 36 +++++++++---- ark/codegen.hpp | 4 +- ark/include/ark/tensor.hpp | 2 + ark/model/model_buffer.cpp | 55 ++++++++++++++++++-- ark/model/model_buffer.hpp | 15 ++++++ ark/model_buffer_manager.hpp | 58 +++++++++++++++++++++ python/ark/tensor.py | 26 +++++----- python/tensor_py.cpp | 46 ++++++++++++++++- python/unittest/test_conversion.py | 81 +++++++++++++++++++++++++++++- 11 files changed, 355 insertions(+), 39 deletions(-) create mode 100644 ark/model_buffer_manager.hpp diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 4af9df7c0..0a780bcc0 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "ark/data_type.hpp" #include "ark/model.hpp" @@ -24,6 +25,7 @@ #include "gpu/gpu_manager.h" #include "logging.h" #include "model/model_buffer.hpp" +#include "model_buffer_manager.hpp" #include "model/model_data_type.hpp" #include "model/model_tensor.hpp" #include "utils/utils_net.hpp" @@ -234,8 +236,15 @@ void Executor::Impl::init(const std::string &plan) { std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", "; } - codegen_ = - std::make_shared(plan_json_, buffer_id_to_offset_, name_); + ModelBufferManager &buffer_manager = ModelBufferManager::get_instance(); + + if (!buffer_manager.is_empty()) { + codegen_ = 
std::make_shared<CodeGenerator>(
+            plan_json_, buffer_id_to_offset_, name, &buffer_manager);
+    } else {
+        codegen_ = std::make_shared<CodeGenerator>(plan_json_,
+                                                   buffer_id_to_offset_, name);
+    }
 
     auto gpu_manager = GpuManager::get_instance(gpu_id_);
     timer_begin_ = gpu_manager->create_event();
@@ -367,7 +376,16 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             }
             continue;
         }
-        buffer_id_to_offset[buf_info->buffer->id()] = offset;
+        if (buf_info->buffer->is_external()) {
+            if (buf_info->buffer->device_id() != gpu_id_) {
+                ERR(InvalidUsageError,
+                    "PyTorch tensor and model execution are on different GPUs");
+            }
+            continue;
+        } else {
+            buffer_id_to_offset[buf_info->buffer->id()] = offset;
+            offset += buf_info->bytes;
+        }
         for (const auto &tag_info : buf_info->buffer->send_tags()) {
             remote_rank_to_send_tags_and_offsets[tag_info.first]
                 .first.push_back(tag_info.second);
@@ -380,7 +398,6 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
             remote_rank_to_recv_tags_and_offsets[tag_info.first]
                 .second.push_back(offset);
         }
-        offset += buf_info->bytes;
     }
     total_bytes_ = offset;
@@ -456,7 +473,11 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 1);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 2);
         for (int i = 0; i < len; ++i) {
-            buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] = offsets[i];
+            if (!buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]
+                     ->buffer->is_external()) {
+                buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] =
+                    offsets[i];
+            }
         }
     }
     for (auto &kv : remote_rank_to_recv_tag_to_buffer_id) {
@@ -472,10 +493,13 @@ std::map<size_t, size_t> Executor::Impl::init_buffers(const Json &plan_json) {
         bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 4);
         bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 5);
         for (int i = 0; i < len; ++i) {
-            buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] = offsets[i];
+            if (!buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]
+                     ->buffer->is_external()) {
+                buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] =
+                    offsets[i];
+            }
         }
     }
-
     return buffer_id_to_offset;
 }
 
@@ -742,6 +766,11 @@ uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const {
 
 void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes,
                                  bool is_d2d) const {
     GLOG(gpuSetDevice(gpu_id_));
+    if (tensor.ref()->buffer()->is_external()) {
+        ERR(InvalidUsageError,
+            "Reading data from a tensor preallocated by PyTorch is not "
+            "supported. Use PyTorch's native methods.");
+    }
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
     if (bytes != tensor_data_bytes) {
@@ -779,6 +808,11 @@ void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes,
 void Executor::Impl::tensor_write(const Tensor tensor, const void *data,
                                   size_t bytes, bool is_d2d) const {
     GLOG(gpuSetDevice(gpu_id_));
+    if (tensor.ref()->buffer()->is_external()) {
+        ERR(InvalidUsageError,
+            "Writing data to a tensor preallocated by PyTorch is not "
+            "supported. 
Use PyTorch's native methods.");
+    }
     size_t tensor_data_bytes =
         tensor.shape().nelems() * tensor.data_type().bytes();
     if (bytes != tensor_data_bytes) {
@@ -843,7 +877,10 @@ float Executor::stop(int64_t max_spin_count) {
 
 void Executor::barrier() { impl_->barrier(); }
 
-void Executor::destroy() { impl_.reset(nullptr); }
+void Executor::destroy() {
+    ModelBufferManager::get_instance().clear_buffers();
+    impl_.reset(nullptr);
+}
 
 bool Executor::destroyed() const { return impl_.get() == nullptr; }
 
diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp
index 4b03c3ac8..4d33bd9f1 100644
--- a/ark/api/tensor.cpp
+++ b/ark/api/tensor.cpp
@@ -3,11 +3,25 @@
 
 #include "ark/tensor.hpp"
 
+#include "model/model_buffer.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_tensor.hpp"
 
 namespace ark {
 
+Tensor::Tensor(void* data_ptr, int32_t device_id,
+               const std::vector<int64_t>& shape,
+               const DataType& dtype) {
+    size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1,
+                                                std::multiplies<int64_t>()) *
+                                dtype.bytes();
+    auto buffer =
+        std::make_shared<ModelBuffer>(data_ptr, external_data_size, device_id);
+    auto tensor = std::make_shared<ModelTensor>(dtype.ref(), buffer,
+                                                Dims(shape), Dims(shape),
+                                                Dims(), Dims());
+    ref_ = tensor;
+}
+
 size_t Tensor::id() const {
     if (ref_) {
         return ref_->id();
@@ -43,14 +57,14 @@ Dims Tensor::padded_shape() const {
     return Dims();
 }
 
-const DataType &Tensor::data_type() const {
+const DataType& Tensor::data_type() const {
     if (ref_) {
         return DataType::from_name(ref_->data_type()->type_name());
     }
     return NONE;
 }
 
-std::ostream &operator<<(std::ostream &os, const Tensor &tensor) {
+std::ostream& operator<<(std::ostream& os, const Tensor& tensor) {
     if (tensor.is_null()) {
         os << "null";
     } else {
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index 09ff28dd3..a97e5e45b 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -10,6 +10,7 @@
 #include "file_io.h"
 #include "logging.h"
 #include "model/model_buffer.hpp"
+#include "model_buffer_manager.hpp"
 #include "model/model_data_type.hpp"
 #include "model/model_op.hpp"
 #include "model/model_tensor.hpp"
@@ -43,7 +44,7 @@ class CodeGenerator::Impl {
    public:
     Impl(const PlanJson &plan,
          const std::map<size_t, size_t> &buffer_id_to_offset,
-         const std::string &name);
+         const std::string &name, ModelBufferManager *buffer_manager);
     ~Impl() = default;
 
    private:
@@ -64,6 +65,8 @@ class CodeGenerator::Impl {
 
     std::string sync_process_range(const Range &ranges, int state_id);
 
+    ModelBufferManager *buffer_manager_;
+
   protected:
     friend class CodeGenerator;
 
@@ -78,14 +81,18 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
                           const std::map<size_t, size_t> &buffer_id_to_offset,
-                          const std::string &name)
-    : buffer_id_to_offset_(buffer_id_to_offset), name_(name) {
+                          const std::string &name,
+                          ModelBufferManager *buffer_manager)
+    : buffer_id_to_offset_(buffer_id_to_offset),
+      name_(name),
+      buffer_manager_(buffer_manager) {
     rank_ = plan.at("Rank");
     world_size_ = plan.at("WorldSize");
     num_procs_ = plan.at("NumProcessors");
     num_warps_per_proc_ = plan.at("NumWarpsPerProcessor");
 
     std::stringstream definitions_ss;
+
     for (auto &task_json : plan.at("TaskInfos")) {
         definitions_ss << this->def_task(task_json);
     }
@@ -224,11 +231,19 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
         auto &arg = impl_args[i];
         if (arg.type_name() == "TENSOR") {
             auto tns = arg.value<ModelTensorRef>();
-            size_t buffer_offset =
-                buffer_id_to_offset_.at(tns->buffer()->id());
-            size_t offset = buffer_offset + ModelOffset(tns).value();
-            ss << "(" << tns->data_type()->type_str() << "*)&_buf["
-               << offset << 
"]"; + if (tns->buffer()->is_external()) { + void *buf_addr = + ModelBufferManager::get_instance().get_buffer( + tns->buffer()->id()); + ss << "(" << tns->data_type()->type_str() << "*)" + << buf_addr; + } else { + size_t buffer_offset = + buffer_id_to_offset_.at(tns->buffer()->id()); + size_t offset = buffer_offset + ModelOffset(tns).value(); + ss << "(" << tns->data_type()->type_str() << "*)&_buf[" + << offset << "]"; + } } else if (arg.type_name() == "OFFSET") { auto moff = arg.value(); size_t buffer_offset = @@ -431,8 +446,9 @@ std::string CodeGenerator::Impl::sync_process_range(const Range &range, CodeGenerator::CodeGenerator( const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name) - : impl_(std::make_shared(plan, buffer_id_to_offset, name)) {} + const std::string &name, ModelBufferManager *buffer_manager) + : impl_(std::make_shared(plan, buffer_id_to_offset, name, + buffer_manager)) {} std::string CodeGenerator::code() const { return impl_->code_; } diff --git a/ark/codegen.hpp b/ark/codegen.hpp index 4f8307e7e..a2976e644 100644 --- a/ark/codegen.hpp +++ b/ark/codegen.hpp @@ -8,6 +8,7 @@ #include #include +#include "model_buffer_manager.hpp" #include "model/model_json.hpp" namespace ark { @@ -16,7 +17,8 @@ class CodeGenerator { public: CodeGenerator(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::string &name = "ark_kernel"); + const std::string &name = "ark_kernel", + ModelBufferManager *buffer_manager = nullptr); ~CodeGenerator() = default; diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp index 747ce5fea..d13748175 100644 --- a/ark/include/ark/tensor.hpp +++ b/ark/include/ark/tensor.hpp @@ -31,6 +31,8 @@ class Tensor { Tensor(ModelTensorRef ref) : ref_(ref) {} Tensor(const Tensor &other) = default; Tensor &operator=(const Tensor &other) = default; + Tensor(void *data_ptr, int32_t device_id, const std::vector &shape, + const DataType &dtype); bool operator==(const Tensor &other) const { return ref_ == other.ref_; } bool operator!=(const Tensor &other) const { return ref_ != other.ref_; } diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp index 4ce91b5e4..ce8f37727 100644 --- a/ark/model/model_buffer.cpp +++ b/ark/model/model_buffer.cpp @@ -4,13 +4,13 @@ #include "model_buffer.hpp" #include "logging.h" +#include "model_buffer_manager.hpp" namespace ark { -ModelBuffer::ModelBuffer(int rank) : rank_(rank) { - static size_t id = 0; - id_ = id++; -} +size_t ModelBuffer::curr_id = 0; + +ModelBuffer::ModelBuffer(int rank) : rank_(rank) { id_ = curr_id++; } ModelBuffer::ModelBuffer(size_t id, int rank, const std::vector &send_tags, @@ -24,6 +24,23 @@ ModelBuffer::ModelBuffer(size_t id, int rank, } } +ModelBuffer::ModelBuffer(void *data, size_t size, int32_t device_id) + : rank_(-1), + external_data_(data), + external_data_size_(size), + device_id_(device_id), + is_external_(true) { + id_ = curr_id++; +} + +ModelBuffer::ModelBuffer(size_t id, void *data, size_t size, int32_t device_id) + : id_(id), + rank_(-1), + external_data_(data), + external_data_size_(size), + device_id_(device_id), + is_external_(true) {} + void ModelBuffer::tag_send(int remote_rank, int tag) { send_tags_.insert(TagInfo{remote_rank, tag}); } @@ -46,6 +63,14 @@ Json ModelBuffer::serialize() const { } j["SendTags"] = send_tags; j["RecvTags"] = recv_tags; + j["IsExternal"] = is_external_; + if (is_external_) { + ModelBufferManager::get_instance().register_buffer(id_, external_data_, + external_data_size_); + j["ExternalDataSize"] = 
external_data_size_;
+        j["DeviceId"] = device_id_;
+    }
+    // The external data pointer (external_data_) itself is not serialized.
     return j;
 }
 
@@ -62,6 +87,28 @@ std::shared_ptr<ModelBuffer> ModelBuffer::deserialize(const Json &serialized) {
     } else if (!serialized.contains("RecvTags")) {
         ERR(InvalidUsageError,
             "ModelBuffer deserialization failed: missing RecvTags");
+    } else if (!serialized.contains("IsExternal")) {
+        ERR(InvalidUsageError,
+            "ModelBuffer deserialization failed: missing IsExternal");
+    }
+    if (serialized["IsExternal"]) {
+        if (!serialized.contains("ExternalDataSize")) {
+            ERR(InvalidUsageError,
+                "ModelBuffer deserialization failed: missing ExternalDataSize");
+        } else if (!serialized.contains("DeviceId")) {
+            ERR(InvalidUsageError,
+                "ModelBuffer deserialization failed: missing DeviceId");
+        }
+        void *data_ptr =
+            ModelBufferManager::get_instance().get_buffer(serialized["Id"]);
+        if (!data_ptr) {
+            ERR(InvalidUsageError,
+                "ModelBuffer deserialization failed: external buffer not found "
+                "in BufferManager");
+        }
+        return std::make_shared<ModelBuffer>(serialized["Id"], data_ptr,
+                                             serialized["ExternalDataSize"],
+                                             serialized["DeviceId"]);
+    }
     return std::make_shared<ModelBuffer>(serialized["Id"], serialized["Rank"],
                                          serialized["SendTags"],
diff --git a/ark/model/model_buffer.hpp b/ark/model/model_buffer.hpp
index 7ad3db206..e7f1045b2 100644
--- a/ark/model/model_buffer.hpp
+++ b/ark/model/model_buffer.hpp
@@ -22,6 +22,10 @@ class ModelBuffer {
     ModelBuffer(size_t id, int rank, const std::vector<TagInfo> &send_tags,
                 const std::vector<TagInfo> &recv_tags);
 
+    // externally managed buffer
+    ModelBuffer(void *data, size_t size, int32_t device_id);
+    ModelBuffer(size_t id, void *data, size_t size, int32_t device_id);
+
     size_t id() const { return id_; }
 
     int rank() const { return rank_; }
@@ -44,11 +48,22 @@ class ModelBuffer {
 
     static std::shared_ptr<ModelBuffer> deserialize(const Json &serialized);
 
+    // external buffer management
+    size_t external_data_size() const { return external_data_size_; }
+    void *external_data() const { return external_data_; }
+    int32_t device_id() const { return device_id_; }
+    bool is_external() const { return is_external_; }
+
   private:
+    static size_t curr_id;
     size_t id_;
     int rank_;
     std::set<TagInfo> send_tags_;
     std::set<TagInfo> recv_tags_;
+    void *external_data_ = nullptr;
+    size_t external_data_size_ = 0;
+    int32_t device_id_;
+    bool is_external_ = false;
 };
 
 }  // namespace ark
diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp
new file mode 100644
index 000000000..7b705f4c8
--- /dev/null
+++ b/ark/model_buffer_manager.hpp
@@ -0,0 +1,58 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#ifndef ARK_MODEL_BUFFER_MANAGER_HPP_
+#define ARK_MODEL_BUFFER_MANAGER_HPP_
+
+#include <tuple>
+#include <unordered_map>
+
+namespace ark {
+// Manages externally allocated buffers not in the ARK memory space.
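+// This is a process-wide singleton: external buffers are registered here when
+// an external ModelBuffer is serialized, looked up by buffer ID during code
+// generation and deserialization, and released via clear_buffers() when the
+// executor is destroyed.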
+class ModelBufferManager {
+   public:
+    static ModelBufferManager& get_instance() {
+        static ModelBufferManager instance;
+        return instance;
+    }
+
+    void register_buffer(size_t id, void* data, size_t size) {
+        buffers_[id] = std::make_tuple(data, size);
+    }
+
+    void* get_buffer(size_t id) {
+        auto it = buffers_.find(id);
+        if (it != buffers_.end()) {
+            return std::get<0>(it->second);
+        }
+        return nullptr;
+    }
+
+    size_t get_buffer_size(size_t id) {
+        auto it = buffers_.find(id);
+        if (it != buffers_.end()) {
+            return std::get<1>(it->second);
+        }
+        return 0;
+    }
+
+    const std::unordered_map<size_t, std::tuple<void*, size_t>>& get_buffers()
+        const {
+        return buffers_;
+    }
+
+    void clear_buffers() { buffers_.clear(); }
+
+    bool is_empty() const { return buffers_.empty(); }
+
+   private:
+    std::unordered_map<size_t, std::tuple<void*, size_t>>
+        buffers_;  // Maps buffer IDs to pointers and sizes.
+    size_t next_compact_id_ = 0;
+    ModelBufferManager() {}
+    ModelBufferManager(const ModelBufferManager&) = delete;
+    ModelBufferManager& operator=(const ModelBufferManager&) = delete;
+};
+}  // namespace ark
+
+#endif  // ARK_MODEL_BUFFER_MANAGER_HPP_
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index ac2886960..8f26dc96e 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -167,18 +167,20 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
         return self
 
     @staticmethod
-    def from_torch(tensor: torch.Tensor):
-        return Tensor(
-            Model.get_model().tensor(
-                Dims(list(tensor.shape)),
-                DataType.from_torch(tensor.dtype).ctype(),
-                Dims(),
-                Dims(),
-                Dims(),
-                "",
-            ),
-            lambda: tensor,
-        )
+    def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor":
+        """
+        Returns an ARK tensor that shares the same memory with the torch tensor.
+        """
+        if _no_torch:
+            raise ImportError("torch is not available")
+        elif not tensor.is_contiguous():
+            raise ValueError("Torch tensor must be contiguous.")
+        elif tensor.device.type == "cpu":
+            raise ValueError("Torch tensor must be on a GPU, not the CPU.")
+        ark_dtype = DataType.from_torch(tensor.dtype)
+        dl_capsule = torch.utils.dlpack.to_dlpack(tensor)
+        ark_tensor = _Tensor(dl_capsule, ark_dtype.ctype())
+        return Tensor(ark_tensor, runtime_id=runtime_id)
 
     def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
         """
diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp
index fbd909d3d..16eb03421 100644
--- a/python/tensor_py.cpp
+++ b/python/tensor_py.cpp
@@ -1,6 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.
+#include <dlpack/dlpack.h>
 #include 
 #include 
 #include 
@@ -9,8 +10,51 @@
 
 namespace py = pybind11;
 
-void register_tensor(py::module &m) {
+struct DLTensorMetadata {
+    void* data_ptr;
+    int32_t device_id;
+    DLDeviceType device_type;
+    int32_t ndim;
+    DLDataType dtype;
+    std::vector<int64_t> shape;
+    std::vector<int64_t> strides;
+    uint64_t byte_offset;
+};
+
+static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor* dl_tensor) {
+    DLTensorMetadata metadata;
+    metadata.data_ptr = dl_tensor->dl_tensor.data;
+    metadata.device_id = dl_tensor->dl_tensor.device.device_id;
+    metadata.device_type = dl_tensor->dl_tensor.device.device_type;
+    metadata.ndim = dl_tensor->dl_tensor.ndim;
+    metadata.dtype = dl_tensor->dl_tensor.dtype;
+    metadata.shape.assign(
+        dl_tensor->dl_tensor.shape,
+        dl_tensor->dl_tensor.shape + dl_tensor->dl_tensor.ndim);
+    if (dl_tensor->dl_tensor.strides != nullptr) {
+        metadata.strides.assign(
+            dl_tensor->dl_tensor.strides,
+            dl_tensor->dl_tensor.strides + dl_tensor->dl_tensor.ndim);
+    }
+    metadata.byte_offset = dl_tensor->dl_tensor.byte_offset;
+    return metadata;
+}
+
+void register_tensor(py::module& m) {
     py::class_<ark::Tensor>(m, "_Tensor")
+        .def(py::init([](py::capsule capsule, const ark::DataType& dtype) {
+            DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule;
+            if (!dl_tensor) {
+                throw std::runtime_error(
+                    "Capsule does not contain a DLManagedTensor");
+            }
+            DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor);
+            int32_t device_id = metadata.device_id;
+            void* data_ptr = metadata.data_ptr;
+            auto shape = metadata.shape;
+
+            return new ark::Tensor(data_ptr, device_id, shape, dtype);
+        }))
        .def("id", &ark::Tensor::id)
        .def("shape", &ark::Tensor::shape, py::return_value_policy::reference)
        .def("strides", &ark::Tensor::strides,
diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py
index 5befa1c34..833b88662 100644
--- a/python/unittest/test_conversion.py
+++ b/python/unittest/test_conversion.py
@@ -1,6 +1,7 @@
 import pytest
 import numpy as np
 import ark
+from typing import Callable
 
 try:
     import torch
@@ -9,6 +10,8 @@ except ImportError:
     _no_torch = True
 
+# ARK to Torch tests
+
 
 def initialize_tensor(dimensions, dtype):
     tensor = ark.tensor(dimensions, dtype)
@@ -69,7 +72,7 @@ def check_diff(input_tensor_host, input_view_numpy, value, index):
 
 # Test function to check if changes to the torch views are reflected in the original tensors
 @pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32])
-def test_aliasing(dtype: ark.DataType):
+def test_ark_to_torch_aliasing(dtype: ark.DataType):
     ark.init()
     dimensions = [4, 4]
     input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype)
@@ -126,3 +129,79 @@ def test_conversion_torch():
     torch_tensor = t.to_torch()
 
     assert torch.all(torch_tensor == 7)
+
+
+# Torch to ARK tests
+
+ArkBinOp = Callable[[ark.Tensor, ark.Tensor], ark.Tensor]
+TorchBinOp = Callable[[torch.Tensor, torch.Tensor], torch.Tensor]
+ArkUnOp = Callable[[ark.Tensor], ark.Tensor]
+TorchUnOp = Callable[[torch.Tensor], torch.Tensor]
+
+
+# Verify the accuracy of binary operations involving ARK view tensors
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.add, torch.add, (2, 3))],
+)
+def test_bin_op(dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims):
+    ark.init()
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    expected_output = torch_op(input_tensor, other_tensor).cpu().numpy()
+    input_ark_view = 
ark.Tensor.from_torch(input_tensor)
+    other_ark_view = ark.Tensor.from_torch(other_tensor)
+    output = ark_op(input_ark_view, other_ark_view)
+    runtime = ark.Runtime()
+    runtime.launch()
+    runtime.run()
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)
+
+
+# Verify the accuracy of unary operations involving ARK view tensors
+@pytest.mark.parametrize(
+    "dtype, ark_op, torch_op, tensor_dims",
+    [(torch.float16, ark.exp, torch.exp, (3, 3))],
+)
+def test_unary_op(dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims):
+    ark.init()
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    expected_output = torch_op(input_tensor).cpu().numpy()
+    input_ark_view = ark.Tensor.from_torch(input_tensor)
+    output = ark_op(input_ark_view)
+    runtime = ark.Runtime()
+    runtime.launch()
+    runtime.run()
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)
+
+
+# Test function to check if changes in torch tensors are reflected in ARK views
+@pytest.mark.parametrize("dtype, tensor_dims", [(torch.float16, (64, 64))])
+def test_torch_to_ark_aliasing(dtype, tensor_dims):
+    ark.init()
+    # Initialize a PyTorch tensor
+    input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+    other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0")
+
+    input_ark_view = ark.Tensor.from_torch(input_tensor)
+    other_ark_view = ark.Tensor.from_torch(other_tensor)
+
+    output = ark.add(input_ark_view, other_ark_view)
+    # Perform in place operations
+    input_tensor += other_tensor
+    other_tensor += input_tensor
+    expected_output = (input_tensor + other_tensor).cpu().numpy()
+
+    runtime = ark.Runtime()
+    runtime.launch()
+    runtime.run()
+    output_host = output.to_numpy()
+    runtime.stop()
+    runtime.reset()
+    assert np.allclose(output_host, expected_output)

From fe35541e02029b0d9a8da4cbdccf2565cbf516b0 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 3 Jul 2024 06:51:43 +0000
Subject: [PATCH 021/106] wip

---
 ark/api/executor.cpp                      |  22 +-
 ark/api/planner.cpp                       |   1 +
 ark/codegen.cpp                           |  16 +-
 ark/codegen.hpp                           |   3 +-
 ark/model/model_json.cpp                  |  14 +-
 ark/model_buffer_manager.hpp              |   5 +-
 cmake/Utils.cmake                         |   2 +-
 docs/plan_file.md                         |  18 +
 examples/llama/model_test.py              |   2 +-
 examples/tutorial/default_plan.json       | 115 +++---
 examples/tutorial/model.json              |  46 +--
 examples/tutorial/plan.json               |  63 ++--
 examples/tutorial/plan_1_larger_tile.json |  47 +--
 examples/tutorial/plan_2_split_k.json     |  63 ++--
 examples/tutorial/plan_3_overwrite.json   |  63 ++--
 examples/tutorial/plan_tutorial.py        |   4 +-
 plan_gpu0.json                            | 415 +++++++++++-----------
 python/ark/__init__.py                    |   3 +-
 python/ark/planner.py                     | 184 ++++++++++
 python/ark/profiler.py                    |  30 +-
 python/ark/runtime.py                     |  52 +--
 python/unittest/test_runtime.py           |  27 +-
 22 files changed, 686 insertions(+), 509 deletions(-)
 create mode 100644 python/ark/planner.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 0a780bcc0..20b162b16 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -228,6 +228,16 @@ void Executor::Impl::init(const std::string &plan) {
         plan_json_ = Json::parse(plan);
     }
 
+    auto gpu_manager = GpuManager::get_instance(gpu_id_);
+
+    if (!gpu_manager->info().arch->belongs_to(
+            Arch::from_name(plan_json_.at("Architecture")))) {
+        LOG(WARN, "Architecture name of the plan `",
+            plan_json_.at("Architecture").get<std::string>(),
+            "` is not compatible with the GPU architecture `",
+            gpu_manager->info().arch->name(), "`.");
+    
}
+
     buffer_id_to_offset_ = init_buffers(plan_json_);
 
     std::string buffer_id_to_offset_str;
@@ -236,17 +246,9 @@ void Executor::Impl::init(const std::string &plan) {
             std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", ";
     }
 
-    ModelBufferManager &buffer_manager = ModelBufferManager::get_instance();
+    codegen_ = std::make_shared<CodeGenerator>(plan_json_, buffer_id_to_offset_,
+                                               name_);
 
-    if (!buffer_manager.is_empty()) {
-        codegen_ = std::make_shared<CodeGenerator>(
-            plan_json_, buffer_id_to_offset_, name, &buffer_manager);
-    } else {
-        codegen_ = std::make_shared<CodeGenerator>(plan_json_,
-                                                   buffer_id_to_offset_, name);
-    }
-
-    auto gpu_manager = GpuManager::get_instance(gpu_id_);
     timer_begin_ = gpu_manager->create_event();
     timer_end_ = gpu_manager->create_event();
     buffer_ = gpu_manager->malloc(total_bytes_, 65536);
diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index 5c9d09f2e..14e1b7b41 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -119,6 +119,7 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
     Json plan;
     plan["Rank"] = model_.rank();
     plan["WorldSize"] = model_.world_size();
+    plan["Architecture"] = gpu_info.arch->name();
     plan["NumProcessors"] = max_num_processors;
     plan["NumWarpsPerProcessor"] = max_num_warps;
     plan["TaskInfos"] = task_infos;
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index a97e5e45b..55327329a 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -44,7 +44,7 @@ class CodeGenerator::Impl {
    public:
     Impl(const PlanJson &plan,
          const std::map<size_t, size_t> &buffer_id_to_offset,
-         const std::string &name, ModelBufferManager *buffer_manager);
+         const std::string &name);
     ~Impl() = default;
 
    private:
@@ -65,8 +65,6 @@ class CodeGenerator::Impl {
 
     std::string sync_process_range(const Range &ranges, int state_id);
 
-    ModelBufferManager *buffer_manager_;
-
   protected:
     friend class CodeGenerator;
 
@@ -81,11 +79,8 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
                           const std::map<size_t, size_t> &buffer_id_to_offset,
-                          const std::string &name,
-                          ModelBufferManager *buffer_manager)
-    : buffer_id_to_offset_(buffer_id_to_offset),
-      name_(name),
-      buffer_manager_(buffer_manager) {
+                          const std::string &name)
+    : buffer_id_to_offset_(buffer_id_to_offset), name_(name) {
     rank_ = plan.at("Rank");
     world_size_ = plan.at("WorldSize");
     num_procs_ = plan.at("NumProcessors");
@@ -446,9 +441,8 @@ std::string CodeGenerator::Impl::sync_process_range(const Range &range,
 CodeGenerator::CodeGenerator(
     const PlanJson &plan, const std::map<size_t, size_t> &buffer_id_to_offset,
-    const std::string &name, ModelBufferManager *buffer_manager)
-    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, name,
-                                   buffer_manager)) {}
+    const std::string &name)
+    : impl_(std::make_shared<Impl>(plan, buffer_id_to_offset, name)) {}
 
 std::string CodeGenerator::code() const { return impl_->code_; }
 
diff --git a/ark/codegen.hpp b/ark/codegen.hpp
index a2976e644..1ed8ec9f2 100644
--- a/ark/codegen.hpp
+++ b/ark/codegen.hpp
@@ -17,8 +17,7 @@ class CodeGenerator {
    public:
     CodeGenerator(const PlanJson &plan,
                   const std::map<size_t, size_t> &buffer_id_to_offset,
-                  const std::string &name = "ark_kernel",
-                  ModelBufferManager *buffer_manager = nullptr);
+                  const std::string &name = "ark_kernel");
 
     ~CodeGenerator() = default;
 
diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp
index 97ce71967..86eb843e2 100644
--- a/ark/model/model_json.cpp
+++ b/ark/model/model_json.cpp
@@ -250,9 +250,13 @@ static void verify_format_processor_group(const Json &json) {
 }
 
 static void verify_format_plan(const Json &json) {
-    const std::vector<std::string> required_fields = {
-        "Rank", 
"WorldSize", "NumProcessors", "NumWarpsPerProcessor", - "TaskInfos", "ProcessorGroups"}; + const std::vector required_fields = {"Rank", + "WorldSize", + "Architecture", + "NumProcessors", + "NumWarpsPerProcessor", + "TaskInfos", + "ProcessorGroups"}; for (const auto &field : required_fields) { if (!json.contains(field)) { ERR(NotFoundError, "PlanJson: " + field + " not found"); @@ -276,6 +280,7 @@ PlanJson::PlanJson(const Json &json) : Json((json != nullptr) ? json : Json{{"Rank", 0}, {"WorldSize", 1}, + {"Architecture", "ANY"}, {"NumProcessors", 1}, {"NumWarpsPerProcessor", 1}, {"TaskInfos", Json::array()}, @@ -292,6 +297,9 @@ static std::stringstream &dump_pretty_plan(const Json &json, dump_pretty_item(json.at("WorldSize"), "WorldSize", ss, indent + indent_step) << ",\n"; + dump_pretty_item(json.at("Architecture"), "Architecture", ss, + indent + indent_step) + << ",\n"; dump_pretty_item(json.at("NumProcessors"), "NumProcessors", ss, indent + indent_step) << ",\n"; diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp index 7b705f4c8..4baaec7fe 100644 --- a/ark/model_buffer_manager.hpp +++ b/ark/model_buffer_manager.hpp @@ -46,9 +46,8 @@ class ModelBufferManager { bool is_empty() const { return buffers_.empty(); } private: - std::unordered_map> - buffers_; // Maps buffer IDs to pointers and sizes. - size_t next_compact_id_ = 0; + // Maps buffer IDs to pointers and sizes. + std::unordered_map> buffers_; ModelBufferManager() {} ModelBufferManager(const ModelBufferManager&) = delete; ModelBufferManager& operator=(const ModelBufferManager&) = delete; diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 9bb83fb42..b1fd1b132 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -14,7 +14,7 @@ if(GIT_CLANG_FORMAT) COMMAND ${GIT_CLANG_FORMAT} --style=file --diff || true ) add_custom_target(cpplint-autofix - COMMAND ${GIT_CLANG_FORMAT} --style=file || true + COMMAND ${GIT_CLANG_FORMAT} --style=file --extensions cc,cpp,h,hpp,cu,in,hip || true ) else() message(STATUS "git-clang-format not found.") diff --git a/docs/plan_file.md b/docs/plan_file.md index 90a4537a2..c06ccc35d 100644 --- a/docs/plan_file.md +++ b/docs/plan_file.md @@ -6,6 +6,7 @@ See an example plan file: [Example 1](../examples/tutorial/default_plan.json) - Rank (Int) - WorldSize (Int) + - Architecture (String) - NumProcessors (Int) - NumWarpsPerProcessor (Int) - TaskInfos (Array of TaskInfo) @@ -42,6 +43,23 @@ See an example plan file: [Example 1](../examples/tutorial/default_plan.json) `ProcessorRange`, `WarpRange`, `SramRange`, and `TaskRange` are in the "range" format, i.e., `[Begin, End, Step]` that indicates an arithmetic integer sequence with a common difference of `Step`, starting from `Begin` and ends before `End` (does not include `End`). They alternatively can be in the format `[Begin, End]` that assumes `Step` is 1. +## Architecture + +A name that refers to the hardware architecture where the plan is supposed to run over. The following names are currently supported. + +- `ANY`: compatible with all architectures. + +- NVIDIA Family + - `CUDA`: compatible with all supported NVIDIA architectures. + - `CUDA_70`: compatible with NVIDIA Volta architecture. + - `CUDA_80`: compatible with NVIDIA Ampere architecture. + - `CUDA_90`: compatible with NVIDIA Hopper architecture. + +- AMD Family + - `ROCM`: compatible with all supported AMD architectures. + - `ROCM_90A`: compatible with AMD CDNA 2 (GFX90A) architecture. + - `ROCM_942`: compatible with AMD CDNA 3 (GFX942) architecture. 
+ ## TaskInfo A `TaskInfo` object describes a sequential set of operators. The followings describe each field of `TaskInfo`. diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 585341640..71485be45 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -59,7 +59,7 @@ def run_ark( output = module(*module_inputs) with ark.Runtime() as rt: - rt.launch(plan_path="/mnt/changhohwang/ark/plan_gpu0.json") + rt.launch(ark.Plan.from_file("/mnt/changhohwang/ark/plan_gpu0.json")) # Load model parameters if state_dict: diff --git a/examples/tutorial/default_plan.json b/examples/tutorial/default_plan.json index c6b4be243..bb774a5b8 100644 --- a/examples/tutorial/default_plan.json +++ b/examples/tutorial/default_plan.json @@ -1,36 +1,37 @@ { "Rank": 0, "WorldSize": 1, - "NumProcessors": 108, - "NumWarpsPerProcessor": 8, + "Architecture": "ROCM_942", + "NumProcessors": 304, + "NumWarpsPerProcessor": 4, "TaskInfos": [ { "Id": 0, - "NumWarps": 8, - "SramBytes": 147456, + "NumWarps": 4, + "SramBytes": 24672, "Ops": [ { "Type": "Matmul", "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, "TransposeOther": {"BOOL":true} }, "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], "NumTasks": 172 } } @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -95,31 +96,31 @@ }, { "Id": 3, - "NumWarps": 8, - "SramBytes": 147456, + "NumWarps": 4, + "SramBytes": 24672, "Ops": [ { "Type": "Matmul", "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, "TransposeOther": {"BOOL":true} }, "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], "NumTasks": 172 } } @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -156,31 +157,31 @@ }, { "Id": 5, - "NumWarps": 8, - "SramBytes": 147456, + "NumWarps": 4, + "SramBytes": 24672, "Ops": [ { "Type": "Matmul", "Name": "matmul_2", "IsVirtual": false, "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, "TransposeOther": {"BOOL":true} }, "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], "NumTasks": 64 } } @@ -189,12 +190,12 @@ ], "ProcessorGroups": [ { - "ProcessorRange": [0,108], + "ProcessorRange": [0,172], "ResourceGroups": [ { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], + "ProcessorRange": [0,172], + "WarpRange": [0,4], + "SramRange": [0,24672], "TaskGroups": [ {"TaskId":0,"TaskRange":[0,172],"Granularity":1} ] @@ -202,10 +203,10 @@ ] }, { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "ResourceGroups": [ { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "WarpRange": [0,1], "SramRange": [0,0], "TaskGroups": [ @@ -215,10 +216,10 @@ ] }, { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "ResourceGroups": [ { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "WarpRange": [0,1], "SramRange": [0,0], "TaskGroups": [ @@ -228,12 +229,12 @@ ] }, { - "ProcessorRange": [0,108], + "ProcessorRange": [0,172], "ResourceGroups": [ { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], + "ProcessorRange": [0,172], + "WarpRange": [0,4], + "SramRange": [0,24672], "TaskGroups": [ {"TaskId":3,"TaskRange":[0,172],"Granularity":1} ] @@ -241,10 +242,10 @@ ] }, { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "ResourceGroups": [ { - "ProcessorRange": [0,108], + "ProcessorRange": [0,304], "WarpRange": [0,1], "SramRange": [0,0], "TaskGroups": [ @@ -258,8 +259,8 @@ "ResourceGroups": [ { "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,147456], + "WarpRange": [0,4], + "SramRange": [0,24672], "TaskGroups": [ {"TaskId":5,"TaskRange":[0,64],"Granularity":1} ] @@ -267,4 +268,4 @@ ] } ] -} +} \ No newline at end of file diff --git a/examples/tutorial/model.json b/examples/tutorial/model.json index 1bc9233a5..a6ba8e8be 100644 --- a/examples/tutorial/model.json +++ b/examples/tutorial/model.json @@ -12,14 +12,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -31,13 +31,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {} }, @@ -46,14 +46,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {} } @@ -69,14 +69,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -95,14 +95,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {} }, @@ -111,14 +111,14 @@ "Name": "matmul_2", "IsVirtual": false, "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, diff --git a/examples/tutorial/plan.json b/examples/tutorial/plan.json index c0854e505..335c27549 100644 --- a/examples/tutorial/plan.json +++ b/examples/tutorial/plan.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "CUDA_80", "NumProcessors": 108, "NumWarpsPerProcessor": 8, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], 
"ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -103,14 +104,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - 
{"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -164,14 +165,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - 
{"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} + {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, + {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -196,14 +197,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} + {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, + {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} ], "WriteTensors": [ - {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -228,14 +229,14 @@ "Name": "add_1", "IsVirtual": false, "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + 
{"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": {}, "Config": { diff --git a/examples/tutorial/plan_1_larger_tile.json b/examples/tutorial/plan_1_larger_tile.json index 3a3f66530..04d2e9d60 100644 --- a/examples/tutorial/plan_1_larger_tile.json +++ b/examples/tutorial/plan_1_larger_tile.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "CUDA_80", "NumProcessors": 108, "NumWarpsPerProcessor": 8, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -103,14 +104,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - 
{"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -164,14 +165,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]} + {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + 
{"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, diff --git a/examples/tutorial/plan_2_split_k.json b/examples/tutorial/plan_2_split_k.json index 493515d8c..837944171 100644 --- a/examples/tutorial/plan_2_split_k.json +++ b/examples/tutorial/plan_2_split_k.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "CUDA_80", "NumProcessors": 108, "NumWarpsPerProcessor": 8, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - 
{"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -103,14 +104,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -164,14 +165,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} + {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, + {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -196,14 +197,14 @@ 
"Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} + {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, + {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} ], "WriteTensors": [ - {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -228,14 +229,14 @@ "Name": "add_1", "IsVirtual": false, "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": {}, "Config": { diff --git a/examples/tutorial/plan_3_overwrite.json b/examples/tutorial/plan_3_overwrite.json index c0854e505..335c27549 100644 --- a/examples/tutorial/plan_3_overwrite.json +++ b/examples/tutorial/plan_3_overwrite.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": 
"CUDA_80", "NumProcessors": 108, "NumWarpsPerProcessor": 8, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,13 +47,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -74,14 +75,14 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + 
{"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -103,14 +104,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} + {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -135,14 +136,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, + 
{"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} + {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} ], "Args": {}, "Config": { @@ -164,14 +165,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} + {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, + {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} ], "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -196,14 +197,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} + {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, + {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} ], "WriteTensors": [ - 
{"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -228,14 +229,14 @@ "Name": "add_1", "IsVirtual": false, "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, + {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} + {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} ], "Args": {}, "Config": { diff --git a/examples/tutorial/plan_tutorial.py b/examples/tutorial/plan_tutorial.py index 056523e15..989f29c5e 100644 --- a/examples/tutorial/plan_tutorial.py +++ b/examples/tutorial/plan_tutorial.py @@ -339,7 +339,7 @@ def main(plan_path: str): plan = planner.plan() with open("default_plan.json", "w") as f: - f.write(plan) + f.write(str(plan)) rt.launch(plan=plan) # Initialize @@ -364,7 +364,7 @@ def main(plan_path: str): print(f"File {plan_path} does not exist. 
Exiting...") return with ark.Runtime.get_runtime() as rt: - rt.launch(plan_path=plan_path) + rt.launch(plan=ark.Plan.from_file(plan_path)) # Initialize InputModule.initialize() diff --git a/plan_gpu0.json b/plan_gpu0.json index 49b6bdd98..63c1943e3 100644 --- a/plan_gpu0.json +++ b/plan_gpu0.json @@ -1,6 +1,7 @@ { "Rank": 0, "WorldSize": 1, + "Architecture": "ROCM_942", "NumProcessors": 304, "NumWarpsPerProcessor": 4, "TaskInfos": [ @@ -14,14 +15,14 @@ "Name": "matmul", "IsVirtual": false, "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -46,14 +47,14 @@ "Name": "rope", "IsVirtual": false, "ReadTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -75,13 +76,13 @@ "Name": "transpose", "IsVirtual": false, "ReadTensors": [ - {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -105,14 +106,14 @@ "Name": "matmul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -137,14 +138,14 @@ "Name": "rope_1", "IsVirtual": false, "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - 
{"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -166,13 +167,13 @@ "Name": "transpose_2", "IsVirtual": false, "ReadTensors": [ - {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,3,1]} @@ -196,14 +197,14 @@ "Name": "matmul_2", "IsVirtual": false, "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -228,13 +229,13 @@ "Name": "transpose_1", "IsVirtual": false, "ReadTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -258,14 +259,14 @@ "Name": "matmul_3", "IsVirtual": false, "ReadTensors": [ - {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -290,13 +291,13 @@ "Name": "mul", "IsVirtual": false, "ReadTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Factor": {"FLOAT":0.0883883461356163} @@ -320,13 +321,13 @@ "Name": "reduce_max", "IsVirtual": false, "ReadTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":3}, @@ -351,14 +352,14 @@ "Name": "sub", "IsVirtual": false, "ReadTensors": [ - 
{"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -380,13 +381,13 @@ "Name": "exp", "IsVirtual": false, "ReadTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -408,13 +409,13 @@ "Name": "reduce_sum", "IsVirtual": false, "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":3}, @@ -439,14 +440,14 @@ "Name": "div", "IsVirtual": false, "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -468,14 +469,14 @@ "Name": "matmul_4", "IsVirtual": false, "ReadTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -500,13 +501,13 @@ "Name": "transpose_3", "IsVirtual": false, "ReadTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -530,14 +531,14 @@ "Name": "matmul_5", "IsVirtual": false, "ReadTensors": [ - {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -562,13 +563,13 @@ "Name": "cast", "IsVirtual": false, "ReadTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -590,14 +591,14 @@ "Name": "mul_1", "IsVirtual": false, "ReadTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -619,13 +620,13 @@ "Name": "reduce_mean", "IsVirtual": false, "ReadTensors": [ - 
{"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":2}, @@ -650,13 +651,13 @@ "Name": "rsqrt", "IsVirtual": false, "ReadTensors": [ - {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -678,14 +679,14 @@ "Name": "mul_2", "IsVirtual": false, "ReadTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -707,14 +708,14 @@ "Name": "mul_3", "IsVirtual": false, "ReadTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -736,13 +737,13 @@ "Name": "cast_1", "IsVirtual": false, "ReadTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -764,14 
+765,14 @@ "Name": "matmul_6", "IsVirtual": false, "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -796,14 +797,14 @@ "Name": "rope_2", "IsVirtual": false, "ReadTensors": [ - {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -825,13 +826,13 @@ "Name": "transpose_4", "IsVirtual": false, "ReadTensors": [ - 
{"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -855,14 +856,14 @@ "Name": "matmul_7", "IsVirtual": false, "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -887,14 +888,14 @@ "Name": "rope_3", "IsVirtual": false, "ReadTensors": [ - {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -916,13 +917,13 @@ "Name": "transpose_6", "IsVirtual": false, "ReadTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,3,1]} @@ -946,14 +947,14 @@ "Name": "matmul_8", "IsVirtual": false, "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - 
{"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -978,13 +979,13 @@ "Name": "transpose_5", "IsVirtual": false, "ReadTensors": [ - {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -1008,14 +1009,14 @@ "Name": "matmul_9", "IsVirtual": false, "ReadTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1040,13 +1041,13 @@ "Name": "mul_4", "IsVirtual": false, "ReadTensors": [ - {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Factor": {"FLOAT":0.0883883461356163} @@ -1070,13 +1071,13 @@ "Name": "reduce_max_1", "IsVirtual": false, "ReadTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":3}, @@ -1101,14 +1102,14 @@ "Name": "sub_1", "IsVirtual": false, "ReadTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - 
{"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1130,13 +1131,13 @@ "Name": "exp_1", "IsVirtual": false, "ReadTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1158,13 +1159,13 @@ "Name": "reduce_sum_1", "IsVirtual": false, "ReadTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":3}, @@ -1189,14 +1190,14 @@ "Name": "div_1", "IsVirtual": false, "ReadTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1218,14 +1219,14 @@ "Name": "matmul_10", "IsVirtual": false, "ReadTensors": [ - {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1250,13 +1251,13 @@ "Name": "transpose_7", "IsVirtual": false, "ReadTensors": [ - {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Permutation": {"DIMS":[0,2,1,3]} @@ -1280,14 +1281,14 @@ "Name": "matmul_11", "IsVirtual": false, "ReadTensors": [ - {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1312,14 +1313,14 @@ "Name": "add", "IsVirtual": false, "ReadTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1341,13 +1342,13 @@ "Name": "cast_2", "IsVirtual": false, "ReadTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1369,14 +1370,14 @@ "Name": "mul_5", "IsVirtual": false, "ReadTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - 
{"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1398,13 +1399,13 @@ "Name": "reduce_mean_1", "IsVirtual": false, "ReadTensors": [ - {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "Axis": {"INT":2}, @@ -1429,13 +1430,13 @@ "Name": "rsqrt_1", "IsVirtual": false, "ReadTensors": [ - {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - 
{"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1457,14 +1458,14 @@ "Name": "mul_6", "IsVirtual": false, "ReadTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1486,14 +1487,14 @@ "Name": "mul_7", "IsVirtual": false, "ReadTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1515,13 +1516,13 @@ "Name": "cast_3", "IsVirtual": false, "ReadTensors": [ - {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1543,14 +1544,14 @@ "Name": "matmul_12", "IsVirtual": false, "ReadTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1575,13 +1576,13 @@ "Name": "sigmoid", "IsVirtual": false, "ReadTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}} + 
{"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1603,14 +1604,14 @@ "Name": "mul_8", "IsVirtual": false, "ReadTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1632,14 +1633,14 @@ "Name": "matmul_13", "IsVirtual": false, "ReadTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1664,14 +1665,14 @@ "Name": "mul_9", "IsVirtual": false, "ReadTensors": [ - {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { @@ -1693,14 +1694,14 @@ "Name": "matmul_14", "IsVirtual": false, "ReadTensors": [ - {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { "TransposeInput": {"BOOL":false}, @@ -1725,14 +1726,14 @@ "Name": "add_1", "IsVirtual": false, "ReadTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[]}} + {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": {}, "Config": { diff --git a/python/ark/__init__.py b/python/ark/__init__.py index f2f604be9..e96972906 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -37,7 +37,7 @@ def set_world_size(world_size): from .init import init from .tensor import Dims, Tensor, Parameter from .module import Module, RuntimeModule -from .runtime import Runtime, DefaultPlanner +from .runtime import Runtime from .serialize import save, load from .data_type import ( DataType, @@ -100,4 +100,5 @@ def set_world_size(world_size): GpuError, RuntimeError, ) +from .planner import DefaultPlanner, Plan from .profiler import Profiler diff --git a/python/ark/planner.py b/python/ark/planner.py new file mode 100644 index 000000000..8814896d2 --- /dev/null +++ b/python/ark/planner.py @@ -0,0 +1,184 @@ +# Copyright (c) Microsoft Corporation. 
+# Licensed under the MIT license. + +import copy +import json +from typing import Callable, Dict, List, Any + +from ._ark_core import _DefaultPlanner +from .model import Model + + +def idnt(indent): + return " " * indent + + +def dquote(s): + return '"' + s + '"' + + +def denser_json_obj(obj, key, level, indent, indent_step, ret=""): + if len(obj) == 0: + if key: + return ret + idnt(indent) + dquote(key) + ": {}" + else: + return ret + idnt(indent) + "{}" + ret += idnt(indent) + if key: + ret += dquote(key) + ": {\n" + else: + ret += "{\n" + num_item = len(obj) + for k, v in obj.items(): + is_obj_or_arr = isinstance(v, dict) or isinstance(v, list) + is_num_arr = isinstance(v, list) and v and isinstance(v[0], int) + if level <= 0 or not is_obj_or_arr or is_num_arr: + ret += ( + idnt(indent + indent_step) + + dquote(k) + + ": " + + json.dumps(v, separators=(",", ":")) + ) + elif isinstance(v, dict): + ret += denser_json_obj( + v, k, level - 1, indent + indent_step, indent_step + ) + elif isinstance(v, list): + ret += denser_json_arr( + v, k, level - 1, indent + indent_step, indent_step + ) + num_item -= 1 + if num_item > 0: + ret += ",\n" + else: + ret += "\n" + ret += idnt(indent) + "}" + return ret + + +def denser_json_arr(obj, key, level, indent, indent_step, ret=""): + if len(obj) == 0: + if key: + return ret + idnt(indent) + dquote(key) + ": []" + else: + return ret + idnt(indent) + "[]" + ret += idnt(indent) + if key: + ret += dquote(key) + ": [\n" + else: + ret += "[\n" + num_item = len(obj) + for v in obj: + is_obj_or_arr = isinstance(v, dict) or isinstance(v, list) + is_num_arr = ( + isinstance(v, list) + and v + and (isinstance(v[0], int) or isinstance(v[0], float)) + ) + if level <= 0 or not is_obj_or_arr or is_num_arr: + ret += idnt(indent + indent_step) + json.dumps( + v, separators=(",", ":") + ) + elif isinstance(v, dict): + ret += denser_json_obj( + v, "", level - 1, indent + indent_step, indent_step + ) + elif isinstance(v, list): + ret += denser_json_arr( + v, "", level - 1, indent + indent_step, indent_step + ) + num_item -= 1 + if num_item > 0: + ret += ",\n" + else: + ret += "\n" + ret += idnt(indent) + "]" + return ret + + +def denser_json(obj, level, indent_step=2): + if isinstance(obj, dict): + return denser_json_obj(obj, "", level, 0, indent_step, "") + elif isinstance(obj, list): + return denser_json_arr(obj, "", level, 0, indent_step, "") + return json.dumps(obj, indent=indent_step) + + +class Plan: + def __init__(self, plan: Dict[str, Any]): + if plan is None: + plan = {} + plan["Rank"] = 0 + plan["WorldSize"] = 1 + plan["Architecture"] = "ANY" + plan["NumProcessors"] = 1 + plan["NumWarpsPerProcessor"] = 1 + plan["TaskInfos"] = [] + plan["ProcessorGroups"] = [] + else: + plan = copy.deepcopy(plan) + self.plan = plan + + def __str__(self) -> str: + return denser_json(self.plan, 5) + + @property + def rank(self) -> int: + return self.plan["Rank"] + + @property + def world_size(self) -> int: + return self.plan["WorldSize"] + + @property + def architecture(self) -> str: + return self.plan["Architecture"] + + @property + def num_processors(self) -> int: + return self.plan["NumProcessors"] + + @property + def num_warps_per_processor(self) -> int: + return self.plan["NumWarpsPerProcessor"] + + @property + def task_infos(self) -> List[Dict[str, Any]]: + return self.plan["TaskInfos"] + + @property + def processor_groups(self) -> List[Dict[str, Any]]: + return self.plan["ProcessorGroups"] + + @staticmethod + def from_str(plan_str: str) -> "Plan": + plan = 
json.loads(plan_str) + return Plan(plan) + + @staticmethod + def from_file(file_path: str) -> "Plan": + with open(file_path, "r") as f: + plan = json.load(f) + return Plan(plan) + + +class DefaultPlanner(_DefaultPlanner): + def __init__(self, device_id: int = 0): + compressed = Model.get_model().compress() + super().__init__(compressed, device_id) + + def install_config_rule(self, rule: Callable[[str, str], str]): + """ + Install a configuration rule. + + Args: + rule: A function that takes an operator description and a target + architecture name and returns a configuration description. + """ + super().install_config_rule(rule) + + def plan(self) -> Plan: + """ + Generate an execution plan. + """ + return Plan.from_str(super().plan(pretty=False)) diff --git a/python/ark/profiler.py b/python/ark/profiler.py index b959ceb18..feb78e0de 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -1,30 +1,36 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import json import sys import time + from .runtime import Runtime +from .planner import Plan class Profiler: - def __init__(self, plan: str): - self.plan = json.loads(plan) + def __init__(self, plan: Plan): + self.plan = plan def run(self): - num_processor_groups = len(self.plan["ProcessorGroups"]) + num_processor_groups = len(self.plan.processor_groups) new_plan = { - "Rank": self.plan["Rank"], "WorldSize": self.plan["WorldSize"], - "NumProcessors": self.plan["NumProcessors"], - "NumWarpsPerProcessor": self.plan["NumWarpsPerProcessor"], - "TaskInfos": self.plan["TaskInfos"], - "ProcessorGroups": [{}]} + "Rank": self.plan.rank, + "WorldSize": self.plan.world_size, + "Architecture": self.plan.architecture, + "NumProcessors": self.plan.num_processors, + "NumWarpsPerProcessor": self.plan.num_warps_per_processor, + "TaskInfos": self.plan.task_infos, + "ProcessorGroups": [None], + } for i in range(num_processor_groups): - new_plan["ProcessorGroups"][0] = self.plan["ProcessorGroups"][i] + new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] with Runtime() as rt: - rt.launch(plan=json.dumps(new_plan)) + rt.launch(plan=str(new_plan)) start_time = time.time() iter = 1000 rt.run(iter=iter) end_time = time.time() - sys.stderr.write(f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n") + sys.stderr.write( + f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n" + ) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index efae6ab3c..40bfaaa63 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -3,10 +3,11 @@ import logging from enum import Enum -from typing import Callable, Dict, List +from typing import Dict, List -from ._ark_core import _Executor, _DefaultPlanner +from ._ark_core import _Executor from .model import Model +from .planner import DefaultPlanner, Plan class _RuntimeState: @@ -46,33 +47,9 @@ def print_runtime_states(): print(f"{runtime_id:<12} | {runtime.state:<20}") -class DefaultPlanner(_DefaultPlanner): - def __init__(self, gpu_id: int = 0): - compressed = Model.get_model().compress() - super().__init__(compressed, gpu_id) - - def install_config_rule(self, rule: Callable[[str, str], str]): - """ - Install a configuration rule. - - Args: - rule: A function that takes an operator description and a target - architecture name and returns a configuration description. - """ - super().install_config_rule(rule) - - def plan(self, pretty: bool = True) -> str: - """ - Generate an execution plan. 
- - Args: - pretty: Whether to generate a pretty plan. - """ - return super().plan(pretty) - - class Executor(_Executor): - pass + def __init__(self, plan: Plan, device_id: int, name: str): + super().__init__(plan.rank, plan.world_size, device_id, name, str(plan)) class Runtime: @@ -155,11 +132,8 @@ def running(self) -> bool: def launch( self, - rank: int = 0, - world_size: int = 1, - gpu_id: int = 0, - plan: str = "", - plan_path: str = "", + plan: Plan = None, + device_id: int = 0, ): """ Create an executor and schedule the ARK model. The scheduler will generate @@ -172,11 +146,7 @@ def launch( ) return if not plan: - if not plan_path: - plan = DefaultPlanner(gpu_id).plan() - else: - with open(plan_path, "r") as f: - plan = f.read() + plan = DefaultPlanner(device_id).plan() # If the RuntimeState is init, we need to create a new executor and # compile the kernels if self.state == Runtime.State.Init: @@ -187,11 +157,9 @@ def launch( ) self.executor.destroy() self.executor = Executor( - rank, - world_size, - gpu_id, - "ArkRuntime", plan, + device_id, + "ArkRuntime", ) self.executor.compile() self.executor.launch() diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index fd34bb96b..b075c64ea 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -2,18 +2,9 @@ # Licensed under the MIT license. import ark -import json -empty_plan = json.dumps( - { - "Rank": 0, - "WorldSize": 1, - "NumProcessors": 1, - "NumWarpsPerProcessor": 1, - "TaskInfos": [], - "ProcessorGroups": [], - } -) + +empty_plan = ark.Plan(None) def test_runtime_relaunch(): @@ -35,7 +26,7 @@ def test_multiple_runtime_launch(): for i in range(num_runtimes): rt = ark.Runtime.get_runtime(i) assert rt.launched() == False - rt.launch(gpu_id=i, plan=empty_plan) + rt.launch(plan=empty_plan, device_id=i) assert rt.launched() == True for i in range(num_runtimes): rt = ark.Runtime.get_runtime(i) @@ -46,9 +37,9 @@ def test_multiple_runtime_launch(): def test_stop_runtime(): ark.init() rt1 = ark.Runtime.get_runtime(1) - rt1.launch(plan=empty_plan, gpu_id=1) + rt1.launch(plan=empty_plan, device_id=1) rt2 = ark.Runtime.get_runtime(2) - rt2.launch(plan=empty_plan, gpu_id=2) + rt2.launch(plan=empty_plan, device_id=2) rt1.stop() rt1.reset() assert rt1.state == ark.Runtime.State.Init @@ -59,9 +50,9 @@ def test_stop_runtime(): def test_reset_runtime(): ark.init() rt1 = ark.Runtime.get_runtime(0) - rt1.launch(plan=empty_plan, gpu_id=1) + rt1.launch(plan=empty_plan, device_id=1) rt2 = ark.Runtime.get_runtime(1) - rt2.launch(plan=empty_plan, gpu_id=2) + rt2.launch(plan=empty_plan, device_id=2) rt1.reset() assert rt1.launched() == False assert rt2.launched() == True @@ -77,7 +68,7 @@ def test_multiple_runtimes_complex(): default_runtime = ark.Runtime.get_runtime() runtime_list.append(default_runtime) for i, rt in enumerate(runtime_list): - rt.launch(plan=empty_plan, gpu_id=i) + rt.launch(plan=empty_plan, device_id=i) assert rt.launched() == True runtime_list[0].stop() assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning @@ -87,7 +78,7 @@ def test_multiple_runtimes_complex(): assert runtime_list[1].state == ark.Runtime.State.Init assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning - runtime_list[1].launch(plan=empty_plan, gpu_id=1) + runtime_list[1].launch(plan=empty_plan, device_id=1) for rt in runtime_list: assert rt.launched() == True ark.Runtime.delete_all_runtimes() From 
0cb10b92c601306d537eb3de6259cf73e59b33df Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 3 Jul 2024 07:58:34 +0000
Subject: [PATCH 022/106] fix a reduction perf bug

---
 ark/include/kernels/reduce.h | 18 +++++++++---------
 plan_gpu0.json               | 36 ++++++++++++++++++------------------
 python/ark/profiler.py       | 24 +++++++++++++++---------
 3 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h
index 30c8b7831..3d0b4e008 100644
--- a/ark/include/kernels/reduce.h
+++ b/ark/include/kernels/reduce.h
@@ -53,7 +53,7 @@ DEVICE bf16 warpReduce(bf16 val) {
 template <typename UnitOp, int LanesNum, typename DataType>
 DEVICE DataType warpsReduce(DataType val, int tid, int smem_per_warp) {
     val = warpReduce<LanesNum>(val);
-    if (LanesNum > Arch::ThreadsPerWarp) {
+    if constexpr (LanesNum > Arch::ThreadsPerWarp) {
         ReduceSharedStorage<DataType> *shared =
             UnitOp::template shared_memory<ReduceSharedStorage<DataType>>(
                 smem_per_warp);
@@ -351,8 +351,8 @@ struct WwiseReduce {
     /// @param in Input tensor.
     /// @param uop_idx Index of the unit operator.
     template
-    static DEVICE void runW(DataType *out, DataType *in, int uop_idx,
-                            int smem_per_warp) {
+    static DEVICE void run(DataType *out, DataType *in, int uop_idx,
+                           int smem_per_warp) {
         using ShapeChecker = ReduceShapeChecker;
         constexpr int NelemPerThread =
@@ -450,8 +450,8 @@ template ::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeSum, Axis>::run(out, in, uop_idx,
+                                                     smem_per_warp);
 }
 template ::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeMean, Axis>::run(out, in, uop_idx,
+                                                      smem_per_warp);
 }
 template ::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeMax, Axis>::run(out, in, uop_idx,
+                                                     smem_per_warp);
 }
 } // namespace ark
diff --git a/plan_gpu0.json b/plan_gpu0.json
index 63c1943e3..99e2da8fa 100644
--- a/plan_gpu0.json
+++ b/plan_gpu0.json
@@ -314,7 +314,7 @@
     {
       "Id": 10,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMax",
@@ -336,7 +336,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
         }
@@ -402,7 +402,7 @@
     {
      "Id": 13,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceSum",
@@ -424,7 +424,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
         }
@@ -613,7 +613,7 @@
     {
       "Id": 20,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMean",
@@ -635,7 +635,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 2048
           }
         }
@@ -1064,7 +1064,7 @@
     {
       "Id": 35,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMax",
@@ -1086,7 +1086,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
         }
@@ -1152,7 +1152,7 @@
     {
       "Id": 38,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceSum",
@@ -1174,7 +1174,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 65536
           }
         }
@@ -1392,7 +1392,7 @@
     {
       "Id": 46,
       "NumWarps": 1,
-      "SramBytes": 256,
+      "SramBytes": 0,
       "Ops": [
         {
           "Type": "ReduceMean",
@@ -1414,7 +1414,7 @@
           "Config": {
             "NumWarps": 1,
             "ImplType": "WarpWise",
-            "SramBytes": 256,
+            "SramBytes": 0,
             "NumTasks": 2048
           }
         }
@@ -1883,7 +1883,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":10,"TaskRange":[0,65536],"Granularity":1}
           ]
@@ -1922,7 +1922,7 @@
         {
           "ProcessorRange": 
[0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":13,"TaskRange":[0,65536],"Granularity":1}
           ]
@@ -2013,7 +2013,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":20,"TaskRange":[0,2048],"Granularity":1}
           ]
@@ -2208,7 +2208,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":35,"TaskRange":[0,65536],"Granularity":1}
           ]
@@ -2247,7 +2247,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":38,"TaskRange":[0,65536],"Granularity":1}
           ]
@@ -2351,7 +2351,7 @@
         {
           "ProcessorRange": [0,304],
           "WarpRange": [0,4],
-          "SramRange": [0,256],
+          "SramRange": [0,0],
           "TaskGroups": [
             {"TaskId":46,"TaskRange":[0,2048],"Granularity":1}
           ]
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
index feb78e0de..529a0d506 100644
--- a/python/ark/profiler.py
+++ b/python/ark/profiler.py
@@ -8,11 +8,22 @@
 from .planner import Plan
 
 
+def timeit(plan: Plan):
+    with Runtime() as rt:
+        rt.launch(plan=plan)
+        start_time = time.time()
+        iter = 1000
+        rt.run(iter=iter)
+        end_time = time.time()
+        return (end_time - start_time) / iter
+
+
 class Profiler:
     def __init__(self, plan: Plan):
         self.plan = plan
 
     def run(self):
+        sys.stderr.write(f"End-to-end: {timeit(self.plan):.6f} seconds/iter\n")
         num_processor_groups = len(self.plan.processor_groups)
         new_plan = {
             "Rank": self.plan.rank,
@@ -25,12 +36,7 @@ def run(self):
         for i in range(num_processor_groups):
             new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i]
-            with Runtime() as rt:
-                rt.launch(plan=str(new_plan))
-                start_time = time.time()
-                iter = 1000
-                rt.run(iter=iter)
-                end_time = time.time()
-                sys.stderr.write(
-                    f"Processor group {i} runtime: {(end_time - start_time)/iter:.6f} seconds/iter\n"
-                )
+            lat_per_iter = timeit(Plan(new_plan))
+            sys.stderr.write(
+                f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n"
+            )

From 0fde9c5dc486ba1edb20235115575d360558ece9 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Thu, 4 Jul 2024 07:17:32 +0000
Subject: [PATCH 023/106] optimize

---
 ark/include/kernels/common/sync.h | 12 +--
 ark/ops/ops_broadcast.cpp        |   4 +-
 examples/llama/model_test.py     |   2 +-
 plan_gpu0.json                   | 172 ++++++++----------------
 4 files changed, 51 insertions(+), 139 deletions(-)

diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h
index 85f7639c9..f47625600 100644
--- a/ark/include/kernels/common/sync.h
+++ b/ark/include/kernels/common/sync.h
@@ -106,25 +106,19 @@ DEVICE void sync_warps() {
     static_assert(Arch::ThreadsPerWarp == 64, "");
     if constexpr (NumWarps == 1) {
         __builtin_amdgcn_wave_barrier();
-    } else if constexpr (NumWarps == 16) {
-        __syncthreads();
     } else {
         static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState),
                       "");
-        int lane_id = threadIdx.x & 63;
-        if (lane_id == 0) {
+        if ((threadIdx.x & 63) == 0) {
             constexpr int MaxOldCnt = NumWarps - 1;
-            int warp_id = threadIdx.x >> 6;
-            int group_id = warp_id / NumWarps;
+            int group_id = (threadIdx.x >> 6) / NumWarps;
             sync::WarpGroupState *state =
                 reinterpret_cast<sync::WarpGroupState *>(_ARK_SMEM);
             unsigned int tmp = state->is_inc_flag[group_id] ^ 1;
             if (atomicInc(&state->cnt[group_id], MaxOldCnt) == MaxOldCnt) {
                 state->flag[group_id] = tmp;
             } else {
-                while (atomicAdd(&state->flag[group_id], 0) != tmp)
-                    __builtin_amdgcn_s_sleep(1);
-                __asm__ __volatile__("s_wakeup");
+                while 
(atomicAdd(&state->flag[group_id], 0) != tmp); } state->is_inc_flag[group_id] = tmp; } diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp index 3985a0500..f20e8c4dc 100644 --- a/ark/ops/ops_broadcast.cpp +++ b/ark/ops/ops_broadcast.cpp @@ -27,8 +27,8 @@ ModelOpBroadcast1::ModelOpBroadcast1(const std::string &type_name, std::string ModelOpBroadcast1::impl_name(const Json &config) const { check_fields_config(config, {"NumWarps", "Tile"}); int num_warps = config.at("NumWarps"); - auto &tile_shape = config.at("Tile"); - Dims unit_out_dims{tile_shape[0], tile_shape[1]}; + const auto& tile_shape = config.at("Tile").get>(); + Dims unit_out_dims(tile_shape); return function_name_string( pascal_to_snake(type()->type_name()), diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 71485be45..053015c04 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -473,7 +473,7 @@ def test_transformer_block( module_name_prefix="layers.0", rank=rank, world_size=world_size, - test_thru=True, + test_thru=False, ) diff --git a/plan_gpu0.json b/plan_gpu0.json index 99e2da8fa..cad05f774 100644 --- a/plan_gpu0.json +++ b/plan_gpu0.json @@ -31,7 +31,7 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } } @@ -39,7 +39,7 @@ }, { "Id": 1, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -58,17 +58,17 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [32,128], - "NumTasks": 2048 + "Tile": [256,1,128], + "NumTasks": 256 } } ] }, { "Id": 2, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -88,10 +88,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -122,7 +122,7 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } } @@ -130,7 +130,7 @@ }, { "Id": 4, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -149,17 +149,17 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [32,128], - "NumTasks": 2048 + "Tile": [256,1,128], + "NumTasks": 256 } } ] }, { "Id": 5, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -170,19 +170,19 @@ {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":23,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { - "Permutation": {"DIMS":[0,2,3,1]} + 
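The rewritten wait loop is a sense-reversing barrier: the last warp of the group to arrive (the one whose `atomicInc` wraps the counter) publishes the new parity in `flag`, every other warp spins on `atomicAdd(&flag, 0)` until it observes the flip, and `is_inc_flag` alternates the expected parity so back-to-back barriers cannot alias. The `NumWarps == 16` fast path through `__syncthreads()` also goes away, so every multi-warp group now uses the same flag protocol, and dropping the `s_sleep`/`s_wakeup` pair presumably trades a little power for lower wake-up latency. A host-side Python analogue of the same protocol, purely illustrative, with threads standing in for warps and a lock standing in for the atomics:

    import threading

    class SenseBarrier:
        """Counter plus parity flag, loosely mirroring WarpGroupState."""

        def __init__(self, num_threads):
            self.num_threads = num_threads
            self.count = 0       # like state->cnt[group_id]
            self.flag = 0        # like state->flag[group_id]
            self.sense = 0       # like state->is_inc_flag[group_id]
            self._lock = threading.Lock()

        def wait(self):
            target = self.sense ^ 1           # tmp = is_inc_flag ^ 1
            with self._lock:                  # stands in for atomicInc
                self.count += 1
                last = self.count == self.num_threads
                if last:
                    self.count = 0
                    self.flag = target        # release the group
            if not last:
                while self.flag != target:    # spin, like atomicAdd(&flag, 0)
                    pass
            self.sense = target               # flip parity for the next round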
"Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -213,7 +213,7 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } } @@ -221,7 +221,7 @@ }, { "Id": 7, - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, "Ops": [ { @@ -241,10 +241,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -260,7 +260,7 @@ "IsVirtual": false, "ReadTensors": [ {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":24,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} @@ -270,12 +270,12 @@ ], "Args": { "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} + "TransposeOther": {"BOOL":true} }, "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 4096 } } @@ -305,7 +305,7 @@ "Config": { "NumWarps": 4, "SramBytes": 0, - "Tile": [128,256], + "Tile": [256,128], "NumTasks": 4096 } } @@ -1747,119 +1747,36 @@ } ], "ProcessorGroups": [ - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,304], "ResourceGroups": [ { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,2048],"Granularity":4} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,8192],"Granularity":4} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], + "ProcessorRange": [0,86], "WarpRange": [0,4], "SramRange": [0,24672], "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":0,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":1,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":2,"TaskRange":[0,256],"Granularity":1} ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ + }, { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,8192],"Granularity":4} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": 
[0,256], + "ProcessorRange": [86,172], "WarpRange": [0,4], "SramRange": [0,24672], "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,256],"Granularity":1} + {"TaskId":3,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":4,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":5,"TaskRange":[0,256],"Granularity":1} ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":7,"TaskRange":[0,8192],"Granularity":4} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ + }, { - "ProcessorRange": [0,304], + "ProcessorRange": [172,258], "WarpRange": [0,4], "SramRange": [0,24672], "TaskGroups": [ - {"TaskId":8,"TaskRange":[0,4096],"Granularity":1} + {"TaskId":6,"TaskRange":[0,256],"Granularity":1}, + {"TaskId":7,"TaskRange":[0,256],"Granularity":1} ] } ] @@ -1870,8 +1787,9 @@ { "ProcessorRange": [0,304], "WarpRange": [0,4], - "SramRange": [0,0], + "SramRange": [0,24672], "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,4096],"Granularity":1}, {"TaskId":9,"TaskRange":[0,4096],"Granularity":1} ] } From c4be6d1bf7b7fcacdd11dd3efad7b4170461ce41 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 5 Jul 2024 00:14:05 +0000 Subject: [PATCH 024/106] wip --- ark/codegen.cpp | 6 +- arkprof.py | 4 + examples/llama/model_test.py | 23 +- examples/llama/plan_llama2_7b_b1_s2048.json | 1723 +++++++++++++++++++ python/ark/profiler.py | 12 +- 5 files changed, 1751 insertions(+), 17 deletions(-) create mode 100644 arkprof.py create mode 100644 examples/llama/plan_llama2_7b_b1_s2048.json diff --git a/ark/codegen.cpp b/ark/codegen.cpp index 55327329a..587bcae59 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -298,10 +298,14 @@ std::string CodeGenerator::Impl::resource_group( size_t proc_b = *rg_proc_range.begin(); size_t proc_e = *rg_proc_range.end(); size_t proc_s = rg_proc_range.step(); + std::map task_infos_map; + for (auto &task_info : task_infos) { + task_infos_map[task_info.at("Id").get()] = task_info; + } std::stringstream ss; for (auto &tg : rg_json["TaskGroups"]) { size_t task_id = tg["TaskId"]; - auto &task_info = task_infos[task_id]; + auto &task_info = task_infos_map.at(task_id); Range task_range(tg["TaskRange"][0], tg["TaskRange"][1]); size_t task_gran = tg["Granularity"]; size_t num_warps_per_task = task_info["NumWarps"]; diff --git a/arkprof.py b/arkprof.py new file mode 100644 index 000000000..782bba560 --- /dev/null +++ b/arkprof.py @@ -0,0 +1,4 @@ +import ark +import sys + +ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(iter=1000, profile_processor_groups=False) diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 053015c04..19c680854 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -59,7 +59,8 @@ def run_ark( output = module(*module_inputs) with ark.Runtime() as rt: - rt.launch(ark.Plan.from_file("/mnt/changhohwang/ark/plan_gpu0.json")) + plan = ark.Plan.from_file("plan_llama2_7b_b1_s2048.json") + rt.launch(plan) # Load model parameters if state_dict: @@ -438,22 +439,22 @@ def test_transformer_block( low=-1, high=1, size=(batch_size, seq_len, args.dim) ).astype(dtype) - module = model_ark.Attention( - args, ark.DataType.from_numpy(dtype), rank, world_size - ) + # module = model_ark.Attention( + # args, ark.DataType.from_numpy(dtype), rank, world_size + # ) # module_inputs = [ # ark.tensor(list(i.shape), ark.DataType.from_numpy(i.dtype)) # if isinstance(i, np.ndarray) # else i # for i in inputs # ] - 
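The codegen change matters because of plans like the one above: once several task groups share a resource group, a group's `TaskId` values are no longer guaranteed to be dense or to match their position in `TaskInfos`, so indexing the array by task id can read the wrong task or run past the end. Building an `Id`-keyed map first makes the lookup robust, which `arkprof.py` then exercises against arbitrary plan files. The same lookup in Python terms, assuming the plan schema used throughout this series:

    import json

    with open("plan_gpu0.json") as f:
        plan = json.load(f)

    # Mirror of the C++ change: key task infos by "Id" rather than
    # assuming TaskId == list index.
    task_infos = {t["Id"]: t for t in plan["TaskInfos"]}

    for pg in plan["ProcessorGroups"]:
        for rg in pg["ResourceGroups"]:
            for tg in rg["TaskGroups"]:
                info = task_infos[tg["TaskId"]]       # safe for sparse IDs
                first, last = tg["TaskRange"]
                print(tg["TaskId"], info["NumWarps"], last - first, "tasks")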
feature_tensor = ark.tensor( - list(feature.shape), ark.DataType.from_numpy(feature.dtype) - ) - freqs_cis_ark_tensor = ark.tensor( - list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype) - ) - output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) + # feature_tensor = ark.tensor( + # list(feature.shape), ark.DataType.from_numpy(feature.dtype) + # ) + # freqs_cis_ark_tensor = ark.tensor( + # list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype) + # ) + # output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) # print(ark.Model.get_model().serialize()) diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json new file mode 100644 index 000000000..d0e46d228 --- /dev/null +++ b/examples/llama/plan_llama2_7b_b1_s2048.json @@ -0,0 +1,1723 @@ +{ + "Rank": 0, + "WorldSize": 1, + "Architecture": "ROCM_942", + "NumProcessors": 304, + "NumWarpsPerProcessor": 4, + "TaskInfos": [ + { + "Id": 0, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast", + "IsVirtual": false, + "ReadTensors": [ + {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":11,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 1, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul", + "IsVirtual": false, + "ReadTensors": [ + {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":13,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 2, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMean", + "Name": "reduce_mean", + "IsVirtual": false, + "ReadTensors": [ + {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + 
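For reference, the test now consumes the checked-in schedule directly instead of re-planning on every run; `run_ark` above launches it verbatim. A minimal sketch of that flow, with the path assumed relative to examples/llama/:

    import ark

    # Launch the hand-tuned Llama-2-7B (batch 1, seq 2048) plan as-is.
    with ark.Runtime() as rt:
        rt.launch(ark.Plan.from_file("plan_llama2_7b_b1_s2048.json"))
        rt.run(iter=1)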
{"Id":15,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Axis": {"INT":2}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 3, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rsqrt", + "Name": "rsqrt", + "IsVirtual": false, + "ReadTensors": [ + {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":17,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64,1], + "NumTasks": 32 + } + } + ] + }, + { + "Id": 4, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":19,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 5, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":7,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + 
"Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 6, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":22,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 7, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul", + "IsVirtual": false, + "ReadTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":24,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":25,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 8, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope", + "IsVirtual": false, + "ReadTensors": [ + {"Id":30,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":33,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 9, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 10, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":26,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":27,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 11, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rope", + "Name": "rope_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":31,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":35,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 12, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": 
[ + {"Id":41,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,3,1]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 13, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":28,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":29,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 14, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":32,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":39,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 15, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + 
{"Id":43,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 4096 + } + } + ] + }, + { + "Id": 16, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "ScalarMul", + "Name": "mul_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":45,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Factor": {"FLOAT":0.0883883461356163} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 2097152 + } + } + ] + }, + { + "Id": 17, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMax", + "Name": "reduce_max", + "IsVirtual": false, + "ReadTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":47,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 18, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sub", + "Name": "sub", + "IsVirtual": false, + "ReadTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + 
{"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 2097152 + } + } + ] + }, + { + "Id": 19, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Exp", + "Name": "exp", + "IsVirtual": false, + "ReadTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 2097152 + } + } + ] + }, + { + "Id": 20, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceSum", + "Name": "reduce_sum", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":51,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Axis": {"INT":3}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 65536 + } + } + ] + }, + { + "Id": 21, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Div", + "Name": "div", + "IsVirtual": false, + "ReadTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 2097152 + } + } + ] + }, + { + "Id": 22, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_4", + "IsVirtual": false, 
+ "ReadTensors": [ + {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":54,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":false} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [256,128,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 23, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Transpose", + "Name": "transpose_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":56,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":57,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Permutation": {"DIMS":[0,2,1,3]} + }, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [8,8], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 24, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":58,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":59,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 25, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add", + "IsVirtual": false, + "ReadTensors": [ + 
{"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":61,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 26, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_2", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 27, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_4", + "IsVirtual": false, + "ReadTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":65,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 28, + "NumWarps": 1, + "SramBytes": 256, + "Ops": [ + { + "Type": "ReduceMean", + "Name": "reduce_mean_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + 
{"Id":67,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "Axis": {"INT":2}, + "KeepDim": {"BOOL":true} + }, + "Config": { + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 256, + "NumTasks": 2048 + } + } + ] + }, + { + "Id": 29, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Rsqrt", + "Name": "rsqrt_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":69,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64,1], + "NumTasks": 32 + } + } + ] + }, + { + "Id": 30, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_5", + "IsVirtual": false, + "ReadTensors": [ + {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":71,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 31, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":8,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + 
"Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 32, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Cast", + "Name": "cast_3", + "IsVirtual": false, + "ReadTensors": [ + {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":74,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + }, + { + "Id": 33, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_6", + "IsVirtual": false, + "ReadTensors": [ + {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":4,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":76,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 34, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Sigmoid", + "Name": "sigmoid", + "IsVirtual": false, + "ReadTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":78,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 352256 + } + } + ] + }, + { + "Id": 35, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":80,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 352256 + } + } + ] + }, + { + "Id": 36, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":82,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 688 + } + } + ] + }, + { + "Id": 37, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Mul", + "Name": "mul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":84,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 352256 + } + } + ] + }, + { + "Id": 38, + "NumWarps": 4, + "SramBytes": 24672, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + 
{"Id":5,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":86,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 24672, + "TileShapeMNK": [128,256,32], + "NumTasks": 256 + } + } + ] + }, + { + "Id": 39, + "NumWarps": 1, + "SramBytes": 0, + "Ops": [ + { + "Type": "Add", + "Name": "add_1", + "IsVirtual": false, + "ReadTensors": [ + {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":88,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":89,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1,64], + "NumTasks": 131072 + } + } + ] + } + ], + "ProcessorGroups": [ + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":0,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":1,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":2,"TaskRange":[0,2048],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,32], + "ResourceGroups": [ + { + "ProcessorRange": [0,32], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":3,"TaskRange":[0,32],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":4,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":5,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":6,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": 
[0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":7,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":8,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":9,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":10,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":11,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":12,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":13,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":14,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":15,"TaskRange":[0,4096],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":16,"TaskRange":[0,2097152],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":17,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":18,"TaskRange":[0,2097152],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":19,"TaskRange":[0,2097152],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":20,"TaskRange":[0,65536],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":21,"TaskRange":[0,2097152],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":22,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + 
"ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":23,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":24,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":25,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":26,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":27,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,256], + "TaskGroups": [ + {"TaskId":28,"TaskRange":[0,2048],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,32], + "ResourceGroups": [ + { + "ProcessorRange": [0,32], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":29,"TaskRange":[0,32],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":30,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":31,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":32,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":33,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":34,"TaskRange":[0,352256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":35,"TaskRange":[0,352256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":36,"TaskRange":[0,688],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":37,"TaskRange":[0,352256],"Granularity":1} + ] + } + ] + }, + { + "ProcessorRange": [0,256], + "ResourceGroups": [ + { + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], + "TaskGroups": [ + {"TaskId":38,"TaskRange":[0,256],"Granularity":1} + ] + } + ] + }, + { + 
"ProcessorRange": [0,304], + "ResourceGroups": [ + { + "ProcessorRange": [0,304], + "WarpRange": [0,1], + "SramRange": [0,0], + "TaskGroups": [ + {"TaskId":39,"TaskRange":[0,131072],"Granularity":1} + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/python/ark/profiler.py b/python/ark/profiler.py index 529a0d506..56233247c 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -8,11 +8,10 @@ from .planner import Plan -def timeit(plan: Plan): +def timeit(plan: Plan, iter: int): with Runtime() as rt: rt.launch(plan=plan) start_time = time.time() - iter = 1000 rt.run(iter=iter) end_time = time.time() return (end_time - start_time) / iter @@ -22,8 +21,11 @@ class Profiler: def __init__(self, plan: Plan): self.plan = plan - def run(self): - sys.stderr.write(f"End-to-end: {timeit(self.plan):.6f} seconds/iter\n") + def run(self, iter: int = 1000, profile_processor_groups: bool = False): + sys.stderr.write(f"End-to-end: {timeit(self.plan, iter):.6f} seconds/iter\n") + + if not profile_processor_groups: + return num_processor_groups = len(self.plan.processor_groups) new_plan = { "Rank": self.plan.rank, @@ -36,7 +38,7 @@ def run(self): } for i in range(num_processor_groups): new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] - lat_per_iter = timeit(Plan(new_plan)) + lat_per_iter = timeit(Plan(new_plan), iter) sys.stderr.write( f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n" ) From cc30912486c24f71617ee2200c7429ea2e610d51 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 5 Jul 2024 07:12:49 +0000 Subject: [PATCH 025/106] optimization --- examples/llama/plan_llama2_7b_b1_s2048.json | 732 ++++---------------- 1 file changed, 126 insertions(+), 606 deletions(-) diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json index d0e46d228..15b0de2d0 100644 --- a/examples/llama/plan_llama2_7b_b1_s2048.json +++ b/examples/llama/plan_llama2_7b_b1_s2048.json @@ -27,17 +27,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 1, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul", @@ -56,17 +49,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 2, - "NumWarps": 1, - "SramBytes": 256, - "Ops": [ + }, { "Type": "ReduceMean", "Name": "reduce_mean", @@ -87,7 +73,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 2048 } } @@ -144,17 +130,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 5, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul_2", @@ -173,17 +152,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 6, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Cast", "Name": "cast_1", @@ -201,8 +173,8 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } } ] @@ -233,17 +205,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 8, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Rope", "Name": "rope", @@ 
-260,19 +225,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,1,128], + "NumTasks": 256 } - } - ] - }, - { - "Id": 9, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Transpose", "Name": "transpose", @@ -290,10 +248,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -324,17 +282,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 11, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Rope", "Name": "rope_1", @@ -351,19 +302,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } - } - ] - }, - { - "Id": 12, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Transpose", "Name": "transpose_2", @@ -372,19 +316,19 @@ {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ - {"Id":41,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":41,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "ResultTensors": [ - {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "Args": { - "Permutation": {"DIMS":[0,2,3,1]} + "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -415,17 +359,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 14, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Transpose", "Name": "transpose_1", @@ -443,10 +380,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -462,7 +399,7 @@ "IsVirtual": false, "ReadTensors": [ {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":42,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + 
{"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} ], "WriteTensors": [ {"Id":43,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} @@ -472,22 +409,15 @@ ], "Args": { "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} + "TransposeOther": {"BOOL":true} }, "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 4096 } - } - ] - }, - { - "Id": 16, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "ScalarMul", "Name": "mul_3", @@ -505,10 +435,10 @@ "Factor": {"FLOAT":0.0883883461356163} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 2097152 + "Tile": [256,128], + "NumTasks": 4096 } } ] @@ -516,7 +446,7 @@ { "Id": 17, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceMax", @@ -538,17 +468,10 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 65536 } - } - ] - }, - { - "Id": 18, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Sub", "Name": "sub", @@ -567,17 +490,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 2097152 + "Tile": [1,2048], + "NumTasks": 65536 } - } - ] - }, - { - "Id": 19, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Exp", "Name": "exp", @@ -595,17 +511,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 2097152 + "Tile": [1,2048], + "NumTasks": 65536 } - } - ] - }, - { - "Id": 20, - "NumWarps": 1, - "SramBytes": 256, - "Ops": [ + }, { "Type": "ReduceSum", "Name": "reduce_sum", @@ -626,17 +535,10 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 65536 } - } - ] - }, - { - "Id": 21, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Div", "Name": "div", @@ -655,8 +557,8 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 2097152 + "Tile": [1,2048], + "NumTasks": 65536 } } ] @@ -690,14 +592,7 @@ "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 23, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Transpose", "Name": "transpose_3", @@ -715,10 +610,10 @@ "Permutation": {"DIMS":[0,2,1,3]} }, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 + "Tile": [256,1,128], + "NumTasks": 256 } } ] @@ -749,17 +644,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 25, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Add", "Name": "add", @@ -776,19 +664,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } - } - ] - }, - { - "Id": 26, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Cast", "Name": "cast_2", @@ -804,19 +685,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } - } - ] - }, - { - "Id": 27, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", 
"Name": "mul_4", @@ -833,10 +707,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -844,7 +718,7 @@ { "Id": 28, "NumWarps": 1, - "SramBytes": 256, + "SramBytes": 0, "Ops": [ { "Type": "ReduceMean", @@ -866,7 +740,7 @@ "Config": { "NumWarps": 1, "ImplType": "WarpWise", - "SramBytes": 256, + "SramBytes": 0, "NumTasks": 2048 } } @@ -923,17 +797,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 31, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul_6", @@ -952,17 +819,10 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } - } - ] - }, - { - "Id": 32, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Cast", "Name": "cast_3", @@ -980,8 +840,8 @@ "Config": { "NumWarps": 1, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [1,4096], + "NumTasks": 2048 } } ] @@ -1012,17 +872,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 688 } - } - ] - }, - { - "Id": 34, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Sigmoid", "Name": "sigmoid", @@ -1038,19 +891,12 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 352256 + "Tile": [256,128], + "NumTasks": 688 } - } - ] - }, - { - "Id": 35, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul_7", @@ -1067,10 +913,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 352256 + "Tile": [256,128], + "NumTasks": 688 } } ] @@ -1101,17 +947,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 688 } - } - ] - }, - { - "Id": 37, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Mul", "Name": "mul_8", @@ -1128,10 +967,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 352256 + "Tile": [256,128], + "NumTasks": 688 } } ] @@ -1162,17 +1001,10 @@ "Config": { "NumWarps": 4, "SramBytes": 24672, - "TileShapeMNK": [128,256,32], + "TileShapeMNK": [256,128,32], "NumTasks": 256 } - } - ] - }, - { - "Id": 39, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ + }, { "Type": "Add", "Name": "add_1", @@ -1189,10 +1021,10 @@ ], "Args": {}, "Config": { - "NumWarps": 1, + "NumWarps": 4, "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 + "Tile": [256,128], + "NumTasks": 256 } } ] @@ -1204,23 +1036,23 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], + "WarpRange": [0,4], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,131072],"Granularity":1} + {"TaskId":0,"TaskRange":[0,2048],"Granularity":4} ] } ] }, { - "ProcessorRange": [0,304], + "ProcessorRange": [0,32], "ResourceGroups": [ { - "ProcessorRange": [0,304], + "ProcessorRange": [0,32], "WarpRange": [0,1], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,131072],"Granularity":1} + {"TaskId":3,"TaskRange":[0,32],"Granularity":1} ] } ] @@ -1230,101 +1062,23 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,0], "TaskGroups": [ - 
{"TaskId":2,"TaskRange":[0,2048],"Granularity":1} + {"TaskId":4,"TaskRange":[0,2048],"Granularity":4} ] } ] }, { - "ProcessorRange": [0,32], + "ProcessorRange": [0,256], "ResourceGroups": [ { - "ProcessorRange": [0,32], - "WarpRange": [0,1], - "SramRange": [0,0], + "ProcessorRange": [0,256], + "WarpRange": [0,4], + "SramRange": [0,24672], "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,32],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":7,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":8,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":9,"TaskRange":[0,131072],"Granularity":1} + {"TaskId":7,"TaskRange":[0,256],"Granularity":1} ] } ] @@ -1342,32 +1096,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":11,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":12,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,256], "ResourceGroups": [ @@ -1381,19 +1109,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":14,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,304], "ResourceGroups": [ @@ -1412,75 +1127,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":16,"TaskRange":[0,2097152],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,256], - "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":18,"TaskRange":[0,2097152],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":19,"TaskRange":[0,2097152],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - 
"ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,256], - "TaskGroups": [ - {"TaskId":20,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], + "WarpRange": [0,4], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":21,"TaskRange":[0,2097152],"Granularity":1} + {"TaskId":17,"TaskRange":[0,65536],"Granularity":4} ] } ] @@ -1498,19 +1148,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":23,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,256], "ResourceGroups": [ @@ -1529,49 +1166,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":25,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":26,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], + "WarpRange": [0,4], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":27,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,256], - "TaskGroups": [ - {"TaskId":28,"TaskRange":[0,2048],"Granularity":1} + {"TaskId":28,"TaskRange":[0,2048],"Granularity":4} ] } ] @@ -1594,36 +1192,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":30,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":31,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], + "WarpRange": [0,4], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":32,"TaskRange":[0,131072],"Granularity":1} + {"TaskId":30,"TaskRange":[0,2048],"Granularity":4} ] } ] @@ -1641,32 +1213,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":34,"TaskRange":[0,352256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":35,"TaskRange":[0,352256],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,304], "ResourceGroups": [ @@ -1680,19 +1226,6 @@ } ] }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":37,"TaskRange":[0,352256],"Granularity":1} - ] - } - ] - }, { "ProcessorRange": [0,256], "ResourceGroups": [ @@ -1705,19 +1238,6 @@ ] } ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":39,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] } ] } \ No newline at end of file From 34a87d867669aae49b2a29056aadfed694d97b33 Mon Sep 
17 00:00:00 2001 From: Changho Hwang Date: Mon, 8 Jul 2024 02:10:40 +0000 Subject: [PATCH 026/106] optimize --- examples/llama/plan_llama2_7b_b1_s2048.json | 97 ++++++++++++++++----- 1 file changed, 76 insertions(+), 21 deletions(-) diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json index 15b0de2d0..d5c9fe552 100644 --- a/examples/llama/plan_llama2_7b_b1_s2048.json +++ b/examples/llama/plan_llama2_7b_b1_s2048.json @@ -3,7 +3,7 @@ "WorldSize": 1, "Architecture": "ROCM_942", "NumProcessors": 304, - "NumWarpsPerProcessor": 4, + "NumWarpsPerProcessor": 8, "TaskInfos": [ { "Id": 0, @@ -948,7 +948,7 @@ "NumWarps": 4, "SramBytes": 24672, "TileShapeMNK": [256,128,32], - "NumTasks": 688 + "NumTasks": 602 } }, { @@ -970,7 +970,61 @@ "NumWarps": 4, "SramBytes": 0, "Tile": [256,128], - "NumTasks": 688 + "NumTasks": 602 + } + } + ] + }, + { + "Id": 37, + "NumWarps": 4, + "SramBytes": 16480, + "Ops": [ + { + "Type": "Matmul", + "Name": "matmul_7", + "IsVirtual": false, + "ReadTensors": [ + {"Id":102,"DataType":"FP16","Shape":[1,1792,4096],"Strides":[1,2048,4096],"Offsets":[0,256,0],"PaddedShape":[1,1792,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":101,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":100,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": { + "TransposeInput": {"BOOL":false}, + "TransposeOther": {"BOOL":true} + }, + "Config": { + "NumWarps": 4, + "SramBytes": 16480, + "TileShapeMNK": [128,128,32], + "NumTasks": 172 + } + }, + { + "Type": "Mul", + "Name": "mul_8", + "IsVirtual": false, + "ReadTensors": [ + {"Id":81,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, + {"Id":83,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "WriteTensors": [ + {"Id":84,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "ResultTensors": [ + {"Id":85,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} + ], + "Args": {}, + "Config": { + "NumWarps": 4, + "SramBytes": 0, + "Tile": [128,128], + "NumTasks": 172 } } ] @@ -1036,10 +1090,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":0,"TaskRange":[0,2048],"Granularity":7} ] } ] @@ -1062,10 +1116,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], 
"TaskGroups": [ - {"TaskId":4,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":4,"TaskRange":[0,2048],"Granularity":7} ] } ] @@ -1114,10 +1168,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], + "WarpRange": [0,8], + "SramRange": [0,49344], "TaskGroups": [ - {"TaskId":15,"TaskRange":[0,4096],"Granularity":1} + {"TaskId":15,"TaskRange":[0,4096],"Granularity":2} ] } ] @@ -1127,10 +1181,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,65536],"Granularity":4} + {"TaskId":17,"TaskRange":[0,65536],"Granularity":8} ] } ] @@ -1166,10 +1220,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":28,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":28,"TaskRange":[0,2048],"Granularity":7} ] } ] @@ -1192,10 +1246,10 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], + "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":30,"TaskRange":[0,2048],"Granularity":4} + {"TaskId":30,"TaskRange":[0,2048],"Granularity":7} ] } ] @@ -1205,8 +1259,8 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], + "WarpRange": [0,8], + "SramRange": [0,49344], "TaskGroups": [ {"TaskId":33,"TaskRange":[0,688],"Granularity":1} ] @@ -1218,10 +1272,11 @@ "ResourceGroups": [ { "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], + "WarpRange": [0,8], + "SramRange": [0,49344], "TaskGroups": [ - {"TaskId":36,"TaskRange":[0,688],"Granularity":1} + {"TaskId":36,"TaskRange":[0,602],"Granularity":2}, + {"TaskId":37,"TaskRange":[0,172],"Granularity":1} ] } ] From 866112de65a6fd5d3c3d89d80cdc53ff27c8c36a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 9 Jul 2024 01:07:21 +0000 Subject: [PATCH 027/106] optimize --- ark/include/kernels/common/sync.h | 3 + ark/include/kernels/reduce.h | 41 +++++++-- examples/llama/plan_llama2_7b_b1_s2048.json | 94 +-------------------- 3 files changed, 36 insertions(+), 102 deletions(-) diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h index f47625600..456a32eb7 100644 --- a/ark/include/kernels/common/sync.h +++ b/ark/include/kernels/common/sync.h @@ -106,6 +106,9 @@ DEVICE void sync_warps() { static_assert(Arch::ThreadsPerWarp == 64, ""); if constexpr (NumWarps == 1) { __builtin_amdgcn_wave_barrier(); + } else if constexpr (NumWarps == ARK_WARPS_PER_BLOCK) { + // asm volatile("s_waitcnt lgkmcnt(0) \n s_barrier " ::); + __syncthreads(); } else { static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState), ""); diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h index 3d0b4e008..2dd79d2c3 100644 --- a/ark/include/kernels/reduce.h +++ b/ark/include/kernels/reduce.h @@ -355,8 +355,15 @@ struct WwiseReduce { int smem_per_warp) { using ShapeChecker = ReduceShapeChecker; + constexpr int InConsecBytes = sizeof(DataType) * InShape::W; constexpr int NelemPerThread = - DefaultNelemPerThread::value; + (InConsecBytes % 16 == 0) + ? 16 / sizeof(DataType) + : (InConsecBytes % 8 == 0) + ? 8 / sizeof(DataType) + : (InConsecBytes % 4 == 0) + ? 4 / sizeof(DataType) + : (InConsecBytes % 2 == 0) ? 
2 / sizeof(DataType) : 1; constexpr int NonReduceDimLength = UnitOutDims::N * UnitOutDims::C * UnitOutDims::H; @@ -397,22 +404,38 @@ struct WwiseReduce { &in[idx_in]); } - DataType finalSum; - ReduceType::template identity<1>(&finalSum); + static_assert(math::is_pow2::value, + "NelemPerThread must be power of 2"); + if constexpr (NelemPerThread > 8) { #pragma unroll - for (int i = 0; i < NelemPerThread; ++i) { - ReduceType::template reduce<1>(&finalSum, &finalSum, &reduced[i]); + for (int i = 8; i < NelemPerThread; i += 8) { + ReduceType::template reduce<8>(&reduced[0], &reduced[0], &reduced[i]); + } + ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 8) { + ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]); + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 4) { + ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]); + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); + } else if constexpr (NelemPerThread == 2) { + ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]); } - UnitOp::sync_threads(); + if constexpr (InShape::W % ThreadsPerRow != 0) { + UnitOp::sync_threads(); + } // final reduction on shared memory using warp shuffle. - finalSum = warpsReduce( - finalSum, tid, smem_per_warp); + reduced[0] = warpsReduce( + reduced[0], tid, smem_per_warp); // write the result to output. if (tid % ThreadsPerRow == 0) { - ReduceType::template postReduce<1>(&out[idx_out], &finalSum, + ReduceType::template postReduce<1>(&out[idx_out], &reduced[0], InShape::W); } diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json index d5c9fe552..b0bc757dc 100644 --- a/examples/llama/plan_llama2_7b_b1_s2048.json +++ b/examples/llama/plan_llama2_7b_b1_s2048.json @@ -230,29 +230,6 @@ "Tile": [256,1,128], "NumTasks": 256 } - }, - { - "Type": "Transpose", - "Name": "transpose", - "IsVirtual": false, - "ReadTensors": [ - {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } } ] }, @@ -307,29 +284,6 @@ "Tile": [256,128], "NumTasks": 256 } - }, - { - "Type": "Transpose", - "Name": "transpose_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":41,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } } ] }, @@ -362,29 +316,6 @@ "TileShapeMNK": [256,128,32], "NumTasks": 256 } - }, - { - "Type": "Transpose", - "Name": "transpose_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":39,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } } ] }, @@ -592,29 +523,6 @@ "TileShapeMNK": [256,128,32], "NumTasks": 256 } - }, - { - "Type": "Transpose", - "Name": "transpose_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":56,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":57,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,1,128], - "NumTasks": 256 - } } ] }, @@ -1184,7 +1092,7 @@ "WarpRange": [0,8], "SramRange": [0,0], "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,65536],"Granularity":8} + {"TaskId":17,"TaskRange":[0,65536],"Granularity":1} ] } ] From 68e787ae377c282c9d117e6650eb112a34c54a9c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 9 Jul 2024 20:51:44 +0000 Subject: [PATCH 028/106] fix bf16 matmul --- ark/ops/ops_matmul.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index b4553a4ed..a24b95d72 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -223,7 +223,7 @@ static const Json get_default_config(const ArchRef arch, {"TileShapeMNK", {tm, tn, 32}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) { return {{"NumWarps", 4}, - {"SramBytes", 24672}, + {"SramBytes", 24624}, {"TileShapeMNK", {tm, tn, 32}}}; } ERR(InternalError, "Unexpected error"); From b18bdb2e66d30c34b21657e15bb6cf491f108544 
Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 10 Jul 2024 23:44:07 +0000 Subject: [PATCH 029/106] Enhance executor interfaces --- ark/api/executor.cpp | 295 +++++++++++++++++++---------- ark/gpu/gpu_event.cpp | 11 +- ark/gpu/gpu_event.h | 4 +- ark/gpu/gpu_kernel.cpp | 2 +- ark/gpu/gpu_kernel.h | 2 +- ark/gpu/gpu_manager.cpp | 18 +- ark/gpu/gpu_manager.h | 4 +- ark/include/ark/executor.hpp | 46 +++-- ark/model/model_json.cpp | 11 +- ark/model/model_json.hpp | 2 +- ark/model/model_op.cpp | 5 +- ark/ops/ops_all_reduce_test.cpp | 7 +- ark/ops/ops_communication_test.cpp | 8 +- ark/ops/ops_embedding_test.cpp | 6 +- ark/ops/ops_test_common.cpp | 20 +- ark/ops/ops_test_common.hpp | 15 +- cmake/Utils.cmake | 2 +- python/ark/runtime.py | 4 +- python/ark/tensor.py | 10 +- python/executor_py.cpp | 59 +++++- 20 files changed, 344 insertions(+), 187 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 14625161f..2f50a4280 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -140,10 +140,17 @@ static size_t tensor_stride_bytes(const Json &tensor) { class Executor::Impl { public: - Impl(int rank, int world_size, int gpu_id, const std::string &name, - const std::string &plan); + Impl(int device_id, Stream stream, const std::string &name); ~Impl() = default; + void init(const PlanJson& plan); + + int device_id() const { return device_id_; } + + Stream stream() const { return reinterpret_cast(stream_raw_); } + + std::string plan() const { return plan_json_.dump_pretty(); } + void compile(); void launch(int64_t max_spin_count); void run(int iter); @@ -151,9 +158,12 @@ class Executor::Impl { float stop(int64_t max_spin_count); void barrier(); - void tensor_read(const Tensor tensor, void *data, size_t bytes) const; - void tensor_write(const Tensor tensor, const void *data, - size_t bytes) const; + uintptr_t tensor_address(const Tensor tensor) const; + + void tensor_read(const Tensor tensor, void *data, size_t bytes, + Stream stream, bool is_d2d) const; + void tensor_write(const Tensor tensor, const void *data, size_t bytes, + Stream stream, bool is_d2d) const; private: void init_communicator(); @@ -162,14 +172,18 @@ class Executor::Impl { void init_channels(const std::set &remote_ranks); protected: - const int rank_; - const int world_size_; - int gpu_id_; + int device_id_; + std::string name_; + gpuStream stream_raw_; + + int rank_; + int world_size_; bool is_launched_ = false; bool is_recording_ = false; float elapsed_msec_ = -1; + PlanJson plan_json_; std::map buffer_id_to_offset_; size_t total_bytes_; std::shared_ptr codegen_; @@ -177,8 +191,7 @@ class Executor::Impl { std::shared_ptr timer_end_; std::shared_ptr buffer_; std::shared_ptr flag_; - std::shared_ptr main_stream_; - std::shared_ptr copy_stream_; + std::shared_ptr stream_; std::shared_ptr kernel_; // For communication @@ -190,30 +203,35 @@ class Executor::Impl { rank_to_sm_channels_; }; -Executor::Impl::Impl(int rank, int world_size, int gpu_id, - const std::string &name, const std::string &plan) - : rank_(rank), world_size_(world_size), gpu_id_(gpu_id) { - if (rank < 0 || rank >= world_size) { - ERR(InvalidUsageError, "Invalid rank ", rank, " with world size ", - world_size); +Executor::Impl::Impl(int device_id, Stream stream, const std::string &name) + : device_id_(device_id), name_(name) { + if (device_id < 0) { + ERR(InvalidUsageError, "Invalid device ID ", device_id); } - if (gpu_id < 0) { - ERR(InvalidUsageError, "Invalid GPU ID ", gpu_id); + if (stream) { + stream_raw_ = reinterpret_cast(stream); 
+ } else { + stream_ = GpuManager::get_instance(device_id_)->create_stream(); + stream_raw_ = stream_->get(); + } +} + +void Executor::Impl::init(const PlanJson &plan_json) { + plan_json_ = plan_json; + rank_ = plan_json_["Rank"].get(); + world_size_ = plan_json_["WorldSize"].get(); + + if (rank_ < 0 || rank_ >= world_size_) { + ERR(InvalidUsageError, "Invalid rank ", rank_, " with world size ", + world_size_); } if (world_size_ > 1) { init_communicator(); } - Json plan_json; - auto &plan_path = get_env().enforce_plan_path; - if (!plan_path.empty()) { - LOG(INFO, "Enforce executor plan path: ", plan_path); - plan_json = Json::parse(read_file(plan_path)); - } else { - plan_json = Json::parse(plan); - } + auto gpu_manager = GpuManager::get_instance(device_id_); - buffer_id_to_offset_ = init_buffers(plan_json); + buffer_id_to_offset_ = init_buffers(plan_json_); std::string buffer_id_to_offset_str; for (const auto &kv : buffer_id_to_offset_) { @@ -221,17 +239,14 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", "; } - codegen_ = - std::make_shared(plan_json, buffer_id_to_offset_, name); + codegen_ = std::make_shared(plan_json_, buffer_id_to_offset_, + name_); - auto gpu_manager = GpuManager::get_instance(gpu_id_); timer_begin_ = gpu_manager->create_event(); timer_end_ = gpu_manager->create_event(); buffer_ = gpu_manager->malloc(total_bytes_, 65536); flag_ = gpu_manager->malloc_host( sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); - main_stream_ = gpu_manager->create_stream(); - copy_stream_ = gpu_manager->create_stream(); int threads_per_block = static_cast( codegen_->num_warps_per_proc() * gpu_manager->info().threads_per_warp); @@ -241,13 +256,13 @@ Executor::Impl::Impl(int rank, int world_size, int gpu_id, static_cast(gpu_manager->info().smem_block_total); if (world_size_ > 1) { - auto remote_ranks = init_remote_ranks(plan_json); + auto remote_ranks = init_remote_ranks(plan_json_); init_channels(remote_ranks); } kernel_ = std::shared_ptr(new GpuKernel( - gpu_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1}, - std::max(smem_block_total, size_t(4)), name, + device_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1}, + std::max(smem_block_total, size_t(4)), name_, {std::pair{buffer_->ref(), sizeof(buffer_->ref())}, std::pair{flag, sizeof(flag)}})); } @@ -509,7 +524,7 @@ void Executor::Impl::init_channels(const std::set &remote_ranks) { mscclpp::TransportFlags all_transports = mscclpp::Transport::CudaIpc | mscclpp::Transport::Ethernet; if (!get_env().disable_ib) { - all_transports |= IBs[gpu_id_]; + all_transports |= IBs[device_id_]; } mscclpp::RegisteredMemory regmem = comm_->registerMemory(buffer_->ref(), buffer_->bytes(), all_transports); @@ -530,12 +545,12 @@ void Executor::Impl::init_channels(const std::set &remote_ranks) { if (remote_node == this_node) { add_connection(remote_rank, mscclpp::Transport::CudaIpc); if (!get_env().disable_ib) { - add_connection(remote_rank, IBs[gpu_id_]); + add_connection(remote_rank, IBs[device_id_]); } } else { add_connection(remote_rank, get_env().disable_ib ? 
mscclpp::Transport::Ethernet - : IBs[gpu_id_]); + : IBs[device_id_]); } comm_->sendMemoryOnSetup(regmem, remote_rank, 0); rank_to_remote_regmem_future[remote_rank] = @@ -623,22 +638,22 @@ void Executor::Impl::launch(int64_t max_spin_count) { sm_handles[i] = it2->second[0]->deviceHandle(); } } - GLOG(gpuSetDevice(gpu_id_)); + GLOG(gpuSetDevice(device_id_)); GLOG(gpuMemcpyAsync( proxy_chan_addr, proxy_handles.data(), proxy_handles.size() * sizeof(mscclpp::SimpleProxyChannel::DeviceHandle), - gpuMemcpyHostToDevice, copy_stream_->get())); + gpuMemcpyHostToDevice, stream_raw_)); GLOG(gpuMemcpyAsync( proxy_secondary_chan_addr, proxy_secondary_handles.data(), proxy_secondary_handles.size() * sizeof(mscclpp::SimpleProxyChannel::DeviceHandle), - gpuMemcpyHostToDevice, copy_stream_->get())); + gpuMemcpyHostToDevice, stream_raw_)); GLOG(gpuMemcpyAsync( sm_chan_addr, sm_handles.data(), sm_handles.size() * sizeof(mscclpp::SmChannel::DeviceHandle), - gpuMemcpyHostToDevice, copy_stream_->get())); - copy_stream_->sync(); + gpuMemcpyHostToDevice, stream_raw_)); + GLOG(gpuStreamSynchronize(stream_raw_)); } elapsed_msec_ = -1; @@ -648,7 +663,7 @@ void Executor::Impl::launch(int64_t max_spin_count) { LOG(WARN, "Ignore launching twice."); return; } - timer_begin_->record(main_stream_); + timer_begin_->record(stream_raw_); if (world_size_ > 1) { proxy_service_->startProxy(); @@ -656,8 +671,8 @@ void Executor::Impl::launch(int64_t max_spin_count) { // Initialize loop flags. atomicStoreRelaxed(flag_->ref(), 0); - kernel_->launch(main_stream_); - timer_end_->record(main_stream_); + kernel_->launch(stream_raw_); + timer_end_->record(stream_raw_); is_recording_ = true; is_launched_ = true; } @@ -677,7 +692,7 @@ void Executor::Impl::wait(int64_t max_spin_count) { continue; } // Check if the kernel encountered an error. 
- gpuError res = main_stream_->query(); + gpuError res = gpuStreamQuery(stream_raw_); if (res == gpuSuccess) { if (atomicLoadRelaxed(flag_->ref()) > 0) { LOG(WARN, "Stream is finished but the loop flag is still set."); @@ -699,7 +714,7 @@ void Executor::Impl::wait(int64_t max_spin_count) { float Executor::Impl::stop(int64_t max_spin_count) { this->wait(max_spin_count); atomicStoreRelaxed(flag_->ref(), -1); - main_stream_->sync(); + GLOG(gpuStreamSynchronize(stream_raw_)); if (is_recording_) { elapsed_msec_ = timer_end_->elapsed_msec(*timer_begin_); is_recording_ = false; @@ -717,71 +732,140 @@ void Executor::Impl::barrier() { } } -void Executor::Impl::tensor_read(const Tensor tensor, void *data, - size_t bytes) const { - GLOG(gpuSetDevice(gpu_id_)); +uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const { + size_t buffer_id = tensor.ref()->buffer()->id(); + if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); + } + size_t offset = buffer_id_to_offset_.at(buffer_id); + return reinterpret_cast(buffer_->ref(offset)); +} + +void Executor::Impl::tensor_read(const Tensor tensor, void *data, size_t bytes, + Stream stream, bool is_d2d) const { + GLOG(gpuSetDevice(device_id_)); + std::shared_ptr copy_stream; + gpuStream copy_stream_raw; + if (stream) { + copy_stream_raw = reinterpret_cast(stream); + if ((stream == stream_raw_) && is_launched_) { + LOG(WARN, + "Reading from a tensor in the same stream of the kernel " + "may cause a deadlock."); + } + } else { + copy_stream = GpuManager::get_instance(device_id_)->create_stream(); + copy_stream_raw = copy_stream->get(); + } size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); - if (bytes < tensor_data_bytes) { - ERR(InvalidUsageError, "Data buffer (", bytes, - ") is smaller than the tensor data (", tensor_data_bytes, ")."); + if (bytes != tensor_data_bytes) { + ERR(InvalidUsageError, "Destination bytes (", bytes, + ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } - size_t tensor_bytes = - tensor.strides().nelems() * tensor.data_type().bytes(); - void *src = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + auto kind = (is_d2d) ? 
gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost; + void *src = reinterpret_cast(tensor_address(tensor)); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(data, src, bytes, gpuMemcpyDeviceToHost, - copy_stream_->get())); - copy_stream_->sync(); + GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_raw)); } else { + size_t tensor_bytes = + tensor.strides().nelems() * tensor.data_type().bytes(); std::vector tensor_host(tensor_bytes); GLOG(gpuMemcpyAsync(tensor_host.data(), src, tensor_bytes, - gpuMemcpyDeviceToHost, copy_stream_->get())); - copy_stream_->sync(); - tensor_to_data(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), + gpuMemcpyDeviceToHost, copy_stream_raw)); + GLOG(gpuStreamSynchronize(copy_stream_raw)); + if (!is_d2d) { + tensor_to_data(tensor_host.data(), static_cast(data), + tensor.shape(), tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + return; + } + // TODO: convert data layout on the device directly + std::vector data_host(bytes); + tensor_to_data(tensor_host.data(), data_host.data(), tensor.shape(), + tensor.strides(), tensor.offsets(), tensor.data_type().bytes()); + GLOG(gpuMemcpyAsync(data, data_host.data(), bytes, + gpuMemcpyHostToDevice, copy_stream_raw)); } + GLOG(gpuStreamSynchronize(copy_stream_raw)); } void Executor::Impl::tensor_write(const Tensor tensor, const void *data, - size_t bytes) const { - GLOG(gpuSetDevice(gpu_id_)); + size_t bytes, Stream stream, + bool is_d2d) const { + GLOG(gpuSetDevice(device_id_)); + std::shared_ptr copy_stream; + gpuStream copy_stream_raw; + if (stream) { + copy_stream_raw = reinterpret_cast(stream); + if ((stream == stream_raw_) && is_launched_) { + LOG(WARN, + "Writing to a tensor in the same stream of the kernel " + "may cause a deadlock."); + } + } else { + copy_stream = GpuManager::get_instance(device_id_)->create_stream(); + copy_stream_raw = copy_stream->get(); + } size_t tensor_data_bytes = tensor.shape().nelems() * tensor.data_type().bytes(); - if (bytes < tensor_data_bytes) { - ERR(InvalidUsageError, "Data buffer (", bytes, - ") is smaller than the tensor data (", tensor_data_bytes, ")."); + if (bytes != tensor_data_bytes) { + ERR(InvalidUsageError, "Source bytes (", bytes, + ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } size_t tensor_bytes = tensor.strides().nelems() * tensor.data_type().bytes(); - void *dst = - buffer_->ref(buffer_id_to_offset_.at(tensor.ref()->buffer()->id())); + auto kind = (is_d2d) ? 
gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; + void *dst = reinterpret_cast(tensor_address(tensor)); if (tensor.strides() == tensor.shape()) { - GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, gpuMemcpyHostToDevice, - copy_stream_->get())); + GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_raw)); } else { std::vector tensor_host(tensor_bytes); - GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes, - gpuMemcpyDeviceToHost, copy_stream_->get())); - copy_stream_->sync(); - data_to_tensor(tensor_host.data(), static_cast(data), - tensor.shape(), tensor.strides(), tensor.offsets(), - tensor.data_type().bytes()); + if (!is_d2d) { + GLOG(gpuMemcpyAsync(tensor_host.data(), dst, tensor_bytes, + gpuMemcpyDeviceToHost, copy_stream_raw)); + GLOG(gpuStreamSynchronize(copy_stream_raw)); + data_to_tensor(tensor_host.data(), + static_cast(data), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + } else { + // TODO: convert data layout on the device directly + std::vector tmp(bytes); + GLOG(gpuMemcpyAsync(tmp.data(), data, bytes, gpuMemcpyDeviceToHost, + copy_stream_raw)); + GLOG(gpuStreamSynchronize(copy_stream_raw)); + data_to_tensor(tensor_host.data(), tmp.data(), tensor.shape(), + tensor.strides(), tensor.offsets(), + tensor.data_type().bytes()); + } GLOG(gpuMemcpyAsync(dst, tensor_host.data(), tensor_bytes, - gpuMemcpyHostToDevice, copy_stream_->get())); + gpuMemcpyHostToDevice, copy_stream_raw)); } - copy_stream_->sync(); + GLOG(gpuStreamSynchronize(copy_stream_raw)); } -Executor::Executor(int rank, int world_size, int gpu_id, - const std::string &name, const std::string &plan) - : impl_(std::make_unique(rank, world_size, gpu_id, name, - plan)) {} +Executor::Executor(int device_id, Stream stream, const std::string &name, + const std::string &plan) + : impl_(std::make_unique(device_id, stream, name)) { + auto &plan_path = get_env().enforce_plan_path; + if (!plan_path.empty()) { + LOG(INFO, "Enforce executor plan path: ", plan_path); + impl_->init(Json::parse(read_file(plan_path))); + } else if (!plan.empty()) { + impl_->init(Json::parse(plan)); + } +} Executor::~Executor() = default; +int Executor::device_id() const { return impl_->device_id(); } + +Stream Executor::stream() const { return impl_->stream(); } + +std::string Executor::plan() const { return impl_->plan(); } + void Executor::compile() { impl_->compile(); } void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); } @@ -800,25 +884,32 @@ void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } -void Executor::tensor_read(const Tensor tensor, void *data, - size_t bytes) const { - impl_->tensor_read(tensor, data, bytes); +uintptr_t Executor::tensor_address(const Tensor tensor) const { + return impl_->tensor_address(tensor); } -void Executor::tensor_write(const Tensor tensor, const void *data, - size_t bytes) const { - impl_->tensor_write(tensor, data, bytes); +void Executor::tensor_read(const Tensor tensor, void *data, size_t bytes, + Stream stream, bool is_d2d) const { + impl_->tensor_read(tensor, data, bytes, stream, is_d2d); } -DefaultExecutor::DefaultExecutor(const Model &model, int gpu_id, - const std::string &name) - : Executor( - model.rank(), model.world_size(), - (gpu_id < 0) ? (model.rank() % get_env().num_ranks_per_host) : gpu_id, - name, - DefaultPlanner(model, (gpu_id < 0) ? 
(model.rank() % - get_env().num_ranks_per_host) - : gpu_id) - .plan()) {} +void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, + Stream stream, bool is_d2d) const { + impl_->tensor_write(tensor, data, bytes, stream, is_d2d); +} + +DefaultExecutor::DefaultExecutor( + const Model &model, int device_id, Stream stream, + const std::vector &config_rules, + const std::string &name) + : Executor((device_id < 0) ? (model.rank() % get_env().num_ranks_per_host) + : device_id, + stream, name, "") { + DefaultPlanner planner(model, impl_->device_id()); + for (const auto &rule : config_rules) { + planner.install_config_rule(rule); + } + impl_->init(Json::parse(planner.plan())); +} } // namespace ark diff --git a/ark/gpu/gpu_event.cpp b/ark/gpu/gpu_event.cpp index 93ec3fd52..cbc45d9a6 100644 --- a/ark/gpu/gpu_event.cpp +++ b/ark/gpu/gpu_event.cpp @@ -3,7 +3,6 @@ #include "gpu/gpu_event.h" -#include "gpu/gpu.h" #include "gpu/gpu_logging.h" #include "gpu/gpu_manager.h" @@ -15,7 +14,7 @@ class GpuEvent::Impl { Impl(const Impl&) = delete; Impl& operator=(const Impl&) = delete; - void record(std::shared_ptr stream); + void record(gpuStream stream); float elapsed_msec(const GpuEvent& other) const; private: @@ -32,8 +31,8 @@ GpuEvent::Impl::Impl(bool disable_timing) { GpuEvent::Impl::~Impl() { GLOG(gpuEventDestroy(event_)); } -void GpuEvent::Impl::record(std::shared_ptr stream) { - GLOG(gpuEventRecord(event_, stream->get())); +void GpuEvent::Impl::record(gpuStream stream) { + GLOG(gpuEventRecord(event_, stream)); } float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const { @@ -45,9 +44,7 @@ float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const { GpuEvent::GpuEvent(bool disable_timing) : pimpl_(std::make_shared(disable_timing)) {} -void GpuEvent::record(std::shared_ptr stream) { - pimpl_->record(stream); -} +void GpuEvent::record(gpuStream stream) { pimpl_->record(stream); } float GpuEvent::elapsed_msec(const GpuEvent& other) const { return pimpl_->elapsed_msec(other); diff --git a/ark/gpu/gpu_event.h b/ark/gpu/gpu_event.h index 4599ecaa4..081f0203b 100644 --- a/ark/gpu/gpu_event.h +++ b/ark/gpu/gpu_event.h @@ -6,6 +6,8 @@ #include +#include "gpu/gpu.h" + namespace ark { class GpuStream; @@ -17,7 +19,7 @@ class GpuEvent { GpuEvent(const GpuEvent &) = delete; GpuEvent &operator=(const GpuEvent &) = delete; - void record(std::shared_ptr stream); + void record(gpuStream stream); float elapsed_msec(const GpuEvent &other) const; protected: diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp index 44ff43a1d..46f467f51 100644 --- a/ark/gpu/gpu_kernel.cpp +++ b/ark/gpu/gpu_kernel.cpp @@ -68,7 +68,7 @@ void GpuKernel::compile() { dynamic_smem_size_bytes)); } -void GpuKernel::launch(std::shared_ptr stream) { +void GpuKernel::launch(gpuStream stream) { if (!this->is_compiled()) { ERR(InvalidUsageError, "Kernel is not compiled yet."); } diff --git a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.h index c3b60aec4..b3be79071 100644 --- a/ark/gpu/gpu_kernel.h +++ b/ark/gpu/gpu_kernel.h @@ -27,7 +27,7 @@ class GpuKernel { const std::string& kernel_name, std::initializer_list> args = {}); void compile(); - void launch(std::shared_ptr stream); + void launch(gpuStream stream); gpuDeviceptr get_global(const std::string& name, bool ignore_not_found = false) const; diff --git a/ark/gpu/gpu_manager.cpp b/ark/gpu/gpu_manager.cpp index 3a6d0a066..fc841fa32 100644 --- a/ark/gpu/gpu_manager.cpp +++ b/ark/gpu/gpu_manager.cpp @@ -20,11 +20,10 @@ class GpuManager::Impl { int gpu_id_; 
     GpuManager::Info info_;
-    std::shared_ptr<GpuStream> main_stream_;
 
     void launch(gpuFunction kernel, const std::array<int, 3> &grid_dim,
                 const std::array<int, 3> &block_dim, int smem_bytes,
-                std::shared_ptr<GpuStream> stream, void **params, void **extra);
+                gpuStream stream, void **params, void **extra);
 };
 
 GpuManager::Impl::Impl(int gpu_id) : gpu_id_(gpu_id) {
@@ -76,11 +75,11 @@ GpuManager::Impl::Impl(int gpu_id) : gpu_id_(gpu_id) {
 void GpuManager::Impl::launch(gpuFunction kernel,
                               const std::array<int, 3> &grid_dim,
                               const std::array<int, 3> &block_dim,
-                              int smem_bytes, std::shared_ptr<GpuStream> stream,
-                              void **params, void **extra) {
+                              int smem_bytes, gpuStream stream, void **params,
+                              void **extra) {
     GLOG_DRV(gpuModuleLaunchKernel(
         kernel, grid_dim[0], grid_dim[1], grid_dim[2], block_dim[0],
-        block_dim[1], block_dim[2], smem_bytes, stream->get(), params, extra));
+        block_dim[1], block_dim[2], smem_bytes, stream, params, extra));
 }
 
 std::shared_ptr<GpuManager> GpuManager::get_instance(int gpu_id) {
@@ -102,9 +101,7 @@ std::shared_ptr<GpuManager> GpuManager::get_instance(int gpu_id) {
     }
 }
 
-GpuManager::GpuManager(int gpu_id) : pimpl_(std::make_shared<Impl>(gpu_id)) {
-    this->pimpl_->main_stream_ = std::shared_ptr<GpuStream>(new GpuStream());
-}
+GpuManager::GpuManager(int gpu_id) : pimpl_(std::make_shared<Impl>(gpu_id)) {}
 
 std::shared_ptr<GpuMemory> GpuManager::malloc(size_t bytes, size_t align,
                                               bool expose) {
@@ -126,8 +123,6 @@ std::shared_ptr<GpuStream> GpuManager::create_stream() const {
     return std::shared_ptr<GpuStream>(new GpuStream());
 }
 
-int GpuManager::get_gpu_id() const { return pimpl_->gpu_id_; }
-
 const GpuManager::Info &GpuManager::info() const { return pimpl_->info_; }
 
 void GpuManager::set_current() const { GLOG(gpuSetDevice(pimpl_->gpu_id_)); }
@@ -135,8 +130,7 @@ void GpuManager::set_current() const { GLOG(gpuSetDevice(pimpl_->gpu_id_)); }
 void GpuManager::launch(gpuFunction function,
                         const std::array<int, 3> &grid_dim,
                         const std::array<int, 3> &block_dim, int smem_bytes,
-                        std::shared_ptr<GpuStream> stream, void **params,
-                        void **extra) const {
+                        gpuStream stream, void **params, void **extra) const {
     this->set_current();
     pimpl_->launch(function, grid_dim, block_dim, smem_bytes, stream, params,
                    extra);
diff --git a/ark/gpu/gpu_manager.h b/ark/gpu/gpu_manager.h
index 05014ac47..93a48cf7b 100644
--- a/ark/gpu/gpu_manager.h
+++ b/ark/gpu/gpu_manager.h
@@ -30,11 +30,9 @@ class GpuManager {
     std::shared_ptr<GpuEvent> create_event(bool disable_timing = false) const;
     std::shared_ptr<GpuStream> create_stream() const;
-    int get_gpu_id() const;
 
     void launch(gpuFunction function, const std::array<int, 3> &grid_dim,
                 const std::array<int, 3> &block_dim, int smem_bytes,
-                std::shared_ptr<GpuStream> stream, void **params,
-                void **extra) const;
+                gpuStream stream, void **params, void **extra) const;
 
     struct Info;
     const Info &info() const;
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 4682af7d0..75dc81c17 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -5,6 +5,7 @@
 #define ARK_EXECUTOR_HPP
 
 #include <cstddef>
+#include <cstdint>
 #include <memory>
 #include <string>
 #include <vector>
@@ -12,15 +13,27 @@
 
 namespace ark {
 
+using Stream = void *;
+
 /// Convenience class for executing a model.
 class Executor {
    public:
    /// Constructor.
-    Executor(int rank, int world_size, int gpu_id, const std::string &name,
+    Executor(int device_id, Stream stream, const std::string &name,
              const std::string &plan);
 
+    /// Destructor.
     ~Executor();
 
+    /// Return the device ID.
+    int device_id() const;
+
+    /// Return the stream of the executor.
+    Stream stream() const;
+
+    /// Return the plan string.
+    std::string plan() const;
+
     /// Compile the model. This must be called before `launch()`.
     void compile();
@@ -39,30 +52,39 @@ class Executor {
     /// again.
     float stop(int64_t max_spin_count = -1);
 
+    /// Barrier for all rank executors.
     void barrier();
 
+    /// Destroy the executor.
     void destroy();
 
+    /// Return whether the executor is destroyed.
     bool destroyed() const;
 
+    /// Return the raw virtual address of the tensor.
+    uintptr_t tensor_address(const Tensor tensor) const;
+
     template <typename T>
-    void tensor_read(const Tensor tensor, std::vector<T> &data) const {
+    void tensor_read(const Tensor tensor, std::vector<T> &data,
+                     Stream stream = nullptr) const {
         tensor_read(tensor, reinterpret_cast<void *>(data.data()),
-                    data.size() * sizeof(T));
+                    data.size() * sizeof(T), stream);
     }
 
     template <typename T>
-    void tensor_write(const Tensor tensor, const std::vector<T> &data) const {
+    void tensor_write(const Tensor tensor, const std::vector<T> &data,
+                      Stream stream = nullptr) const {
         tensor_write(tensor, reinterpret_cast<const void *>(data.data()),
-                     data.size() * sizeof(T));
+                     data.size() * sizeof(T), stream);
     }
 
-    void tensor_read(const Tensor tensor, void *data, size_t bytes) const;
+    void tensor_read(const Tensor tensor, void *data, size_t bytes,
+                     Stream stream = nullptr, bool is_d2d = false) const;
 
-    void tensor_write(const Tensor tensor, const void *data,
-                      size_t bytes) const;
+    void tensor_write(const Tensor tensor, const void *data, size_t bytes,
+                      Stream stream = nullptr, bool is_d2d = false) const;
 
-   private:
+   protected:
     class Impl;
     std::unique_ptr<Impl> impl_;
 };
@@ -71,8 +93,10 @@ class Model;
 
 class DefaultExecutor : public Executor {
    public:
-    DefaultExecutor(const Model &model, int gpu_id = -1,
-                    const std::string &name = "DefaultExecutor");
+    DefaultExecutor(
+        const Model &model, int device_id = -1, Stream stream = nullptr,
+        const std::vector<DefaultPlanner::ConfigRule> &config_rules = {},
+        const std::string &name = "DefaultExecutor");
 };
 
 }  // namespace ark
diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp
index 0057ef0aa..97ce71967 100644
--- a/ark/model/model_json.cpp
+++ b/ark/model/model_json.cpp
@@ -272,7 +272,16 @@ static void verify_format_plan(const Json &json) {
     }
 }
 
-PlanJson::PlanJson(const Json &json) : Json(json) { verify_format_plan(*this); }
+PlanJson::PlanJson(const Json &json)
+    : Json((json != nullptr) ? json
+                             : Json{{"Rank", 0},
+                                    {"WorldSize", 1},
+                                    {"NumProcessors", 1},
+                                    {"NumWarpsPerProcessor", 1},
+                                    {"TaskInfos", Json::array()},
+                                    {"ProcessorGroups", Json::array()}}) {
+    verify_format_plan(*this);
+}
 
 static std::stringstream &dump_pretty_plan(const Json &json,
                                            std::stringstream &ss, int indent,
diff --git a/ark/model/model_json.hpp b/ark/model/model_json.hpp
index cf5fbbce2..e42640a9a 100644
--- a/ark/model/model_json.hpp
+++ b/ark/model/model_json.hpp
@@ -18,7 +18,7 @@ class ModelJson : public Json {
 
 class PlanJson : public Json {
    public:
-    PlanJson(const Json &json);
+    PlanJson(const Json &json = nullptr);
 
     std::string dump_pretty(int indent = 0, int indent_step = 2) const;
 };
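The PlanJson default shown above lets callers construct an empty but well-formed plan when no JSON is supplied. As a sketch of what that fallback object contains, assuming ARK's Json is an alias of nlohmann::json (which the Json::parse/Json::array usage suggests, though the alias itself is not shown in this diff):

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main() {
        // A single-rank, single-processor plan with no tasks; this is the
        // skeleton that trivially passes verify_format_plan().
        nlohmann::json default_plan = {
            {"Rank", 0},
            {"WorldSize", 1},
            {"NumProcessors", 1},
            {"NumWarpsPerProcessor", 1},
            {"TaskInfos", nlohmann::json::array()},
            {"ProcessorGroups", nlohmann::json::array()}};
        std::cout << default_plan.dump(2) << std::endl;
        return 0;
    }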
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index 6cdba5d02..b5a0645c8 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -202,8 +202,11 @@ std::shared_ptr<ModelOp> ModelOp::deserialize(const Json &serialized) {
     } else if (!serialized.contains("Args")) {
         ERR(InvalidUsageError, "ModelOp deserialization failed: missing Args");
     }
+    // Run `ModelOpT::from_name` before `construct()` to ensure all operators
+    // are registered.
+    auto op_type = ModelOpT::from_name(serialized["Type"]);
     auto ret = model_op_factory()->construct(serialized["Type"]);
-    ret->type_ = ModelOpT::from_name(serialized["Type"]);
+    ret->type_ = op_type;
     ret->name_ = serialized["Name"];
     ret->is_virtual_ = serialized["IsVirtual"];
     for (const auto &t : serialized["ReadTensors"]) {
diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp
index 9e2c6f675..030146680 100644
--- a/ark/ops/ops_all_reduce_test.cpp
+++ b/ark/ops/ops_all_reduce_test.cpp
@@ -91,10 +91,9 @@ void test_all_reduce_internal(ark::DimType nelem) {
         std::vector<ark::half_t> ones_vec(ones.shape().nelems(),
                                           ark::half_t(1.0f));
-        auto result =
-            ark::op_test("all_reduce", m, {ones}, {output},
-                         baseline_all_reduce<ark::half_t>,
-                         {ones_vec.data()}, false, gpu_id, NumGpus);
+        auto result = ark::op_test(
+            "all_reduce", m, {ones}, {output},
+            baseline_all_reduce<ark::half_t>, {ones_vec.data()});
         UNITTEST_LOG(result);
         UNITTEST_EQ(result.max_diff[0], 0.0f);
         return ark::unittest::SUCCESS;
diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp
index 2b63642e6..f01de9789 100644
--- a/ark/ops/ops_communication_test.cpp
+++ b/ark/ops/ops_communication_test.cpp
@@ -229,9 +229,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() {
         ark::Tensor tns2 = model.identity(tns2_data, {tns});
         tns2 = model.recv(tns2_data, remote_gpu_id, tag);
 
-        ark::DefaultPlanner planner(model, gpu_id);
-        planner.install_config_rule(config_rule);
-        ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan());
+        ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule});
         exe.compile();
 
         std::vector data(1024);
@@ -275,9 +273,7 @@ ark::unittest::State test_communication_send_recv_bidir_sm() {
 
         ark::Tensor sum = model.add(tns2, tns_data);
 
-        ark::DefaultPlanner planner(model, gpu_id);
-        planner.install_config_rule(config_rule);
-        ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan());
+        ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule});
         exe.compile();
 
         std::vector data(1024);
diff --git a/ark/ops/ops_embedding_test.cpp b/ark/ops/ops_embedding_test.cpp
index 822973106..8cc95abd2 100644
--- a/ark/ops/ops_embedding_test.cpp
+++ b/ark/ops/ops_embedding_test.cpp
@@ -78,9 +78,9 @@ ark::unittest::State test_embedding() {
     } else if (std::is_same<T, ark::bfloat16_t>::value) {
         type_str = "bf16";
     }
-    auto result = ark::op_test("embedding_" + type_str, m, {ti, tw}, {to},
-                               baseline_embedding<T>,
-                               {ti_data.data(), tw_data.data()}, true);
+    auto result =
+        ark::op_test("embedding_" + type_str, m, {ti, tw}, {to},
+                     baseline_embedding<T>, {ti_data.data(), tw_data.data()});
     UNITTEST_LOG(result);
     UNITTEST_EQ(result.max_diff[0], 0.0f);
     return ark::unittest::SUCCESS;
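The tests above now hand config rules to DefaultExecutor instead of building a DefaultPlanner by hand. Assuming a ConfigRule is a callable from (serialized op, architecture name) to a config JSON string (the actual alias lives in ark/planner.hpp and is not shown in this diff), a rule pinning matmul configs might look like the following sketch:

    #include <functional>
    #include <string>

    // Hypothetical alias mirroring what install_config_rule accepts.
    using ConfigRule =
        std::function<std::string(const std::string &op, const std::string &arch)>;

    int main() {
        // Return a config for ops we want to pin; an empty string defers to
        // the planner's defaults for everything else.
        ConfigRule pin_matmul = [](const std::string &op,
                                   const std::string &) -> std::string {
            if (op.find("\"Type\":\"Matmul\"") != std::string::npos) {
                return R"({"NumWarps":4,"SramBytes":24672,)"
                       R"("TileShapeMNK":[128,256,32],"NumTasks":256})";
            }
            return "";
        };
        (void)pin_matmul;  // would be passed as op_test(..., {pin_matmul})
        return 0;
    }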
diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp
index 50317fba7..60ffc9dc2 100644
--- a/ark/ops/ops_test_common.cpp
+++ b/ark/ops/ops_test_common.cpp
@@ -31,13 +31,13 @@ std::ostream &operator<<(std::ostream &os, const OpsTestResult &result) {
     return os;
 }
 
-OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
-                      const std::vector<Tensor> &inputs,
-                      const std::vector<Tensor> &outputs,
-                      OpsTestBaseline baseline,
-                      const std::vector<void *> &inputs_data,
-                      bool print_on_error, int rank, int world_size) {
-    DefaultExecutor exe(model);
+OpsTestResult op_test(
+    const std::string &test_name_prefix, const Model &model,
+    const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
+    OpsTestBaseline baseline, const std::vector<void *> &inputs_data,
+    const std::vector<DefaultPlanner::ConfigRule> &config_rules,
+    bool print_on_error) {
+    DefaultExecutor exe(model, -1, nullptr, config_rules);
     exe.compile();
 
     std::vector>> inputs_data_storages;
@@ -133,7 +133,8 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
     for (auto t : gt) {
         gt_ptrs.push_back(t->data());
     }
-    baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes, rank);
+    baseline(gt_ptrs, output_shapes, inputs_data_refs, input_shapes,
+             model.rank());
 
     std::stringstream test_name;
     test_name << test_name_prefix;
@@ -147,6 +148,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
 
     OpsTestResult result;
     result.test_name = test_name.str();
+    result.plan = exe.plan();
 
     // Compare results with the ground truth.
     for (size_t i = 0; i < outputs.size(); i++) {
@@ -187,7 +189,7 @@ OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
     GLOG(gpuDeviceSynchronize());
 
     // Throughput test.
-    if (world_size > 1) {
+    if (model.world_size() > 1) {
         // For multi-GPU, we need to make sure that all GPUs run the same
         // number of iterations. Rather than doing allgather, we just
         // use a magic number here.
diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp
index 01e97dbb1..c5d640f3b 100644
--- a/ark/ops/ops_test_common.hpp
+++ b/ark/ops/ops_test_common.hpp
@@ -10,6 +10,7 @@
 
 #include "ark/model.hpp"
 #include "ark/model_ref.hpp"
+#include "ark/planner.hpp"
 #include "ark/random.hpp"
 #include "bfloat16.h"
 #include "half.h"
@@ -133,6 +134,7 @@ TensorCompareResult tensor_compare(T *ground_truth, T *res, Dims shape,
 
 struct OpsTestResult {
     std::string test_name;
+    std::string plan;
     int iter;
     float msec_per_iter;
     std::vector<float> mse;
@@ -165,13 +167,12 @@
 using OpsTestBaseline = std::function<void(
    std::vector<void *> &outputs, const std::vector<ark::Dims> &output_shapes,
    const std::vector<void *> &inputs,
    const std::vector<ark::Dims> &input_shapes, int rank)>;
 
-OpsTestResult op_test(const std::string &test_name_prefix, const Model &model,
-                      const std::vector<Tensor> &inputs,
-                      const std::vector<Tensor> &outputs,
-                      OpsTestBaseline baseline,
-                      const std::vector<void *> &inputs_data = {},
-                      bool print_on_error = false, int rank = 0,
-                      int world_size = 1);
+OpsTestResult op_test(
+    const std::string &test_name_prefix, const Model &model,
+    const std::vector<Tensor> &inputs, const std::vector<Tensor> &outputs,
+    OpsTestBaseline baseline, const std::vector<void *> &inputs_data = {},
+    const std::vector<DefaultPlanner::ConfigRule> &config_rules = {},
+    bool print_on_error = false);
 
 OpsTestGpuMem to_gpu(void *host_ptr, size_t size);
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
index 9bb83fb42..855cb824b 100644
--- a/cmake/Utils.cmake
+++ b/cmake/Utils.cmake
@@ -14,7 +14,7 @@ if(GIT_CLANG_FORMAT)
         COMMAND ${GIT_CLANG_FORMAT} --style=file --diff || true
     )
     add_custom_target(cpplint-autofix
-        COMMAND ${GIT_CLANG_FORMAT} --style=file || true
+        COMMAND ${GIT_CLANG_FORMAT} --style=file --force --extensions cc,cpp,h,hpp,cu,in,hip || true
     )
 else()
     message(STATUS "git-clang-format not found.")
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index 7480ce7da..33db1fb5c 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -106,6 +106,7 @@ def launch(
     gpu_id: int = 0,
     plan: str = "",
     plan_path: str = "",
+    stream: int = 0,
 ):
     """
     Create an executor and schedule the ARK model.
The scheduler will generate
@@ -130,9 +131,8 @@ def launch(
         _RuntimeState.executor.destroy()
 
     _RuntimeState.executor = Executor(
-        rank,
-        world_size,
         gpu_id,
+        stream,
         "ArkRuntime",
         plan,
     )
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 316d18566..d69f2aabc 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -48,7 +48,9 @@ def dtype(self) -> DataType:
         """
         return DataType.from_ctype(self._tensor.data_type())
 
-    def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray:
+    def to_numpy(
+        self, ndarray: np.ndarray = None, stream: int = 0
+    ) -> np.ndarray:
         """
         Copy a tensor from device to host. If `ndarray` is None,
         a new numpy array will be created. If the tensor is not allocated,
@@ -68,10 +70,10 @@ def to_numpy(self, ndarray: np.ndarray = None) -> np.ndarray:
             raise ValueError("ndarray dtype does not match the tensor")
         elif ndarray.nbytes != self.nelems() * self.dtype().element_size():
             raise ValueError("ndarray size does not match the tensor")
-        rt.executor.tensor_read(self._tensor, ndarray)
+        rt.executor.tensor_read(self._tensor, ndarray, stream)
         return ndarray
 
-    def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
+    def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor":
         """
         Copies the tensor from a host numpy array to the device.
         """
@@ -86,7 +88,7 @@ def from_numpy(self, ndarray: np.ndarray) -> "Tensor":
         ndarray = np.ascontiguousarray(ndarray)
         if ndarray.nbytes != self.nelems() * self.dtype().element_size():
             raise ValueError("ndarray size does not match the tensor")
-        rt.executor.tensor_write(self._tensor, ndarray)
+        rt.executor.tensor_write(self._tensor, ndarray, stream)
 
         return self
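The executor_py.cpp diff below registers two overloads each for tensor_read and tensor_write, disambiguated with py::overload_cast. A minimal self-contained module showing the same pattern (module and function names here are made up for illustration):

    #include <pybind11/pybind11.h>
    namespace py = pybind11;

    // Two C++ overloads that a single Python name should dispatch to.
    static int area(int w, int h) { return w * h; }
    static int area(int side) { return side * side; }

    PYBIND11_MODULE(demo, m) {
        // py::overload_cast<Args...> selects the overload by argument types,
        // just as done for tensor_read/tensor_write in the diff below.
        m.def("area", py::overload_cast<int, int>(&area), py::arg("w"),
              py::arg("h"));
        m.def("area", py::overload_cast<int>(&area), py::arg("side"));
    }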
.def("plan", &ark::Executor::plan) .def("compile", &ark::Executor::compile) .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1) .def("run", &ark::Executor::run, py::arg("iter")) @@ -38,6 +61,22 @@ void register_executor(py::module &m) { .def("barrier", &ark::Executor::barrier) .def("destroy", &ark::Executor::destroy) .def("destroyed", &ark::Executor::destroyed) - .def("tensor_read", &tensor_read, py::arg("tensor"), py::arg("data")) - .def("tensor_write", &tensor_write, py::arg("tensor"), py::arg("data")); + .def("tensor_read", + py::overload_cast(&tensor_read), + py::arg("tensor"), py::arg("data"), py::arg("stream")) + .def("tensor_read", + py::overload_cast(&tensor_read), + py::arg("tensor"), py::arg("address"), py::arg("bytes"), + py::arg("stream"), py::arg("is_d2d")) + .def("tensor_write", + py::overload_cast(&tensor_write), + py::arg("tensor"), py::arg("data"), py::arg("stream")) + .def("tensor_write", + py::overload_cast(&tensor_write), + py::arg("tensor"), py::arg("address"), py::arg("bytes"), + py::arg("stream"), py::arg("is_d2d")); } From 215469044ae49a4a453f576b2a396a5c96992aec Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 10 Jul 2024 23:53:32 +0000 Subject: [PATCH 030/106] Update lint workflow --- .github/workflows/lint.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 758eaf564..a918dcede 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -13,11 +13,8 @@ jobs: - name: Check out Git repository uses: actions/checkout@v4 - - name: Install ClangFormat - run: sudo apt-get install -y clang-format - - - name: Run clang-format - run: clang-format -style=file -Werror --dry-run `find ark python examples -type f -name *.h -o -name *.hpp -o -name *.c -o -name *.cc -o -name *.cpp -o -name *.cu` + - name: Run git-clang-format + run: git-clang-format --style=file --diff - name: Set up Python uses: actions/setup-python@v4 From 705f9f86d8bf8b70005a03fd875e8cc080c99af1 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 11 Jul 2024 00:02:45 +0000 Subject: [PATCH 031/106] Optimize operators --- ark/include/kernels/common/broadcast.h | 4 +- ark/include/kernels/common/sync.h | 12 ++---- ark/include/kernels/reduce.h | 59 ++++++++++++++++++-------- ark/ops/ops_broadcast.cpp | 3 +- ark/ops/ops_matmul.cpp | 32 +++++++++----- 5 files changed, 69 insertions(+), 41 deletions(-) diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h index 97b12e004..858938613 100644 --- a/ark/include/kernels/common/broadcast.h +++ b/ark/include/kernels/common/broadcast.h @@ -186,9 +186,9 @@ struct Broadcast2Intrinsic { (BroadcastInput0 && BroadcastInput1) ? OutNelemPerThread : BroadcastInput0 - ? math::gcd::value + ? math::gcd::value : BroadcastInput1 - ? math::gcd::value + ? 
diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h
index 97b12e004..858938613 100644
--- a/ark/include/kernels/common/broadcast.h
+++ b/ark/include/kernels/common/broadcast.h
@@ -186,9 +186,9 @@ struct Broadcast2Intrinsic {
         (BroadcastInput0 && BroadcastInput1)
             ? OutNelemPerThread
             : BroadcastInput0
-                  ? math::gcd<In1NelemPerThread, OutNelemPerThread>::value
+                  ? math::gcd<In1NelemPerThread, OutNelemPerThread>::value
                   : BroadcastInput1
-                        ? math::gcd<In0NelemPerThread, OutNelemPerThread>::value
+                        ? math::gcd<In0NelemPerThread, OutNelemPerThread>::value
                         : math::gcd<In0NelemPerThread,
                                     math::gcd<In1NelemPerThread,
                                               OutNelemPerThread>::value>::value;
diff --git a/ark/include/kernels/common/sync.h b/ark/include/kernels/common/sync.h
index 85f7639c9..cf22e357d 100644
--- a/ark/include/kernels/common/sync.h
+++ b/ark/include/kernels/common/sync.h
@@ -106,25 +106,21 @@ DEVICE void sync_warps() {
     static_assert(Arch::ThreadsPerWarp == 64, "");
     if constexpr (NumWarps == 1) {
         __builtin_amdgcn_wave_barrier();
-    } else if constexpr (NumWarps == 16) {
+    } else if constexpr (NumWarps == ARK_WARPS_PER_BLOCK) {
         __syncthreads();
     } else {
         static_assert(ARK_SMEM_RESERVED_BYTES >= sizeof(sync::WarpGroupState),
                       "");
-        int lane_id = threadIdx.x & 63;
-        if (lane_id == 0) {
+        if ((threadIdx.x & 63) == 0) {
             constexpr int MaxOldCnt = NumWarps - 1;
-            int warp_id = threadIdx.x >> 6;
-            int group_id = warp_id / NumWarps;
+            int group_id = (threadIdx.x >> 6) / NumWarps;
             sync::WarpGroupState *state =
                 reinterpret_cast<sync::WarpGroupState *>(_ARK_SMEM);
             unsigned int tmp = state->is_inc_flag[group_id] ^ 1;
             if (atomicInc(&state->cnt[group_id], MaxOldCnt) == MaxOldCnt) {
                 state->flag[group_id] = tmp;
             } else {
-                while (atomicAdd(&state->flag[group_id], 0) != tmp)
-                    __builtin_amdgcn_s_sleep(1);
-                __asm__ __volatile__("s_wakeup");
+                while (atomicAdd(&state->flag[group_id], 0) != tmp);
             }
             state->is_inc_flag[group_id] = tmp;
         }
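The sync_warps change above keeps the flag-flip barrier but now spins without the sleep/wakeup hints. The same "sense-reversing" idea as a CPU analogue with std::atomic (illustrative only; the GPU version keeps the count, flag, and sense in shared memory per warp group):

    #include <atomic>
    #include <cstdio>
    #include <thread>
    #include <vector>

    // Sense-reversing barrier: the last arriving thread flips a shared flag;
    // the others spin until they observe the flip. This mirrors the
    // atomicInc / flag-compare loop in sync_warps above.
    struct SenseBarrier {
        std::atomic<int> count{0};
        std::atomic<unsigned> flag{0};
        int n;
        explicit SenseBarrier(int n) : n(n) {}
        void wait(unsigned &local_sense) {
            local_sense ^= 1u;  // same role as state->is_inc_flag ^ 1
            if (count.fetch_add(1) == n - 1) {
                count.store(0);
                flag.store(local_sense);  // last one in flips the flag
            } else {
                while (flag.load() != local_sense) { /* spin */ }
            }
        }
    };

    int main() {
        SenseBarrier bar(4);
        std::vector<std::thread> ts;
        for (int t = 0; t < 4; ++t) {
            ts.emplace_back([&bar, t] {
                unsigned sense = 0;
                for (int i = 0; i < 3; ++i) {
                    bar.wait(sense);
                    std::printf("thread %d passed barrier %d\n", t, i);
                }
            });
        }
        for (auto &th : ts) th.join();
        return 0;
    }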
diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h
index 30c8b7831..2dd79d2c3 100644
--- a/ark/include/kernels/reduce.h
+++ b/ark/include/kernels/reduce.h
@@ -53,7 +53,7 @@ DEVICE bf16 warpReduce(bf16 val) {
 template <typename UnitOp, int LanesNum, typename DataType>
 DEVICE DataType warpsReduce(DataType val, int tid, int smem_per_warp) {
     val = warpReduce<LanesNum>(val);
-    if (LanesNum > Arch::ThreadsPerWarp) {
+    if constexpr (LanesNum > Arch::ThreadsPerWarp) {
         ReduceSharedStorage<DataType> *shared =
             UnitOp::template shared_memory<ReduceSharedStorage<DataType>>(
                 smem_per_warp);
@@ -351,12 +351,19 @@ struct WwiseReduce {
     /// @param in Input tensor.
     /// @param uop_idx Index of the unit operator.
     template <typename DataType>
-    static DEVICE void runW(DataType *out, DataType *in, int uop_idx,
-                            int smem_per_warp) {
+    static DEVICE void run(DataType *out, DataType *in, int uop_idx,
+                           int smem_per_warp) {
         using ShapeChecker = ReduceShapeChecker;
+        constexpr int InConsecBytes = sizeof(DataType) * InShape::W;
         constexpr int NelemPerThread =
-            DefaultNelemPerThread::value;
+            (InConsecBytes % 16 == 0)
+                ? 16 / sizeof(DataType)
+                : (InConsecBytes % 8 == 0)
+                      ? 8 / sizeof(DataType)
+                      : (InConsecBytes % 4 == 0)
+                            ? 4 / sizeof(DataType)
+                            : (InConsecBytes % 2 == 0) ? 2 / sizeof(DataType)
+                                                       : 1;
 
         constexpr int NonReduceDimLength =
             UnitOutDims::N * UnitOutDims::C * UnitOutDims::H;
@@ -397,22 +404,38 @@ struct WwiseReduce {
                                                 &in[idx_in]);
         }
 
-        DataType finalSum;
-        ReduceType::template identity<1>(&finalSum);
+        static_assert(math::is_pow2<NelemPerThread>::value,
+                      "NelemPerThread must be power of 2");
+        if constexpr (NelemPerThread > 8) {
 #pragma unroll
-        for (int i = 0; i < NelemPerThread; ++i) {
-            ReduceType::template reduce<1>(&finalSum, &finalSum, &reduced[i]);
+            for (int i = 8; i < NelemPerThread; i += 8) {
+                ReduceType::template reduce<8>(&reduced[0], &reduced[0], &reduced[i]);
+            }
+            ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]);
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 8) {
+            ReduceType::template reduce<4>(&reduced[0], &reduced[0], &reduced[4]);
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 4) {
+            ReduceType::template reduce<2>(&reduced[0], &reduced[0], &reduced[2]);
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
+        } else if constexpr (NelemPerThread == 2) {
+            ReduceType::template reduce<1>(&reduced[0], &reduced[0], &reduced[1]);
         }
 
-        UnitOp::sync_threads();
+        if constexpr (InShape::W % ThreadsPerRow != 0) {
+            UnitOp::sync_threads();
+        }
 
         // final reduction on shared memory using warp shuffle.
-        finalSum = warpsReduce<UnitOp, ThreadsPerRow, DataType>(
-            finalSum, tid, smem_per_warp);
+        reduced[0] = warpsReduce<UnitOp, ThreadsPerRow, DataType>(
+            reduced[0], tid, smem_per_warp);
 
         // write the result to output.
        if (tid % ThreadsPerRow == 0) {
-            ReduceType::template postReduce<1>(&out[idx_out], &finalSum,
+            ReduceType::template postReduce<1>(&out[idx_out], &reduced[0],
                                                InShape::W);
        }
 
@@ -450,8 +473,8 @@
 template <typename InDims, typename InShape, typename OutDims,
          typename OutShape, typename UnitOutDims, int NumWarps, int SmemBytes,
          int Axis, typename DataType>
DEVICE void reduce_w_sum(DataType *out, DataType *in, int uop_idx,
                         int smem_per_warp) {
    WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeSum, Axis>::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeSum, Axis>::run(out, in, uop_idx,
+                                                     smem_per_warp);
 }
 
 template <typename InDims, typename InShape, typename OutDims,
          typename OutShape, typename UnitOutDims, int NumWarps, int SmemBytes,
          int Axis, typename DataType>
DEVICE void reduce_w_mean(DataType *out, DataType *in, int uop_idx,
                          int smem_per_warp) {
    WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeMean, Axis>::runW(out, in, uop_idx,
-                                                       smem_per_warp);
+                SmemBytes, ReduceTypeMean, Axis>::run(out, in, uop_idx,
+                                                      smem_per_warp);
 }
 
 template <typename InDims, typename InShape, typename OutDims,
          typename OutShape, typename UnitOutDims, int NumWarps, int SmemBytes,
          int Axis, typename DataType>
DEVICE void reduce_w_max(DataType *out, DataType *in, int uop_idx,
                         int smem_per_warp) {
    WwiseReduce<InDims, InShape, OutDims, OutShape, UnitOutDims, NumWarps,
-                SmemBytes, ReduceTypeMax, Axis>::runW(out, in, uop_idx,
-                                                      smem_per_warp);
+                SmemBytes, ReduceTypeMax, Axis>::run(out, in, uop_idx,
+                                                     smem_per_warp);
 }
 
 }  // namespace ark
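The rewritten reduction above replaces the serial accumulate loop with a halving tree (reduce<8>, then <4>, <2>, <1>), which shortens the per-thread dependency chain. The same shape in scalar, runnable form (a sketch only; ARK's ReduceType folds short vectors in place):

    #include <cstdio>

    // Pairwise tree reduction of a power-of-two sized buffer: each step adds
    // the upper half into the lower half, halving the live width.
    template <int N>
    static float tree_sum(float (&v)[N]) {
        static_assert((N & (N - 1)) == 0, "N must be a power of 2");
        for (int width = N / 2; width > 0; width /= 2) {
            for (int i = 0; i < width; ++i) v[i] += v[i + width];
        }
        return v[0];
    }

    int main() {
        float vals[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        std::printf("%g\n", tree_sum(vals));  // 36
        return 0;
    }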
diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp
index 3985a0500..e5559fc32 100644
--- a/ark/ops/ops_broadcast.cpp
+++ b/ark/ops/ops_broadcast.cpp
@@ -27,8 +27,7 @@ ModelOpBroadcast1::ModelOpBroadcast1(const std::string &type_name,
 std::string ModelOpBroadcast1::impl_name(const Json &config) const {
     check_fields_config(config, {"NumWarps", "Tile"});
     int num_warps = config.at("NumWarps");
-    auto &tile_shape = config.at("Tile");
-    Dims unit_out_dims{tile_shape[0], tile_shape[1]};
+    Dims unit_out_dims(config.at("Tile").get<std::vector<DimType>>());
 
     return function_name_string(
         pascal_to_snake(type()->type_name()),
diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp
index b259f99c8..a24b95d72 100644
--- a/ark/ops/ops_matmul.cpp
+++ b/ark/ops/ops_matmul.cpp
@@ -189,45 +189,55 @@ std::vector<ModelOpArg> ModelOpMatmul::impl_args([
 
 static const Json get_default_config(const ArchRef arch,
-                                     const ModelDataType &data_type) {
+                                     const ModelDataType &data_type,
+                                     const Dims &mnk) {
+    if (data_type != FP32.ref() && data_type != FP16.ref() &&
+        data_type != BF16.ref()) {
+        ERR(InvalidUsageError,
+            "Unsupported data type: ", data_type->type_name());
+    }
+    if (!arch->belongs_to(ARCH_CUDA) && !arch->belongs_to(ARCH_ROCM)) {
+        ERR(InvalidUsageError, "Unsupported architecture: ", arch->name());
+    }
+    DimType tm = (mnk[0] > mnk[1]) ? 256 : 128;
+    DimType tn = (mnk[0] > mnk[1]) ? 128 : 256;
     if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP32.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP16.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 64}}};
+                {"TileShapeMNK", {tm, tn, 64}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == BF16.ref()) {
         return {{"NumWarps", 8},
                 {"SramBytes", 147456},
-                {"TileShapeMNK", {128, 256, 64}}};
+                {"TileShapeMNK", {tm, tn, 64}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP32.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 16}}};
+                {"TileShapeMNK", {tm, tn, 16}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP16.ref()) {
         return {{"NumWarps", 4},
                 {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"TileShapeMNK", {tm, tn, 32}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) {
         return {{"NumWarps", 4},
-                {"SramBytes", 24672},
-                {"TileShapeMNK", {128, 256, 32}}};
+                {"SramBytes", 24624},
+                {"TileShapeMNK", {tm, tn, 32}}};
     }
-    ERR(InvalidUsageError, "Unsupported arch and data type: ", arch->name(),
-        " and ", data_type->type_name());
+    ERR(InternalError, "Unexpected error");
     return {};
 }
 
 Json ModelOpMatmul::default_config(const ArchRef arch) const {
     auto result = result_tensors_[0];
-    Json config = get_default_config(arch, result->data_type());
     check_fields_args(args_, {"TransposeInput", "TransposeOther"});
     Dims mnk = calc_problem_size(read_tensors_[0]->padded_shape(),
                                  read_tensors_[1]->padded_shape(),
                                  args_.at("TransposeInput").value<bool>(),
                                  args_.at("TransposeOther").value<bool>());
+    Json config = get_default_config(arch, result->data_type(), mnk);
     size_t tile_x = config.at("TileShapeMNK")[0];
     size_t tile_y = config.at("TileShapeMNK")[1];
     if (mnk[0] % tile_x != 0 || mnk[1] % tile_y != 0) {

From a3114e45eea5d8c7929915e7ca1b1f9cc6ef1591 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Thu, 11 Jul 2024 00:04:40 +0000
Subject: [PATCH 032/106] fix

---
 ark/error.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ark/error.hpp b/ark/error.hpp
index e08acd975..5ad21824b 100644
--- a/ark/error.hpp
+++ b/ark/error.hpp
@@ -20,6 +20,7 @@ class BaseError : public std::runtime_error {
         _name(const std::string &msg) : BaseError(msg) {}     \
     };
 
+REGISTER_ERROR_TYPE(InternalError)
 REGISTER_ERROR_TYPE(InvalidUsageError)
 REGISTER_ERROR_TYPE(NotFoundError)
 REGISTER_ERROR_TYPE(ModelError)

From 6116424e2a692a3cec2eb749565f1ae03637e5e6 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Thu, 11 Jul 2024 00:28:47 +0000
Subject: [PATCH 033/106] delete an unused file

---
 plan_gpu0.json | 2423 ------------------------------------------------
 1 file changed, 2423 deletions(-)
 delete mode 100644 plan_gpu0.json

diff --git a/plan_gpu0.json b/plan_gpu0.json
deleted file mode 100644
index cad05f774..000000000
--- a/plan_gpu0.json
+++ /dev/null
@@ -1,2423 +0,0 @@
-{
-    "Rank": 0,
-    "WorldSize": 1,
-    "Architecture": "ROCM_942",
-    "NumProcessors": 304,
-    "NumWarpsPerProcessor": 4,
-    "TaskInfos": [
-        {
-            "Id": 0,
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "Ops": [
-                {
-                    "Type": "Matmul",
-                    "Name": "matmul",
-                    "IsVirtual": false,
-                    "ReadTensors": [
{"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":6,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 1, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rope", - "Name": "rope", - "IsVirtual": false, - "ReadTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,1,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 2, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose", - "IsVirtual": false, - "ReadTensors": [ - {"Id":16,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":19,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 3, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - 
{"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 4, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rope", - "Name": "rope_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":5,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":17,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,1,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 5, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":18,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 6, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":10,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 7, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":21,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 8, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":20,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":24,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":25,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 9, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "ScalarMul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":27,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Factor": {"FLOAT":0.0883883461356163} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 10, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMax", - "Name": "reduce_max", - "IsVirtual": false, - "ReadTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":29,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 11, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sub", - "Name": "sub", - "IsVirtual": false, - "ReadTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":30,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 12, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Exp", - "Name": "exp", - "IsVirtual": false, - "ReadTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 13, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceSum", - 
"Name": "reduce_sum", - "IsVirtual": false, - "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":33,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 14, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Div", - "Name": "div", - "IsVirtual": false, - "ReadTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":34,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":32,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 15, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_4", - "IsVirtual": false, - "ReadTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":22,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":36,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 16, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_3", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":37,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":38,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":39,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 - } - } - ] - }, - { - "Id": 17, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_5", - "IsVirtual": false, - "ReadTensors": [ - {"Id":40,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":41,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":42,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 18, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast", - "IsVirtual": false, - "ReadTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":54,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 19, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":56,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 20, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMean", - "Name": "reduce_mean", - "IsVirtual": false, - "ReadTensors": [ - {"Id":57,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":58,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":2}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 2048 - } - } - ] - }, - { - "Id": 21, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rsqrt", - "Name": "rsqrt", - "IsVirtual": false, - "ReadTensors": [ - {"Id":59,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":60,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [64,1], - "NumTasks": 32 - } - } - ] - }, - { - "Id": 22, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":55,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":61,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":62,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 23, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ 
- { - "Type": "Mul", - "Name": "mul_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":50,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 24, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":65,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 25, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_6", - "IsVirtual": false, - "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":43,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":67,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":68,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 26, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rope", - "Name": "rope_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":73,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - 
{"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":76,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 - } - } - ] - }, - { - "Id": 27, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_4", - "IsVirtual": false, - "ReadTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":80,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 - } - } - ] - }, - { - "Id": 28, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_7", - "IsVirtual": false, - "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":44,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":69,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":70,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 29, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rope", - "Name": "rope_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":74,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":53,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ 
- {"Id":78,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 131072 - } - } - ] - }, - { - "Id": 30, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_6", - "IsVirtual": false, - "ReadTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":84,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,3,1]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,8], - "NumTasks": 131072 - } - } - ] - }, - { - "Id": 31, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_8", - "IsVirtual": false, - "ReadTensors": [ - {"Id":66,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":45,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":71,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":72,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 32, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_5", - "IsVirtual": false, - "ReadTensors": [ - {"Id":75,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":82,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 - } - } - ] - }, - { - "Id": 33, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_9", - "IsVirtual": false, - "ReadTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":85,"DataType":"FP16","Shape":[1,32,128,2048],"Strides":[1,32,128,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,128,2048],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":86,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 34, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "ScalarMul", - "Name": "mul_4", - "IsVirtual": false, - "ReadTensors": [ - {"Id":87,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":46,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":88,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Factor": {"FLOAT":0.0883883461356163} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 35, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMax", - "Name": "reduce_max_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":90,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", 
- "SramBytes": 0, - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 36, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sub", - "Name": "sub_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":91,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":48,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 37, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Exp", - "Name": "exp_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":92,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 38, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceSum", - "Name": "reduce_sum_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":94,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 39, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Div", - "Name": "div_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - 
{"Id":95,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":49,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":93,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 40, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_10", - "IsVirtual": false, - "ReadTensors": [ - {"Id":96,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":47,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":83,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":97,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 41, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Transpose", - "Name": "transpose_7", - "IsVirtual": false, - "ReadTensors": [ - {"Id":98,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":50,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":99,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":100,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Permutation": {"DIMS":[0,2,1,3]} - }, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [8,128], - "NumTasks": 8192 - } - } - ] - }, - { - "Id": 42, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_11", - "IsVirtual": false, - "ReadTensors": [ - {"Id":101,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":51,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - 
{"Id":46,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":102,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 43, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Add", - "Name": "add", - "IsVirtual": false, - "ReadTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":103,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":52,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":104,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 44, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":106,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 45, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_5", - "IsVirtual": false, - "ReadTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":108,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 46, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMean", - "Name": "reduce_mean_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":109,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":55,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":110,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":2}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 2048 - } - } - ] - }, - { - "Id": 47, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rsqrt", - "Name": "rsqrt_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":111,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":56,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":112,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [64,1], - "NumTasks": 32 - } - } - ] - }, - { - "Id": 48, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_6", - "IsVirtual": false, - "ReadTensors": [ - {"Id":107,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":54,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":113,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":57,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":114,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 49, - "NumWarps": 4, - 
"SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_7", - "IsVirtual": false, - "ReadTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":51,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":115,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 50, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":116,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":58,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":117,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 51, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_12", - "IsVirtual": false, - "ReadTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":47,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":119,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 52, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - 
], - "WriteTensors": [ - {"Id":121,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 53, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_8", - "IsVirtual": false, - "ReadTensors": [ - {"Id":120,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":60,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":122,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":61,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":123,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 54, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_13", - "IsVirtual": false, - "ReadTensors": [ - {"Id":118,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":59,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":49,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":125,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 55, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_9", - "IsVirtual": false, - "ReadTensors": [ - {"Id":124,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":62,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":126,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":63,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":127,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 56, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_14", - "IsVirtual": false, - "ReadTensors": [ - {"Id":128,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":64,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":48,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":129,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 57, - "NumWarps": 4, - "SramBytes": 0, - "Ops": [ - { - "Type": "Add", - "Name": "add_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":105,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":53,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":130,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":65,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":131,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":132,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":66,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 256 - } - } - ] - } - ], - "ProcessorGroups": [ - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,86], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":1,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":2,"TaskRange":[0,256],"Granularity":1} - ] - }, - { - "ProcessorRange": [86,172], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":4,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":5,"TaskRange":[0,256],"Granularity":1} - ] - }, - { - "ProcessorRange": [172,258], - "WarpRange": [0,4], - 
"SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,256],"Granularity":1}, - {"TaskId":7,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":8,"TaskRange":[0,4096],"Granularity":1}, - {"TaskId":9,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":10,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":11,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":12,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":13,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":14,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":15,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":16,"TaskRange":[0,8192],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":18,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":19,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":20,"TaskRange":[0,2048],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,32], - "ResourceGroups": [ - { - "ProcessorRange": [0,32], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":21,"TaskRange":[0,32],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":22,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":23,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - 
"ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":24,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":25,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":26,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":27,"TaskRange":[0,8192],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":28,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":29,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":30,"TaskRange":[0,131072],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":31,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":32,"TaskRange":[0,8192],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":33,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":34,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":35,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":36,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":37,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":38,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":39,"TaskRange":[0,4096],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - 
"ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":40,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":41,"TaskRange":[0,8192],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":42,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":43,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":44,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":45,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":46,"TaskRange":[0,2048],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,32], - "ResourceGroups": [ - { - "ProcessorRange": [0,32], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":47,"TaskRange":[0,32],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":48,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":49,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":50,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":51,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":52,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":53,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":54,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":55,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - 
"ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":56,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,4], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":57,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - } - ] -} From 67e3b2601f00997d6debe8f9dd3e7c633ceee08b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 11 Jul 2024 01:44:53 +0000 Subject: [PATCH 034/106] update test --- ark/ops/ops_scalar_test.cpp | 43 +++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/ark/ops/ops_scalar_test.cpp b/ark/ops/ops_scalar_test.cpp index 9e9e635b8..6ae0022f0 100644 --- a/ark/ops/ops_scalar_test.cpp +++ b/ark/ops/ops_scalar_test.cpp @@ -263,31 +263,28 @@ ark::unittest::State test_scalar_mul_fp16_offset() { { ark::Model m; ark::Tensor buf = m.tensor({1024}, ark::FP16); - ark::Tensor tns = m.refer(buf, {2}, {1024}, {3}); - ark::Tensor out = m.mul(tns, 2, tns); - - ark::DefaultExecutor exe(m); - exe.compile(); + ark::Tensor tns = m.refer(buf, {2}, {1024}, {6}); + ark::Tensor doubled = m.mul(tns, 2, tns); + ark::Tensor out = m.identity(buf, {doubled}); std::vector data(1024, ark::half_t(2)); - exe.tensor_write(buf, data); - - exe.launch(); - exe.run(1); - exe.stop(); - - data.clear(); - data.resize(1024); - - exe.tensor_read(buf, data); - - for (size_t i = 0; i < data.size(); ++i) { - if (i == 3 || i == 4) { - UNITTEST_EQ(data[i], 4); - } else { - UNITTEST_EQ(data[i], 2); - } - } + auto result = ark::op_test( + "scalar_mul_fp16_offset", m, {buf}, {out}, + [](std::vector &outputs, const std::vector &, + const std::vector &, const std::vector &, + int) { + ark::half_t *out = static_cast(outputs[0]); + for (size_t i = 0; i < 1024; ++i) { + if (i == 6 || i == 7) { + out[i] = 4; + } else { + out[i] = 2; + } + } + }, + {data.data()}); + UNITTEST_LOG(result); + UNITTEST_EQ(result.max_diff[0], 0.0f); } return ark::unittest::SUCCESS; } From e1f178bd3c7bbb0023e1ffc3eceee72564116d10 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 12 Jul 2024 04:37:51 +0000 Subject: [PATCH 035/106] fix merge & updates --- ark/api/executor.cpp | 3 +-- python/ark/runtime.py | 8 ++++---- python/ark/tensor.py | 17 ++++++++++------- python/executor_py.cpp | 2 +- python/unittest/unittest_common.py | 22 ++++++++++++++++++++++ 5 files changed, 38 insertions(+), 14 deletions(-) create mode 100644 python/unittest/unittest_common.py diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 1af298e89..ad6cb8550 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -233,7 +233,6 @@ void Executor::Impl::init(const PlanJson &plan_json) { if (world_size_ > 1) { init_communicator(); } -} auto gpu_manager = GpuManager::get_instance(device_id_); @@ -384,7 +383,7 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { continue; } if (buf_info->buffer->is_external()) { - if (buf_info->buffer->device_id() != gpu_id_) { + if (buf_info->buffer->device_id() != device_id_) { ERR(InvalidUsageError, "PyTorch tensor and model execution are on different GPUs"); } diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 93acb6bf8..1e56fe1ca 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -142,7 +142,7 @@ def launch( initialized. The executor will compile the cuda kernels and launch the ARK runtime. 
""" if self.launched(): - logging.warn( + logging.warning( f"Runtime {self.runtime_id} is already launched, skip launching" ) return @@ -153,7 +153,7 @@ def launch( if self.state == Runtime.State.Init: if self.executor is not None: if not self.executor.destroyed(): - logging.warn( + logging.warning( f"Runtime {self.runtime_id}, has already been launched. Destroying the old executor" ) self.executor.destroy() @@ -184,7 +184,7 @@ def wait(self): Wait for the kernel to finish. """ if self.state != Runtime.State.Running: - logging.warn( + logging.warning( f"ARK runtime {self.runtime_id} is not running, skip waiting" ) return @@ -197,7 +197,7 @@ def stop(self) -> float: Once this is called, we need to call `launch()` again to run the model again. """ if not self.launched(): - logging.warn( + logging.warning( f"ARK runtime {self.runtime_id} is never launched, skip stopping" ) return diff --git a/python/ark/tensor.py b/python/ark/tensor.py index e377cf852..335020769 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -103,7 +103,7 @@ def to_numpy( return ndarray def to_torch( - self, tensor: torch.Tensor = None, runtime_id: int = -1 + self, tensor: torch.Tensor = None, stream: int = 0 ) -> torch.Tensor: """ """ if _no_torch: @@ -116,21 +116,24 @@ def to_torch( ) torch_type = self.dtype().to_torch() if tensor is None: - dev_name = f"cuda:{rt.executor.gpu_id()}" + dev_name = f"cuda:{rt.executor.device_id()}" tensor = torch.zeros( self.shape(), dtype=torch_type, device=torch.device(dev_name) ) - elif tensor.shape != self.shape(): - raise ValueError("torch tensor shape does not match the tensor") + elif list(tensor.shape) != self.shape(): + raise ValueError(f"torch tensor shape {list(tensor.shape)} " + f"does not match the tensor {self.shape()}") elif tensor.dtype != torch_type: - raise ValueError("torch tensor dtype does not match the tensor") + raise ValueError(f"torch tensor dtype {tensor.dtype} " + f"does not match the tensor {torch_type}") elif not tensor.is_contiguous(): raise ValueError("torch tensor is not contiguous in memory") elif tensor.numel() != self.nelems(): - raise ValueError("torch tensor size does not match the tensor") + raise ValueError(f"torch tensor size {tensor.numel()} " + f"does not match the tensor {self.nelems()}") tensor_bytes = self.nelems() * self.dtype().element_size() rt.executor.tensor_read( - self._tensor, tensor.data_ptr(), tensor_bytes, True + self._tensor, tensor.data_ptr(), tensor_bytes, stream, True ) return tensor diff --git a/python/executor_py.cpp b/python/executor_py.cpp index fffbb2c30..8455fa585 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -93,7 +93,7 @@ static DLManagedTensor *to_dlpack(ark::Executor &exe, tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0]; dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes(); dl_tensor.device.device_type = get_device_type(); - dl_tensor.device.device_id = static_cast(exe.gpu_id()); + dl_tensor.device.device_id = static_cast(exe.device_id()); dl_tensor.ndim = static_cast(tensor.shape().ndims()); dl_tensor.dtype = get_dl_dtype(tensor.data_type()); diff --git a/python/unittest/unittest_common.py b/python/unittest/unittest_common.py new file mode 100644 index 000000000..9548410b5 --- /dev/null +++ b/python/unittest/unittest_common.py @@ -0,0 +1,22 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import pytest +import ark + + +def pytest_ark(need_torch: bool = False): + """ + Decorator for ARK unit tests. 
+
+    """
+    def decorator(test_func):
+        if need_torch:
+            try:
+                import torch
+            except ImportError:
+                return pytest.mark.skip(reason="torch is not installed")(test_func)
+        def wrapper(*args, **kwargs):
+            ark.init()
+            test_func(*args, **kwargs)
+        return wrapper
+    return decorator

From ce1959ecb5fb064b4e653b3cad7cf3dcba63a9d7 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Fri, 12 Jul 2024 06:49:30 +0000
Subject: [PATCH 036/106] Add `loop_mode` argument

---
 ark/api/executor.cpp                       | 116 ++++++++++++++-------
 ark/api/planner.cpp                        |   2 +-
 ark/codegen.cpp                            |   2 +-
 ark/gpu/{gpu.h => gpu.hpp}                 |   7 +-
 ark/gpu/gpu_compile.cpp                    |   4 +-
 ark/gpu/{gpu_compile.h => gpu_compile.hpp} |   6 +-
 ark/gpu/gpu_event.cpp                      |   6 +-
 ark/gpu/{gpu_event.h => gpu_event.hpp}     |   8 +-
 ark/gpu/gpu_kernel.cpp                     |  33 ++----
 ark/gpu/{gpu_kernel.h => gpu_kernel.hpp}   |  19 ++--
 ark/gpu/gpu_kernel_test.cpp                |   8 +-
 ark/gpu/{gpu_logging.h => gpu_logging.hpp} |   8 +-
 ark/gpu/gpu_manager.cpp                    |   4 +-
 ark/gpu/{gpu_manager.h => gpu_manager.hpp} |  14 +--
 ark/gpu/gpu_memory.cpp                     |   8 +-
 ark/gpu/{gpu_memory.h => gpu_memory.hpp}   |  10 +-
 ark/gpu/gpu_stream.cpp                     |   6 +-
 ark/gpu/{gpu_stream.h => gpu_stream.hpp}   |   8 +-
 ark/include/ark/executor.hpp               |   4 +-
 ark/include/kernels/kernel_template.in     |  17 ++-
 ark/ops/ops_matmul_test.cpp                |   2 +-
 ark/ops/ops_test_common.cpp                |   2 +-
 python/ark/runtime.py                      |   4 +-
 python/executor_py.cpp                     |   8 +-
 24 files changed, 173 insertions(+), 133 deletions(-)
 rename ark/gpu/{gpu.h => gpu.hpp} (98%)
 rename ark/gpu/{gpu_compile.h => gpu_compile.hpp} (78%)
 rename ark/gpu/{gpu_event.h => gpu_event.hpp} (84%)
 rename ark/gpu/{gpu_kernel.h => gpu_kernel.hpp} (68%)
 rename ark/gpu/{gpu_logging.h => gpu_logging.hpp} (92%)
 rename ark/gpu/{gpu_manager.h => gpu_manager.hpp} (88%)
 rename ark/gpu/{gpu_memory.h => gpu_memory.hpp} (87%)
 rename ark/gpu/{gpu_stream.h => gpu_stream.hpp} (79%)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 2f50a4280..91c8e39de 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -14,11 +14,11 @@
 #include "codegen.hpp"
 #include "env.h"
 #include "file_io.h"
-#include "gpu/gpu.h"
-#include "gpu/gpu_event.h"
-#include "gpu/gpu_kernel.h"
-#include "gpu/gpu_logging.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu.hpp"
+#include "gpu/gpu_event.hpp"
+#include "gpu/gpu_kernel.hpp"
+#include "gpu/gpu_logging.hpp"
+#include "gpu/gpu_manager.hpp"
 #include "logging.h"
 #include "model/model_buffer.hpp"
 #include "model/model_data_type.hpp"
@@ -140,7 +140,7 @@ static size_t tensor_stride_bytes(const Json &tensor) {

 class Executor::Impl {
    public:
-    Impl(int device_id, Stream stream, const std::string &name);
+    Impl(int device_id, Stream stream, const std::string &name, bool loop_mode);
     ~Impl() = default;

     void init(const PlanJson &plan);
@@ -174,6 +174,8 @@
    protected:
     int device_id_;
     std::string name_;
+    bool loop_mode_;
+
     gpuStream stream_raw_;

     int rank_;
@@ -203,8 +205,9 @@
         rank_to_sm_channels_;
 };

-Executor::Impl::Impl(int device_id, Stream stream, const std::string &name)
-    : device_id_(device_id), name_(name) {
+Executor::Impl::Impl(int device_id, Stream stream, const std::string &name,
+                     bool loop_mode)
+    : device_id_(device_id), name_(name), loop_mode_(loop_mode) {
     if (device_id < 0) {
         ERR(InvalidUsageError, "Invalid device ID ", device_id);
     }
@@ -251,7 +254,6 @@ void Executor::Impl::init(const PlanJson &plan_json) {
     int threads_per_block = static_cast<int>(
         codegen_->num_warps_per_proc() * gpu_manager->info().threads_per_warp);
     int num_sm =
        static_cast<int>(codegen_->num_procs());
-    int *flag = flag_->ref();
     size_t smem_block_total =
         static_cast<size_t>(gpu_manager->info().smem_block_total);

     if (!remote_ranks.empty()) {
         init_channels(remote_ranks);
     }

+    std::string kernel_name;
+    if (loop_mode_) {
+        kernel_name = "ark_loop_kernel";
+    } else {
+        kernel_name = "ark_kernel";
+    }
+    if (!name_.empty()) {
+        kernel_name += "_" + name_;
+    }
+
     kernel_ = std::shared_ptr<GpuKernel>(new GpuKernel(
         device_id_, codegen_->code(), {threads_per_block, 1, 1}, {num_sm, 1, 1},
-        std::max(smem_block_total, size_t(4)), name_,
-        {std::pair{buffer_->ref(), sizeof(buffer_->ref())},
-         std::pair{flag, sizeof(flag)}}));
+        std::max(smem_block_total, size_t(4)), kernel_name));
 }

 void Executor::Impl::init_communicator() {
@@ -669,51 +679,76 @@ void Executor::Impl::launch(int64_t max_spin_count) {
         proxy_service_->startProxy();
     }

-    // Initialize loop flags.
-    atomicStoreRelaxed(flag_->ref(), 0);
-    kernel_->launch(stream_raw_);
-    timer_end_->record(stream_raw_);
+    if (loop_mode_) {
+        // Initialize loop flags.
+        atomicStoreRelaxed(flag_->ref(), 0);
+        void *buf_ptr = buffer_->ref();
+        void *flag_ptr = flag_->ref();
+        std::vector<void *> args = {&buf_ptr, &flag_ptr};
+        kernel_->launch(stream_raw_, args);
+    }
     is_recording_ = true;
     is_launched_ = true;
 }

 void Executor::Impl::run(int iter) {
-    if (iter > 0) {
+    if (iter <= 0) return;
+    if (loop_mode_) {
         while (atomicLoadRelaxed(flag_->ref()) > 0) {
         }
         atomicStoreRelaxed(flag_->ref(), iter);
+    } else {
+        void *buf_ptr = buffer_->ref();
+        int i = 0;
+        std::vector<void *> args = {&buf_ptr, reinterpret_cast<void *>(&i)};
+        for (; i < iter; i++) {
+            kernel_->launch(stream_raw_, args);
+        }
     }
 }

 void Executor::Impl::wait(int64_t max_spin_count) {
     int64_t cnt = max_spin_count;
-    while (atomicLoadRelaxed(flag_->ref()) > 0) {
-        if (cnt-- > 0) {
-            continue;
-        }
-        // Check if the kernel encountered an error.
-        gpuError res = gpuStreamQuery(stream_raw_);
-        if (res == gpuSuccess) {
-            if (atomicLoadRelaxed(flag_->ref()) > 0) {
-                LOG(WARN, "Stream is finished but the loop flag is still set.");
-                break;
+    if (loop_mode_) {
+        while (atomicLoadRelaxed(flag_->ref()) > 0) {
+            if (cnt-- > 0) {
+                continue;
+            }
+            // Check if the kernel encountered an error.
+            gpuError res = gpuStreamQuery(stream_raw_);
+            if (res == gpuSuccess) {
+                if (atomicLoadRelaxed(flag_->ref()) > 0) {
+                    LOG(WARN,
+                        "Stream is finished but the loop flag is still set.");
+                    break;
+                } else {
+                    LOG(WARN,
+                        "wait() is delayed by a stream query. Regarding "
+                        "timing measurements may be inaccurate.");
+                    break;
+                }
+            } else if (res == gpuErrorNotReady) {
+                cnt = max_spin_count;
             } else {
-                LOG(WARN,
-                    "wait() is delayed by a stream query. Regarding "
Regarding " - "timing measurements may be inaccurate."); - break; + GLOG(res); } - } else if (res == gpuErrorNotReady) { - cnt = max_spin_count; - } else { - GLOG(res); } + } else { + if (max_spin_count >= 0) { + LOG(WARN, "max_spin_count is ignored in non-loop mode."); + } + GLOG(gpuStreamSynchronize(stream_raw_)); } } float Executor::Impl::stop(int64_t max_spin_count) { this->wait(max_spin_count); - atomicStoreRelaxed(flag_->ref(), -1); + if (is_recording_) { + timer_end_->record(stream_raw_); + } + if (loop_mode_) { + atomicStoreRelaxed(flag_->ref(), -1); + } GLOG(gpuStreamSynchronize(stream_raw_)); if (is_recording_) { elapsed_msec_ = timer_end_->elapsed_msec(*timer_begin_); @@ -847,8 +882,9 @@ void Executor::Impl::tensor_write(const Tensor tensor, const void *data, } Executor::Executor(int device_id, Stream stream, const std::string &name, - const std::string &plan) - : impl_(std::make_unique(device_id, stream, name)) { + const std::string &plan, bool loop_mode) + : impl_(std::make_unique(device_id, stream, name, + loop_mode)) { auto &plan_path = get_env().enforce_plan_path; if (!plan_path.empty()) { LOG(INFO, "Enforce executor plan path: ", plan_path); @@ -901,10 +937,10 @@ void Executor::tensor_write(const Tensor tensor, const void *data, size_t bytes, DefaultExecutor::DefaultExecutor( const Model &model, int device_id, Stream stream, const std::vector &config_rules, - const std::string &name) + const std::string &name, bool loop_mode) : Executor((device_id < 0) ? (model.rank() % get_env().num_ranks_per_host) : device_id, - stream, name, "") { + stream, name, "", loop_mode) { DefaultPlanner planner(model, impl_->device_id()); for (const auto &rule : config_rules) { planner.install_config_rule(rule); diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index 5c9d09f2e..d7fdbf807 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -6,7 +6,7 @@ #include "ark/model.hpp" #include "env.h" #include "file_io.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu_manager.hpp" #include "model/model_json.hpp" #include "model/model_node.hpp" #include "model/model_op.hpp" diff --git a/ark/codegen.cpp b/ark/codegen.cpp index cd6206284..02a5d9ad9 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -174,7 +174,7 @@ CodeGenerator::Impl::Impl(const PlanJson &plan, {"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)}, {"@DEFINITIONS@", definitions_ss.str()}, {"@BODY@", body_ss.str()}, - {"@NAME@", name_}, + {"@NAME@", (name_.empty() ? "" : "_" + name_)}, }; code_ = replace(template_code, replacements); } diff --git a/ark/gpu/gpu.h b/ark/gpu/gpu.hpp similarity index 98% rename from ark/gpu/gpu.h rename to ark/gpu/gpu.hpp index 2f1eba3ba..531d6c7ee 100644 --- a/ark/gpu/gpu.h +++ b/ark/gpu/gpu.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_GPU_H_ -#define ARK_GPU_H_ +#ifndef ARK_GPU_HPP_ +#define ARK_GPU_HPP_ #include @@ -125,6 +125,7 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops, // runtime API ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString, hipGetErrorString); +ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError); ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute, hipDeviceGetAttribute); ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize, @@ -183,4 +184,4 @@ ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerSetAttribute, cuPointerSetAttribute, } // namespace ark -#endif // ARK_GPU_H_ +#endif // ARK_GPU_HPP_ diff --git a/ark/gpu/gpu_compile.cpp b/ark/gpu/gpu_compile.cpp index b1c078af4..11e172f07 100644 --- a/ark/gpu/gpu_compile.cpp +++ b/ark/gpu/gpu_compile.cpp @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_compile.h" +#include "gpu/gpu_compile.hpp" #include #include @@ -22,7 +22,7 @@ #include "cpu_timer.h" #include "env.h" #include "file_io.h" -#include "gpu/gpu_logging.h" +#include "gpu/gpu_logging.hpp" #include "utils/utils_string.hpp" #define ARK_DEBUG_KERNEL 0 diff --git a/ark/gpu/gpu_compile.h b/ark/gpu/gpu_compile.hpp similarity index 78% rename from ark/gpu/gpu_compile.h rename to ark/gpu/gpu_compile.hpp index 58048e78c..8b9e1a9fd 100644 --- a/ark/gpu/gpu_compile.h +++ b/ark/gpu/gpu_compile.hpp @@ -1,8 +1,8 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_COMPILE_H_ -#define ARK_GPU_COMPILE_H_ +#ifndef ARK_GPU_COMPILE_HPP_ +#define ARK_GPU_COMPILE_HPP_ #include #include @@ -16,4 +16,4 @@ const std::string gpu_compile(const std::vector &codes, } // namespace ark -#endif // ARK_GPU_COMPILE_H_ +#endif // ARK_GPU_COMPILE_HPP_ diff --git a/ark/gpu/gpu_event.cpp b/ark/gpu/gpu_event.cpp index cbc45d9a6..06779b91a 100644 --- a/ark/gpu/gpu_event.cpp +++ b/ark/gpu/gpu_event.cpp @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_event.h" +#include "gpu/gpu_event.hpp" -#include "gpu/gpu_logging.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu_logging.hpp" +#include "gpu/gpu_manager.hpp" namespace ark { class GpuEvent::Impl { diff --git a/ark/gpu/gpu_event.h b/ark/gpu/gpu_event.hpp similarity index 84% rename from ark/gpu/gpu_event.h rename to ark/gpu/gpu_event.hpp index 081f0203b..bd2a7c952 100644 --- a/ark/gpu/gpu_event.h +++ b/ark/gpu/gpu_event.hpp @@ -1,12 +1,12 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_EVENT_H_ -#define ARK_GPU_EVENT_H_ +#ifndef ARK_GPU_EVENT_HPP_ +#define ARK_GPU_EVENT_HPP_ #include -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" namespace ark { @@ -33,4 +33,4 @@ class GpuEvent { }; } // namespace ark -#endif // ARK_GPU_EVENT_H_ +#endif // ARK_GPU_EVENT_HPP_ diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp index 46f467f51..d4412f80e 100644 --- a/ark/gpu/gpu_kernel.cpp +++ b/ark/gpu/gpu_kernel.cpp @@ -1,50 +1,38 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#include "gpu_kernel.h" +#include "gpu_kernel.hpp" #include #include -#include "gpu.h" -#include "gpu_compile.h" -#include "gpu_logging.h" -#include "gpu_manager.h" +#include "gpu.hpp" +#include "gpu_compile.hpp" +#include "gpu_logging.hpp" +#include "gpu_manager.hpp" namespace ark { GpuKernel::GpuKernel(int gpu_id, const std::string& code, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name, - std::initializer_list> args) { - this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name, - args); + const std::string& kernel_name) { + this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name); } void GpuKernel::init(int gpu_id, const std::string& code, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name, - std::initializer_list> args) { + const std::string& kernel_name) { gpu_manager_ = GpuManager::get_instance(gpu_id); code_ = code; block_dim_ = block_dim; grid_dim_ = grid_dim; smem_bytes_ = smem_bytes; kernel_name_ = kernel_name; - params_ptr_.resize(args.size()); - args_.resize(args.size()); if (kernel_name_.size() == 0) { ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_); } - size_t idx = 0; - for (auto& pair : args) { - args_[idx].reset(new uint8_t[pair.second]); - std::memcpy(args_[idx].get(), &(pair.first), pair.second); - params_ptr_[idx] = static_cast(args_[idx].get()); - idx++; - } } void GpuKernel::compile() { @@ -68,12 +56,13 @@ void GpuKernel::compile() { dynamic_smem_size_bytes)); } -void GpuKernel::launch(gpuStream stream) { +void GpuKernel::launch(gpuStream stream, std::vector& args) { if (!this->is_compiled()) { ERR(InvalidUsageError, "Kernel is not compiled yet."); } gpu_manager_->launch(function_, grid_dim_, block_dim_, smem_bytes_, stream, - params_ptr_.data(), nullptr); + args.data(), nullptr); + GLOG(gpuGetLastError()); } gpuDeviceptr GpuKernel::get_global(const std::string& name, diff --git a/ark/gpu/gpu_kernel.h b/ark/gpu/gpu_kernel.hpp similarity index 68% rename from ark/gpu/gpu_kernel.h rename to ark/gpu/gpu_kernel.hpp index b3be79071..5308cfead 100644 --- a/ark/gpu/gpu_kernel.h +++ b/ark/gpu/gpu_kernel.hpp @@ -1,13 +1,14 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_GPU_KERNEL_H_ -#define ARK_GPU_KERNEL_H_ +#ifndef ARK_GPU_KERNEL_HPP_ +#define ARK_GPU_KERNEL_HPP_ #include #include +#include -#include "gpu_stream.h" +#include "gpu_stream.hpp" namespace ark { @@ -18,16 +19,14 @@ class GpuKernel { GpuKernel(int gpu_id, const std::string& codes, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name, - std::initializer_list> args = {}); + const std::string& kernel_name); void init(int gpu_id, const std::string& codes, const std::array& block_dim, const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name, - std::initializer_list> args = {}); + const std::string& kernel_name); void compile(); - void launch(gpuStream stream); + void launch(gpuStream stream, std::vector& args); gpuDeviceptr get_global(const std::string& name, bool ignore_not_found = false) const; @@ -43,10 +42,8 @@ class GpuKernel { std::string bin_; gpuModule module_; gpuFunction function_ = nullptr; - std::vector params_ptr_; - std::vector> args_; }; } // namespace ark -#endif // ARK_GPU_KERNEL_H_ +#endif // ARK_GPU_KERNEL_HPP_ diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp index 870ad7ab9..342ef9656 100644 --- a/ark/gpu/gpu_kernel_test.cpp +++ b/ark/gpu/gpu_kernel_test.cpp @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_kernel.h" +#include "gpu/gpu_kernel.hpp" #include "unittest/unittest_utils.h" @@ -9,7 +9,13 @@ const std::string void_kernel = "extern \"C\" __global__ void kernel() {}"; ark::unittest::State test_gpu_kernel() { ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel"); + UNITTEST_TRUE(!kernel.is_compiled()); kernel.compile(); + UNITTEST_TRUE(kernel.is_compiled()); + std::vector args; + for (int i = 0; i < 10; i++) { + kernel.launch(nullptr, args); + } return ark::unittest::SUCCESS; } diff --git a/ark/gpu/gpu_logging.h b/ark/gpu/gpu_logging.hpp similarity index 92% rename from ark/gpu/gpu_logging.h rename to ark/gpu/gpu_logging.hpp index b14435b8b..5e35cc003 100644 --- a/ark/gpu/gpu_logging.h +++ b/ark/gpu/gpu_logging.hpp @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_LOGGING_H_ -#define ARK_GPU_LOGGING_H_ +#ifndef ARK_GPU_LOGGING_HPP_ +#define ARK_GPU_LOGGING_HPP_ -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" #include "logging.h" #define GLOG(cmd) \ @@ -29,4 +29,4 @@ } \ } while (0) -#endif // ARK_GPU_LOGGING_H_ +#endif // ARK_GPU_LOGGING_HPP_ diff --git a/ark/gpu/gpu_manager.cpp b/ark/gpu/gpu_manager.cpp index fc841fa32..572932e35 100644 --- a/ark/gpu/gpu_manager.cpp +++ b/ark/gpu/gpu_manager.cpp @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_manager.h" +#include "gpu/gpu_manager.hpp" #include -#include "gpu/gpu_logging.h" +#include "gpu/gpu_logging.hpp" #include "utils/utils_string.hpp" namespace ark { diff --git a/ark/gpu/gpu_manager.h b/ark/gpu/gpu_manager.hpp similarity index 88% rename from ark/gpu/gpu_manager.h rename to ark/gpu/gpu_manager.hpp index 93a48cf7b..eeeda4d94 100644 --- a/ark/gpu/gpu_manager.h +++ b/ark/gpu/gpu_manager.hpp @@ -1,16 +1,16 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
-#ifndef ARK_GPU_MANAGER_H_ -#define ARK_GPU_MANAGER_H_ +#ifndef ARK_GPU_MANAGER_HPP_ +#define ARK_GPU_MANAGER_HPP_ #include #include "arch.hpp" -#include "gpu/gpu.h" -#include "gpu/gpu_event.h" -#include "gpu/gpu_memory.h" -#include "gpu/gpu_stream.h" +#include "gpu/gpu.hpp" +#include "gpu/gpu_event.hpp" +#include "gpu/gpu_memory.hpp" +#include "gpu/gpu_stream.hpp" namespace ark { @@ -62,4 +62,4 @@ class GpuManager { } // namespace ark -#endif // ARK_GPU_MANAGER_H_ +#endif // ARK_GPU_MANAGER_HPP_ diff --git a/ark/gpu/gpu_memory.cpp b/ark/gpu/gpu_memory.cpp index 184db457c..9a854f521 100644 --- a/ark/gpu/gpu_memory.cpp +++ b/ark/gpu/gpu_memory.cpp @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_memory.h" +#include "gpu/gpu_memory.hpp" -#include "gpu/gpu.h" -#include "gpu/gpu_logging.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu.hpp" +#include "gpu/gpu_logging.hpp" +#include "gpu/gpu_manager.hpp" namespace ark { diff --git a/ark/gpu/gpu_memory.h b/ark/gpu/gpu_memory.hpp similarity index 87% rename from ark/gpu/gpu_memory.h rename to ark/gpu/gpu_memory.hpp index cd7a6f04f..6b277d40b 100644 --- a/ark/gpu/gpu_memory.h +++ b/ark/gpu/gpu_memory.hpp @@ -1,13 +1,13 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_MEMORY_H_ -#define ARK_GPU_MEMORY_H_ +#ifndef ARK_GPU_MEMORY_HPP_ +#define ARK_GPU_MEMORY_HPP_ #include #include -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" namespace ark { @@ -40,7 +40,7 @@ class GpuHostMemory { GpuHostMemory(const GpuHostMemory&) = delete; GpuHostMemory& operator=(const GpuHostMemory&) = delete; - template + template T* ref() const { return reinterpret_cast(ptr_); } @@ -54,4 +54,4 @@ class GpuHostMemory { } // namespace ark -#endif // ARK_GPU_MEMORY_H_ +#endif // ARK_GPU_MEMORY_HPP_ diff --git a/ark/gpu/gpu_stream.cpp b/ark/gpu/gpu_stream.cpp index 52502365a..17d4e21f5 100644 --- a/ark/gpu/gpu_stream.cpp +++ b/ark/gpu/gpu_stream.cpp @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "gpu/gpu_stream.h" +#include "gpu/gpu_stream.hpp" -#include "gpu/gpu_logging.h" -#include "gpu/gpu_manager.h" +#include "gpu/gpu_logging.hpp" +#include "gpu/gpu_manager.hpp" namespace ark { class GpuStream::Impl { diff --git a/ark/gpu/gpu_stream.h b/ark/gpu/gpu_stream.hpp similarity index 79% rename from ark/gpu/gpu_stream.h rename to ark/gpu/gpu_stream.hpp index e76f01827..9d8775f95 100644 --- a/ark/gpu/gpu_stream.h +++ b/ark/gpu/gpu_stream.hpp @@ -1,12 +1,12 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#ifndef ARK_GPU_STREAM_H_ -#define ARK_GPU_STREAM_H_ +#ifndef ARK_GPU_STREAM_HPP_ +#define ARK_GPU_STREAM_HPP_ #include -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" namespace ark { @@ -30,4 +30,4 @@ class GpuStream { }; } // namespace ark -#endif // ARK_GPU_STREAM_H_ +#endif // ARK_GPU_STREAM_HPP_ diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 75dc81c17..f0a108a1f 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -20,7 +20,7 @@ class Executor { public: /// Constructor. Executor(int device_id, Stream stream, const std::string &name, - const std::string &plan); + const std::string &plan, bool loop_mode = true); /// Destructor. 
~Executor(); @@ -96,7 +96,7 @@ class DefaultExecutor : public Executor { DefaultExecutor( const Model &model, int device_id = -1, Stream stream = nullptr, const std::vector &config_rules = {}, - const std::string &name = "DefaultExecutor"); + const std::string &name = "DefaultExecutor", bool loop_mode = true); }; } // namespace ark diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in index ea1862920..a8a56f141 100644 --- a/ark/include/kernels/kernel_template.in +++ b/ark/include/kernels/kernel_template.in @@ -33,12 +33,12 @@ __device__ sync::State ARK_LOOP_SYNC_STATE; @DEFINITIONS@ -__device__ void ark_loop_body(char *_buf, int _iter) { +__device__ void ark_body(char *_buf, int _iter) { @BODY@ } extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1) -void @NAME@(char *_buf, int *_iter) { +void ark_loop_kernel@NAME@(char *_buf, int *_iter) { int *shared_mem = (int *)_ARK_SMEM; for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) { shared_mem[i] = 0; @@ -52,10 +52,10 @@ void @NAME@(char *_buf, int *_iter) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); if (ARK_ITER < 0) return; - ark_loop_body(_buf, 0); + ark_body(_buf, 0); for (int _i = 1; _i < ARK_ITER; ++_i) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); - ark_loop_body(_buf, _i); + ark_body(_buf, _i); } if (threadIdx.x == 0) { __threadfence_system(); @@ -67,3 +67,12 @@ void @NAME@(char *_buf, int *_iter) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); } } + +extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1) +void ark_kernel@NAME@(char *_buf, int _iter) { + int *shared_mem = (int *)_ARK_SMEM; + for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) { + shared_mem[i] = 0; + } + ark_body(_buf, _iter); +} diff --git a/ark/ops/ops_matmul_test.cpp b/ark/ops/ops_matmul_test.cpp index 4304a19e2..6d09b54d6 100644 --- a/ark/ops/ops_matmul_test.cpp +++ b/ark/ops/ops_matmul_test.cpp @@ -3,7 +3,7 @@ #include -#include "gpu/gpu.h" +#include "gpu/gpu.hpp" #include "logging.h" #include "model/model_node.hpp" #include "model/model_op.hpp" diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index 60ffc9dc2..bec69c456 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -10,7 +10,7 @@ #include "ark/planner.hpp" #include "ark/random.hpp" #include "env.h" -#include "gpu/gpu_logging.h" +#include "gpu/gpu_logging.hpp" #include "logging.h" #include "model/model_data_type.hpp" #include "model/model_tensor.hpp" diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 33db1fb5c..d54f85c36 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -101,12 +101,11 @@ def running(self) -> bool: def launch( self, - rank: int = 0, - world_size: int = 1, gpu_id: int = 0, plan: str = "", plan_path: str = "", stream: int = 0, + loop_mode: bool = True, ): """ Create an executor and schedule the ARK model. 
The scheduler will generate @@ -135,6 +134,7 @@ def launch( stream, "ArkRuntime", plan, + loop_mode, ) self.executor = _RuntimeState.executor self.executor.compile() diff --git a/python/executor_py.cpp b/python/executor_py.cpp index 979cb2952..e782a99fe 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -43,9 +43,11 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, void register_executor(py::module &m) { py::class_(m, "_Executor") .def(py::init([](int device_id, uintptr_t stream, - const std::string &name, const std::string &plan) { - return new ark::Executor( - device_id, reinterpret_cast(stream), name, plan); + const std::string &name, const std::string &plan, + bool loop_mode) { + return new ark::Executor(device_id, + reinterpret_cast(stream), + name, plan, loop_mode); })) .def("device_id", &ark::Executor::device_id) .def("stream", From 55755bbe2e2fbc36195f7786280689bde3170ec2 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 14 Jul 2024 14:19:35 -0700 Subject: [PATCH 037/106] do not force noinline --- ark/codegen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ark/codegen.cpp b/ark/codegen.cpp index cd6206284..0d4b14a09 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -213,7 +213,7 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) { for (auto &op_json : task_json["Ops"]) { ss << this->def_op(op_json, task_json["Id"], op_idx++); } - ss << "__noinline__ __device__ void t" << task_json["Id"] + ss << "__device__ void t" << task_json["Id"] << "(char* _buf, int _idx, int _spw) {\n"; op_idx = 0; for (auto &op_json : task_json["Ops"]) { From b29eaaefb5b969a8e0ec8b8e3813e5e3245e7825 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 14 Jul 2024 21:25:20 +0000 Subject: [PATCH 038/106] wip --- arkprof.py | 4 +++- python/ark/profiler.py | 10 +++++----- python/ark/runtime.py | 11 +++++++++-- python/ark/tensor.py | 18 ++++++++++++------ python/unittest/unittest_common.py | 8 +++++++- 5 files changed, 36 insertions(+), 15 deletions(-) diff --git a/arkprof.py b/arkprof.py index 782bba560..9e67c2dfc 100644 --- a/arkprof.py +++ b/arkprof.py @@ -1,4 +1,6 @@ import ark import sys -ark.Profiler(ark.Plan.from_file(sys.argv[1])).run(iter=1000, profile_processor_groups=False) +ark.Profiler(ark.Plan.from_file(sys.argv[1])).run( + iter=1000, profile_processor_groups=False +) diff --git a/python/ark/profiler.py b/python/ark/profiler.py index 56233247c..c161b24e6 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -8,9 +8,9 @@ from .planner import Plan -def timeit(plan: Plan, iter: int): +def timeit(plan: Plan, iter: int, loop_mode: bool): with Runtime() as rt: - rt.launch(plan=plan) + rt.launch(plan=plan, loop_mode=loop_mode) start_time = time.time() rt.run(iter=iter) end_time = time.time() @@ -21,8 +21,8 @@ class Profiler: def __init__(self, plan: Plan): self.plan = plan - def run(self, iter: int = 1000, profile_processor_groups: bool = False): - sys.stderr.write(f"End-to-end: {timeit(self.plan, iter):.6f} seconds/iter\n") + def run(self, iter: int = 1000, loop_mode: bool = True, profile_processor_groups: bool = False): + sys.stderr.write(f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n") if not profile_processor_groups: return @@ -38,7 +38,7 @@ def run(self, iter: int = 1000, profile_processor_groups: bool = False): } for i in range(num_processor_groups): new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] - lat_per_iter = timeit(Plan(new_plan), iter) + lat_per_iter = 
timeit(Plan(new_plan), iter, loop_mode) sys.stderr.write( f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n" ) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index b3dbe7887..51a5b7905 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -48,8 +48,15 @@ def print_runtime_states(): class Executor(_Executor): - def __init__(self, device_id: int, stream: int, name: str, plan: Plan): - super().__init__(device_id, stream, name, str(plan)) + def __init__( + self, + device_id: int, + stream: int, + name: str, + plan: Plan, + loop_mode: bool = True, + ): + super().__init__(device_id, stream, name, str(plan), loop_mode) class Runtime: diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 335020769..657da1065 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -121,16 +121,22 @@ def to_torch( self.shape(), dtype=torch_type, device=torch.device(dev_name) ) elif list(tensor.shape) != self.shape(): - raise ValueError(f"torch tensor shape {list(tensor.shape)} " - f"does not match the tensor {self.shape()}") + raise ValueError( + f"torch tensor shape {list(tensor.shape)} " + f"does not match the tensor {self.shape()}" + ) elif tensor.dtype != torch_type: - raise ValueError(f"torch tensor dtype {tensor.dtype} " - f"does not match the tensor {torch_type}") + raise ValueError( + f"torch tensor dtype {tensor.dtype} " + f"does not match the tensor {torch_type}" + ) elif not tensor.is_contiguous(): raise ValueError("torch tensor is not contiguous in memory") elif tensor.numel() != self.nelems(): - raise ValueError(f"torch tensor size {tensor.numel()} " - f"does not match the tensor {self.nelems()}") + raise ValueError( + f"torch tensor size {tensor.numel()} " + f"does not match the tensor {self.nelems()}" + ) tensor_bytes = self.nelems() * self.dtype().element_size() rt.executor.tensor_read( self._tensor, tensor.data_ptr(), tensor_bytes, stream, True diff --git a/python/unittest/unittest_common.py b/python/unittest/unittest_common.py index 9548410b5..0c385e89a 100644 --- a/python/unittest/unittest_common.py +++ b/python/unittest/unittest_common.py @@ -9,14 +9,20 @@ def pytest_ark(need_torch: bool = False): """ Decorator for ARK unit tests. 
""" + def decorator(test_func): if need_torch: try: import torch except ImportError: - return pytest.mark.skip(reason="torch is not installed")(test_func) + return pytest.mark.skip(reason="torch is not installed")( + test_func + ) + def wrapper(*args, **kwargs): ark.init() test_func(*args, **kwargs) + return wrapper + return decorator From a7a5d46c001b143781022e2d28aaa3eee0c502b3 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 14 Jul 2024 23:56:21 +0000 Subject: [PATCH 039/106] Fix CK tile indexing --- third_party/patches/composable_kernel.patch | 89 +++++++++++++++++++-- 1 file changed, 83 insertions(+), 6 deletions(-) diff --git a/third_party/patches/composable_kernel.patch b/third_party/patches/composable_kernel.patch index 43b1afcaa..e12f19332 100644 --- a/third_party/patches/composable_kernel.patch +++ b/third_party/patches/composable_kernel.patch @@ -561,7 +561,7 @@ index 2d5dc90bf..160eef036 100644 }); diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp -index 7bb47e9d3..2b2e8c604 100644 +index 7bb47e9d3..d495c7297 100644 --- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp @@ -60,7 +60,7 @@ struct BlockToCTileMap_M00_N0_M01 @@ -582,7 +582,84 @@ index 7bb47e9d3..2b2e8c604 100644 { return true; } -@@ -315,7 +315,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt +@@ -177,58 +177,7 @@ struct BlockToCTileMap_M00_N0_M01Adapt + index_t idx_N0 = block_1d_id % N0; + index_t idx_M0 = block_1d_id / N0; + +- const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? M01_ : M0 % M01_; +- +- index_t idx_M00 = idx_M0 / M01_; +- index_t idx_M01 = idx_M0 % M01_; +- index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0; +- +- /** +- * idxN0 +- * +- * |< mtx N >| +- * +- * NPerBlock NPerBlock NPerBlock NPerBlock +- * N_0 N_1 N_2 N_3 +- * - |-----------|-----------|-----------|-----|-----|- +- * ^ | - - 0 |/----> 2 | | | | +- * | | | / | | | | | M_0 MPerBlock +- * | M | /| | | | | | +- * |-0---|---/-|-----|-----|-----------|-----|-----|- +- * | 1 | / | | | blockid | | | +- * idxM0 | | | / | V | 5 | | | M_1 MPerBlock +- * | - V 1 | - 3 | | | | +- * |-----------|-----------|-----------|-----|-----|- +- * mtx M | | | | | | +- * | | | | | | M_2 MPerBlock +- * | | | | | | +- * |-----------|-----------|-----------|-----|-----|- +- * | | | | | | +- * | | | | | | M_3 MPerBlock +- * | | | | | | +- * |-----------|-----------|-----------|-----|-----|- +- * V | | | | | | +- * - |-----------|-----------|-----------|-----|-----|- M_4 MPerBlock +- * | | | | | | +- * |-----------|-----------|-----------|-----|-----|- +- * Example: +- * assume: +- * M0 = 5 +- * N0 = 4 +- * block_1d_id = 5 +- * M01 = 2 +- * +- * idx_N0 = 1 +- * idx_M0 = 1 +- * M01_adapt = 2 +- * idx_M00 = 0 +- * idx_M01 = 1 +- * idx_N0_M01_local = 5 +- * output {1, 2} +- */ +- +- return make_tuple(idx_N0_M01_local % M01_adapt + idx_M00 * M01_, +- idx_N0_M01_local / M01_adapt); ++ return make_tuple(idx_M0, idx_N0); + } + + template +@@ -297,15 +246,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt + index_t idx_N0 = block_1d_id % N0; + index_t idx_M0 = block_1d_id / N0; + +- const auto M01_adapt = (idx_M0 < M0 - M0 % M01_) ? 
M01_ : M0 % M01_; +- +- index_t idx_M00 = idx_M0 / M01_; +- index_t idx_M01 = idx_M0 % M01_; +- index_t idx_N0_M01_local = idx_N0 + idx_M01 * N0; +- +- return make_tuple(idx_ksplit, +- idx_N0_M01_local % M01_adapt + idx_M00 * M01_, +- idx_N0_M01_local / M01_adapt); ++ return make_tuple(idx_ksplit, idx_M0, idx_N0); + } + + template +@@ -315,7 +256,7 @@ struct BlockToCTileMap_KSplit_M00_N0_M01Adapt return true; // always valid provided that user gets grid size from CalculateGridSize() } @@ -591,7 +668,7 @@ index 7bb47e9d3..2b2e8c604 100644 private: index_t M01_; -@@ -373,7 +373,7 @@ struct BlockToCTileMap_M00_N00_M01_N01 +@@ -373,7 +314,7 @@ struct BlockToCTileMap_M00_N00_M01_N01 return true; } @@ -600,7 +677,7 @@ index 7bb47e9d3..2b2e8c604 100644 { if constexpr(DeviceCTileIndexCheck) return true; // validity check moved to kernel -@@ -485,7 +485,7 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01 +@@ -485,7 +426,7 @@ struct BlockToCTileMap_KSplit_M00_N00_M01_N01 return true; } @@ -609,7 +686,7 @@ index 7bb47e9d3..2b2e8c604 100644 { if constexpr(DeviceCTileIndexCheck) return true; // validity check moved to kernel -@@ -609,7 +609,7 @@ struct OffsettedBlockToCTileMap +@@ -609,7 +550,7 @@ struct OffsettedBlockToCTileMap } template @@ -618,7 +695,7 @@ index 7bb47e9d3..2b2e8c604 100644 { return block_to_ctile_map_.CheckValidity(c_grid_desc_m_n); } -@@ -666,7 +666,7 @@ struct BlockToCTileMap_3DGrid_KSplit +@@ -666,7 +607,7 @@ struct BlockToCTileMap_3DGrid_KSplit } template From 9c19a5ec8543863d159c96f05c007b63943c2566 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 29 Jul 2024 02:56:23 +0000 Subject: [PATCH 040/106] wip --- .vscode/settings.json | 2 - ark/api/context_manager.cpp | 42 +++++++++ ark/api/context_manager_test.cpp | 54 +++++++++++ ark/api/model.cpp | 4 +- ark/api/model_graph.cpp | 4 +- ark/api/model_test.cpp | 24 ++--- ark/api/planner.cpp | 4 +- ark/include/ark.hpp | 1 + ark/include/ark/context_manager.hpp | 24 +++++ ark/include/ark/model.hpp | 64 +++++++------ ark/include/ark/model_graph.hpp | 3 +- ark/model/model_graph_impl.cpp | 40 ++++++++- ark/model/model_graph_impl.hpp | 36 +++++++- ark/model/model_node.hpp | 3 + ark/model/model_op.cpp | 11 +++ ark/model/model_op.hpp | 9 +- ark/ops/ops_arithmetic.cpp | 20 +++-- ark/ops/ops_cast.cpp | 10 +-- ark/ops/ops_communication.cpp | 14 +-- ark/ops/ops_copy.cpp | 5 +- ark/ops/ops_embedding.cpp | 4 +- ark/ops/ops_identity.cpp | 2 +- ark/ops/ops_math.cpp | 31 ++++--- ark/ops/ops_matmul.cpp | 6 +- ark/ops/ops_noop.cpp | 2 +- ark/ops/ops_reduce.cpp | 12 +-- ark/ops/ops_refer.cpp | 2 +- ark/ops/ops_reshape.cpp | 4 +- ark/ops/ops_rope.cpp | 5 +- ark/ops/ops_scalar.cpp | 31 ++++--- ark/ops/ops_tensor.cpp | 2 +- ark/ops/ops_transpose.cpp | 5 +- arkprof.py | 1 + examples/tutorial/context_tutorial.py | 117 ++++++++++++++++++++++++ python/ark/__init__.py | 2 +- python/ark/context_manager.py | 24 +++++ python/ark/ops.py | 125 ++++++++++++++++++++------ python/ark/profiler.py | 11 ++- python/ark_py.cpp | 2 + python/context_manager_py.cpp | 15 ++++ python/model_py.cpp | 86 ++++++++++-------- 41 files changed, 676 insertions(+), 187 deletions(-) create mode 100644 ark/api/context_manager.cpp create mode 100644 ark/api/context_manager_test.cpp create mode 100644 ark/include/ark/context_manager.hpp create mode 100644 examples/tutorial/context_tutorial.py create mode 100644 python/ark/context_manager.py create mode 100644 python/context_manager_py.cpp diff --git a/.vscode/settings.json b/.vscode/settings.json index 640196a66..00260f078 100644 --- 
a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -3,8 +3,6 @@
   "cmake.environment": {
     "ARK_ROOT": "${workspaceFolder}/build",
     "ARK_IGNORE_BINARY_CACHE": "1",
-    "ARK_DISABLE_GRAPH_OPT": "0",
-    "ARK_IPC_LISTEN_PORT_BASE": "42000",
     // "ARK_LOG_LEVEL": "DEBUG"
   },
   "cmake.ctestArgs": [
diff --git a/ark/api/context_manager.cpp b/ark/api/context_manager.cpp
new file mode 100644
index 000000000..6d16d9e79
--- /dev/null
+++ b/ark/api/context_manager.cpp
@@ -0,0 +1,42 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/context_manager.hpp"
+
+#include "model/model_graph_impl.hpp"
+
+namespace ark {
+
+class ContextManager::Impl {
+   public:
+    Impl(std::shared_ptr<ModelGraphContextStack> context_stack,
+         const std::map<std::string, std::string>& context_map);
+
+    ~Impl();
+
+   private:
+    std::shared_ptr<ModelGraphContextStack> context_stack_;
+    std::vector<std::string> keys_;
+};
+
+ContextManager::Impl::Impl(
+    std::shared_ptr<ModelGraphContextStack> context_stack,
+    const std::map<std::string, std::string>& context_map)
+    : context_stack_(context_stack) {
+    for (const auto& [key, value] : context_map) {
+        context_stack_->push(key, value);
+        keys_.push_back(key);
+    }
+}
+
+ContextManager::Impl::~Impl() {
+    for (auto it = keys_.rbegin(); it != keys_.rend(); ++it) {
+        context_stack_->pop(*it);
+    }
+}
+
+ContextManager::ContextManager(
+    Model& model, const std::map<std::string, std::string>& context_map)
+    : impl_(std::make_shared<Impl>(model.impl_->context_stack_, context_map)) {}
+
+} // namespace ark
diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp
new file mode 100644
index 000000000..ff60b43bf
--- /dev/null
+++ b/ark/api/context_manager_test.cpp
@@ -0,0 +1,54 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/model.hpp"
+#include "ark/context_manager.hpp"
+
+#include "model/model_node.hpp"
+#include "unittest/unittest_utils.h"
+
+ark::unittest::State test_context_manager() {
+    ark::Model model;
+    ark::Tensor t0 = model.tensor({1}, ark::FP32);
+    ark::Tensor t1 = model.tensor({1}, ark::FP32);
+    ark::Tensor t2 = model.add(t0, t1);
+
+    ark::Tensor t3;
+    ark::Tensor t4;
+    ark::Tensor t5;
+    {
+        ark::ContextManager cm0_1(model, {{"key0", "val1"}});
+        t3 = model.relu(t2);
+
+        ark::ContextManager cm1_1(model, {{"key1", "val2"}});
+        t4 = model.sqrt(t3);
+    }
+    {
+        ark::ContextManager cm0_2(model, {{"key0", "val3"}});
+        t5 = model.exp(t2);
+    }
+
+    UNITTEST_TRUE(model.verify());
+
+    auto compressed = model.compress(false);
+    UNITTEST_TRUE(compressed.verify());
+
+    auto nodes = compressed.nodes();
+    UNITTEST_EQ(nodes.size(), 4);
+
+    UNITTEST_EQ(nodes[0]->context.size(), 0);
+    UNITTEST_EQ(nodes[1]->context.size(), 1);
+    UNITTEST_EQ(nodes[1]->context.at("key0"), "val1");
+    UNITTEST_EQ(nodes[2]->context.size(), 2);
+    UNITTEST_EQ(nodes[2]->context.at("key0"), "val1");
+    UNITTEST_EQ(nodes[2]->context.at("key1"), "val2");
+    UNITTEST_EQ(nodes[3]->context.size(), 1);
+    UNITTEST_EQ(nodes[3]->context.at("key0"), "val3");
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_context_manager);
+    return 0;
+}
diff --git a/ark/api/model.cpp b/ark/api/model.cpp
index ab536a33c..a5a258f71 100644
--- a/ark/api/model.cpp
+++ b/ark/api/model.cpp
@@ -9,9 +9,9 @@
 
 namespace ark {
 
-Model Model::compress() const {
+Model Model::compress(bool merge_nodes) const {
     Model model(*this);
-    model.compress_nodes();
+    model.compress_nodes(merge_nodes);
     return model;
 }
 
diff --git a/ark/api/model_graph.cpp b/ark/api/model_graph.cpp
index b6061a34e..d11808467 100644
--- a/ark/api/model_graph.cpp
+++ b/ark/api/model_graph.cpp
@@ -33,7 +33,9 @@ int ModelGraph::rank() const { 
return impl_->rank(); } int ModelGraph::world_size() const { return impl_->world_size(); } -void ModelGraph::compress_nodes() { impl_->compress_nodes(); } +void ModelGraph::compress_nodes(bool merge_nodes) { + impl_->compress_nodes(merge_nodes); +} bool ModelGraph::compressed() const { return impl_->compressed(); } diff --git a/ark/api/model_test.cpp b/ark/api/model_test.cpp index a9d332a97..785bfcd7b 100644 --- a/ark/api/model_test.cpp +++ b/ark/api/model_test.cpp @@ -36,7 +36,7 @@ ark::unittest::State test_model_basics() { // (AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); UNITTEST_TRUE(compressed.compressed()); UNITTEST_EQ(compressed.nodes().size(), 1); @@ -70,7 +70,7 @@ ark::unittest::State test_model_basics() { // (AddOp,AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); UNITTEST_EQ(compressed.nodes().size(), 1); @@ -104,7 +104,7 @@ ark::unittest::State test_model_basics() { // (AddOp,AddOp,ReluOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); UNITTEST_EQ(compressed.nodes().size(), 1); @@ -143,7 +143,7 @@ ark::unittest::State test_model_basics() { // (AddOp,AddOp,ReluOp,AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); auto nodes = compressed.nodes(); @@ -190,7 +190,7 @@ ark::unittest::State test_model_basics() { // (AddOp,) --+--> (AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); nodes = compressed.nodes(); @@ -250,7 +250,7 @@ ark::unittest::State test_model_basics() { // (AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); nodes = compressed.nodes(); @@ -312,7 +312,7 @@ ark::unittest::State test_model_basics() { // (AddOp,) // - compressed = model.compress(); + compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); nodes = compressed.nodes(); @@ -353,7 +353,7 @@ ark::unittest::State test_model_dependent_inputs() { ark::Tensor x4 = m.mul(x2, x3); ark::Tensor y = m.add(x0, x4); - auto compressed = m.compress(); + auto compressed = m.compress(true); auto nodes = compressed.nodes(); UNITTEST_EQ(nodes.size(), 4); auto nodes_iter = nodes.begin(); @@ -399,7 +399,7 @@ ark::unittest::State test_model_noop() { UNITTEST_TRUE(model.verify()); - auto compressed = model.compress(); + auto compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); UNITTEST_EQ(compressed.nodes().size(), 0); return ark::unittest::SUCCESS; @@ -425,7 +425,7 @@ ark::unittest::State test_model_identity() { ark::Tensor t4 = model.relu(t3); UNITTEST_TRUE(model.verify()); - auto compressed = model.compress(); + auto compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); auto nodes = compressed.nodes(); UNITTEST_EQ(nodes.size(), 3); @@ -478,7 +478,7 @@ ark::unittest::State test_model_sharding() { ark::Tensor t5 = model.relu(t4); UNITTEST_TRUE(model.verify()); - auto compressed = model.compress(); + auto compressed = model.compress(true); UNITTEST_TRUE(compressed.verify()); auto nodes = compressed.nodes(); UNITTEST_EQ(nodes.size(), 4); @@ -526,7 +526,7 @@ ark::unittest::State test_model_cumulate() { UNITTEST_TRUE(model.verify()); - auto compressed = model.compress(); + auto compressed = model.compress(true); auto nodes = compressed.nodes(); UNITTEST_EQ(nodes.size(), 5); diff --git 
a/ark/api/planner.cpp b/ark/api/planner.cpp index f4e7fa8ee..dba149a1e 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -69,7 +69,9 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { task_info["Id"] = next_node_id++; Json config; - if (!config_rules_.empty()) { + if (!op->config().empty()) { + config = op->config(); + } else if (!config_rules_.empty()) { const std::string op_str = op->serialize().dump(); for (auto &rule : config_rules_) { auto config_str = rule(op_str, gpu_info.arch->name()); diff --git a/ark/include/ark.hpp b/ark/include/ark.hpp index 2ca796172..e76687bce 100644 --- a/ark/include/ark.hpp +++ b/ark/include/ark.hpp @@ -8,6 +8,7 @@ #include // clang-format on +#include #include #include #include diff --git a/ark/include/ark/context_manager.hpp b/ark/include/ark/context_manager.hpp new file mode 100644 index 000000000..58271ea8c --- /dev/null +++ b/ark/include/ark/context_manager.hpp @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_CONTEXT_MANAGER_HPP +#define ARK_CONTEXT_MANAGER_HPP + +#include +#include + +namespace ark { + +class ContextManager { + public: + ContextManager(Model& model, + const std::map& context_map); + + private: + class Impl; + std::shared_ptr impl_; +}; + +} // namespace ark + +#endif // ARK_CONTEXT_MANAGER_HPP diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp index 66551a037..35efe53d5 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -26,7 +26,7 @@ class Model : public ModelGraph { Model &operator=(const Model &other) = default; - Model compress() const; + Model compress(bool merge_nodes = false) const; int unique_tag(); @@ -87,23 +87,29 @@ class Model : public ModelGraph { // result in `output`. // Currently, only reduction along the last dimension is supported. Tensor reduce_sum(Tensor input, int axis, bool keepdims = true, - Tensor output = NullTensor, const std::string &name = ""); + Tensor output = NullTensor, + const std::string &config = "", + const std::string &name = ""); Tensor reduce_mean(Tensor input, int axis, bool keepdims = true, Tensor output = NullTensor, + const std::string &config = "", const std::string &name = ""); Tensor reduce_max(Tensor input, int axis, bool keepdims = true, - Tensor output = NullTensor, const std::string &name = ""); + Tensor output = NullTensor, + const std::string &config = "", + const std::string &name = ""); // Transposes the `input` tensor according to the given `permutation`. // For example, transpose(input, {0, 1 ,3, 2}) will swap the last two // dimensions of the input tensor. Currently, only 4D tensors are supported. Tensor transpose(Tensor input, const std::vector &permutation, - Tensor output = NullTensor, const std::string &name = ""); + Tensor output = NullTensor, const std::string &config = "", + const std::string &name = ""); // Performs matrix multiplication between the `input` tensor and another // `other` tensor, storing the result in `output`. Tensor matmul(Tensor input, Tensor other, Tensor output = NullTensor, bool trans_input = false, bool trans_other = false, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Implements the 'im2col' method for 2D convolution layers, which takes an // `input` tensor and reshapes it to a 2D matrix by extracting image patches // from the input tensor based on the provided parameters. 
@@ -120,72 +126,76 @@ class Model : public ModelGraph { Tensor output = NullTensor, const std::string &name = ""); // Calculates the exponential of the `input` tensor, element-wise. Tensor exp(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Calculates the square root of the `input` tensor, element-wise. Tensor sqrt(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Calculates the reverse square root of the `input` tensor, element-wise. Tensor rsqrt(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // ReLU activation Tensor relu(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Copy the `input` tensor to `output` tensor Tensor copy(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor copy(float val, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Applies the Gaussian Error Linear Unit (GELU) activation function to the // `input` tensor, element-wise. GELU is a smooth approximation of the // rectifier function and is widely used in deep learning models. Tensor gelu(Tensor input, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Sigmoid activation Tensor sigmoid(Tensor input, Tensor output = NullTensor, + const std::string &config = "", const std::string &name = ""); // Performs rotary position embedding (RoPE) on the `input` tensor Tensor rope(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Performs an element-wise addition operator between the `input` tensor // and the `other` tensor Tensor add(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor add(Tensor input, float value, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Performs an element-wise subtraction operator between the `input` tensor // and the `other` tensor Tensor sub(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor sub(Tensor input, float value, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Performs an element-wise multiplication operator between the `input` // tensor and the `other` tensor, Tensor mul(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); Tensor mul(Tensor input, float value, Tensor output = NullTensor, - const std::string &name = ""); + const std::string &config = "", const std::string &name = ""); // Performs an element-wise division operator between the `input` // tensor and the `other` tensor, Tensor div(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &name = ""); + const std::string 
&config = "", const std::string &name = "");
     Tensor div(Tensor input, float value, Tensor output = NullTensor,
-               const std::string &name = "");
+               const std::string &config = "", const std::string &name = "");
 
     Tensor send(Tensor input, int remote_rank, int tag,
-                Tensor output = NullTensor, const std::string &name = "");
+                Tensor output = NullTensor, const std::string &config = "",
+                const std::string &name = "");
     // Blocks the execution until the corresponding 'send' operator with the
     // specified `id` is completed.
-    Tensor send_done(Tensor input, const std::string &name = "");
+    Tensor send_done(Tensor input, const std::string &config = "",
+                     const std::string &name = "");
     // Receives a tensor from a source rank (@p src_rank), identified by the
     // `id` parameter. Blocks the execution until the corresponding 'recv'
     // operator is completed.
     Tensor recv(Tensor output, int remote_rank, int tag,
-                const std::string &name = "");
+                const std::string &config = "", const std::string &name = "");
     //
     Tensor put_packet(Tensor input, Tensor local_tmp_buf, Tensor recv_buf,
                       int id, int rank, int dst_rank, size_t dst_offset,
-                      int flag, const std::string &name = "");
+                      int flag, const std::string &config = "",
+                      const std::string &name = "");
     // Performs an all-reduce operator across all ranks, aggregating the input
     // tensors. Takes the `input` tensor, the current GPU's rank, and the
     // total number of ranks `rank_num`.
@@ -200,10 +210,12 @@ class Model : public ModelGraph {
                       const std::string &name = "");
     /// Embedding layer.
     Tensor embedding(Tensor input, Tensor weight, Tensor output = NullTensor,
+                     const std::string &config = "",
                      const std::string &name = "");
     /// Tensor type casting.
     Tensor cast(Tensor input, const DataType &data_type,
-                Tensor output = NullTensor, const std::string &name = "");
+                Tensor output = NullTensor, const std::string &config = "",
+                const std::string &name = "");
 
     // sync across multi devices
     Tensor device_sync(Tensor input, int npeers, const std::string &name = "");
diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp
index bd7c59033..f6390a2a9 100644
--- a/ark/include/ark/model_graph.hpp
+++ b/ark/include/ark/model_graph.hpp
@@ -25,7 +25,7 @@ class ModelGraph {
 
     int world_size() const;
 
-    void compress_nodes();
+    void compress_nodes(bool merge_nodes = false);
 
     bool compressed() const;
 
@@ -38,6 +38,7 @@ class ModelGraph {
 
    protected:
     friend class Model;
+    friend class ContextManager;
 
     class Impl;
     std::unique_ptr<Impl> impl_;
diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp
index 17410d23f..53a7fa851 100644
--- a/ark/model/model_graph_impl.cpp
+++ b/ark/model/model_graph_impl.cpp
@@ -17,6 +17,36 @@
 
 namespace ark {
 
+ModelGraphContextStack::ModelGraphContextStack(const ModelGraphContextStack &other) {
+    for (const auto &pair : other.storage_) {
+        for (const auto &value : pair.second) {
+            this->storage_[pair.first].push_back(value);
+        }
+    }
+}
+
+void ModelGraphContextStack::push(const std::string &key, const std::string &value) {
+    this->storage_[key].push_back(std::make_shared<std::string>(value));
+}
+
+void ModelGraphContextStack::pop(const std::string &key) {
+    auto it = this->storage_.find(key);
+    if (it == this->storage_.end() || it->second.empty()) {
+        ERR(ModelError, "context stack is empty");
+    }
+    it->second.pop_back();
+}
+
+std::map<std::string, std::string> ModelGraphContextStack::current_context() const {
+    std::map<std::string, std::string> cur;
+    for (const auto &pair : this->storage_) {
+        if (!pair.second.empty()) {
+            cur[pair.first] = *pair.second.back();
+        }
+    }
+    return cur;
+}
+
 ModelGraph::Impl::Impl(const ModelGraph::Impl &other) { *this = other; }
 
 ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) {
@@ -25,6 +55,7 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) {
     for (const auto &node : other.nodes_) {
         ModelNodeRef new_node = std::make_shared<ModelNode>();
         new_node->ops = node->ops;
+        new_node->context = node->context;
         node_map.emplace(node, new_node);
         nodes_.push_back(new_node);
     }
@@ -61,13 +92,16 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) {
     rank_ = other.rank_;
     world_size_ = other.world_size_;
     compressed_ = other.compressed_;
+    context_stack_ = std::make_shared<ModelGraphContextStack>(*(other.context_stack_));
     return *this;
 }
 
-void ModelGraph::Impl::compress_nodes() {
+void ModelGraph::Impl::compress_nodes(bool merge_nodes) {
     if (!compressed_) {
         this->recursive_remove_virtual_nodes();
-        this->recursive_merge_nodes();
+        if (merge_nodes) {
+            this->recursive_merge_nodes();
+        }
         compressed_ = true;
     }
 }
@@ -171,6 +205,8 @@ ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) {
         producer->consumers.push_back(node);
     }
 
+    node->context = context_stack_->current_context();
+
     nodes_.push_back(node);
     return node;
 }
diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp
index 6c109b51e..fbfc54c7e 100644
--- a/ark/model/model_graph_impl.hpp
+++ b/ark/model/model_graph_impl.hpp
@@ -4,6 +4,7 @@
 #ifndef ARK_MODEL_GRAPH_IMPL_HPP_
 #define ARK_MODEL_GRAPH_IMPL_HPP_
 
+#include <list>
 #include
 #include
 #include
@@ -18,17 +19,39 @@
 
 namespace ark {
 
+class ModelGraphContextStack {
+   private:
+    std::map<std::string, std::list<std::shared_ptr<std::string>>> storage_;
+
+   public:
+    ModelGraphContextStack() = default;
+
+    ModelGraphContextStack(const ModelGraphContextStack &other);
+
+    ~ModelGraphContextStack() = default;
+
+    void push(const std::string &key, const std::string &value);
+
+    void pop(const std::string &key);
+
+    std::map<std::string, std::string> current_context() const;
+};
+
 class ModelGraph::Impl {
    public:
     Impl(int rank, int world_size)
-        : rank_(rank), world_size_(world_size), compressed_(false){};
+        : rank_(rank),
+          world_size_(world_size),
+          compressed_(false),
+          context_stack_(std::make_shared<ModelGraphContextStack>()) {};
 
     Impl(const Impl &other);
 
     Impl &operator=(const Impl &other);
 
     template <typename T, typename... Args>
-    ModelOpRef create_op(const std::string &name, Args &&... args) {
+    ModelOpRef create_op(const std::string &config, const std::string &name,
+                         Args &&...args) {
         ModelOpRef op = std::make_shared<T>(std::forward<Args>(args)...);
         std::string name_copy;
         if (name.empty()) {
@@ -41,6 +64,7 @@ class ModelGraph::Impl {
         if (count > 0) {
             name_copy += "_" + std::to_string(count);
         }
+        op->set_config(config);
         op->set_name(name_copy);
         add_op(op);
         return op;
@@ -50,7 +74,7 @@ class ModelGraph::Impl {
 
     int world_size() const { return world_size_; }
 
-    void compress_nodes();
+    void compress_nodes(bool merge_nodes = false);
 
     bool compressed() const { return compressed_; }
 
@@ -100,6 +124,12 @@ class ModelGraph::Impl {
 
     /// True if `compress_nodes` has been called.
     bool compressed_;
+
+   protected:
+    friend class ContextManager;
+
+    /// Graph context stack.
+    std::shared_ptr<ModelGraphContextStack> context_stack_;
 };
 
 } // namespace ark
diff --git a/ark/model/model_node.hpp b/ark/model/model_node.hpp
index 7838ca120..c86b4d29a 100644
--- a/ark/model/model_node.hpp
+++ b/ark/model/model_node.hpp
@@ -26,6 +26,9 @@ class ModelNode {
 
     /// The list of @ref ModelNode that this @ref ModelNode depends on.
     UniqueList<ModelNodeRef> producers;
+
+    /// Graph context of this node. 
+    std::map<std::string, std::string> context;
 };
 
 } // namespace ark
diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp
index b5a0645c8..e9689cdcb 100644
--- a/ark/model/model_op.cpp
+++ b/ark/model/model_op.cpp
@@ -87,6 +87,14 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) {
     return it->second;
 }
 
+void ModelOp::set_config(const std::string &config) {
+    if (!config.empty()) {
+        config_ = Json::parse(config);
+    } else {
+        config_.clear();
+    }
+}
+
 std::vector<ModelTensorRef> ModelOp::input_tensors() const {
     // input_tensors = read_tensors || write_tensors
     std::set<ModelTensorRef> input_tensors;
@@ -179,6 +187,9 @@ Json ModelOp::serialize() const {
     for (auto &arg : args_) {
         j["Args"][arg.first] = arg.second.serialize();
     }
+    if (!config_.empty()) {
+        j["Config"] = config_;
+    }
     return j;
 }
 
diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp
index e8c220258..091a9f163 100644
--- a/ark/model/model_op.hpp
+++ b/ark/model/model_op.hpp
@@ -50,8 +50,8 @@ class ModelOp {
         return "";
     }
 
-    virtual std::vector<ModelOpArg> impl_args([
-        [maybe_unused]] const Json &config) const {
+    virtual std::vector<ModelOpArg> impl_args(
+        [[maybe_unused]] const Json &config) const {
         return {};
     }
 
@@ -60,10 +60,14 @@ class ModelOp {
         return {{"NumTasks", 0}, {"NumWarps", 0}, {"SramBytes", 0}};
     }
 
+    void set_config(const std::string &config);
+
     void set_name(const std::string &name) { name_ = name; }
 
     ModelOpType type() const { return type_; }
 
+    const Json &config() const { return config_; }
+
     const std::string &name() const { return name_; }
 
     bool is_virtual() const { return is_virtual_; }
@@ -100,6 +104,7 @@ class ModelOp {
         const std::vector<ModelOpArg> &template_args = {});
 
     ModelOpType type_;
+    Json config_;
     std::string name_;
     bool is_virtual_;
     std::vector<ModelTensorRef> read_tensors_;
diff --git a/ark/ops/ops_arithmetic.cpp b/ark/ops/ops_arithmetic.cpp
index aeece0d77..ef85b5d22 100644
--- a/ark/ops/ops_arithmetic.cpp
+++ b/ark/ops/ops_arithmetic.cpp
@@ -12,9 +12,10 @@ ModelOpAdd::ModelOpAdd(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Add", input, other, output) {}
 
 Tensor Model::add(Tensor input, Tensor other, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpAdd>(name, input.ref_, other.ref_, output.ref_)
+        ->create_op<ModelOpAdd>(config, name, input.ref_, other.ref_,
+                                output.ref_)
         ->result_tensors()[0];
 }
 
@@ -23,9 +24,10 @@ ModelOpMul::ModelOpMul(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Mul", input, other, output) {}
 
 Tensor Model::mul(Tensor input, Tensor other, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpMul>(name, input.ref_, other.ref_, output.ref_)
+        ->create_op<ModelOpMul>(config, name, input.ref_, other.ref_,
+                                output.ref_)
         ->result_tensors()[0];
 }
 
@@ -34,9 +36,10 @@ ModelOpSub::ModelOpSub(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Sub", input, other, output) {}
 
 Tensor Model::sub(Tensor input, Tensor other, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpSub>(name, input.ref_, other.ref_, output.ref_)
+        ->create_op<ModelOpSub>(config, name, input.ref_, other.ref_,
+                                output.ref_)
         ->result_tensors()[0];
 }
 
@@ -45,9 +48,10 @@ ModelOpDiv::ModelOpDiv(ModelTensorRef input, ModelTensorRef other,
     : ModelOpBroadcast2("Div", input, other, output) {}
 
 Tensor Model::div(Tensor input, Tensor other, Tensor output,
-                  const std::string &name) {
+                  const std::string &config, const std::string &name) {
     return impl_
-        ->create_op<ModelOpDiv>(name, input.ref_, 
other.ref_, output.ref_) + ->create_op(config, name, input.ref_, other.ref_, + output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_cast.cpp b/ark/ops/ops_cast.cpp index 9873c8367..e9527ad8c 100644 --- a/ark/ops/ops_cast.cpp +++ b/ark/ops/ops_cast.cpp @@ -105,7 +105,7 @@ ModelOpByteCast::ModelOpByteCast(ModelTensorRef input, ModelDataType data_type, } Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { check_null(input.ref()); if (output.is_null()) { if (input.data_type() == data_type) { @@ -119,14 +119,14 @@ Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output, byte_cast_helper(input.ref(), data_type.ref(), new_shape, new_strides, new_offsets, new_padded_shape); return impl_ - ->create_op(name, input.ref(), data_type.ref(), - new_shape, new_strides, - new_offsets, new_padded_shape) + ->create_op( + config, name, input.ref(), data_type.ref(), new_shape, + new_strides, new_offsets, new_padded_shape) ->result_tensors()[0]; } } return impl_ - ->create_op(name, input.ref(), data_type.ref(), + ->create_op(config, name, input.ref(), data_type.ref(), output.ref()) ->result_tensors()[0]; } diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp index e335f869e..4e76d2ede 100644 --- a/ark/ops/ops_communication.cpp +++ b/ark/ops/ops_communication.cpp @@ -157,23 +157,25 @@ Json ModelOpRecv::default_config([[maybe_unused]] const ArchRef arch) const { } Tensor Model::send(Tensor input, int remote_rank, int tag, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { tags_.insert(tag); return impl_ - ->create_op(name, input.ref(), remote_rank, tag, + ->create_op(config, name, input.ref(), remote_rank, tag, output.ref()) ->result_tensors()[0]; } -Tensor Model::send_done(Tensor input, const std::string &name) { - return impl_->create_op(name, input.ref()) +Tensor Model::send_done(Tensor input, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref()) ->result_tensors()[0]; } Tensor Model::recv(Tensor output, int remote_rank, int tag, - const std::string &name) { + const std::string &config, const std::string &name) { tags_.insert(tag); - return impl_->create_op(name, output.ref(), remote_rank, tag) + return impl_ + ->create_op(config, name, output.ref(), remote_rank, tag) ->result_tensors()[0]; } diff --git a/ark/ops/ops_copy.cpp b/ark/ops/ops_copy.cpp index 4f32966b8..4914c34a4 100644 --- a/ark/ops/ops_copy.cpp +++ b/ark/ops/ops_copy.cpp @@ -20,8 +20,9 @@ ModelOpCopy::ModelOpCopy(ModelTensorRef input, ModelTensorRef output) verify(); } -Tensor Model::copy(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::copy(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp index 542c0fcac..466b9a4e5 100644 --- a/ark/ops/ops_embedding.cpp +++ b/ark/ops/ops_embedding.cpp @@ -70,9 +70,9 @@ Json ModelOpEmbedding::default_config([ } Tensor Model::embedding(Tensor input, Tensor weight, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, weight.ref_, + ->create_op(config, name, input.ref_, weight.ref_, output.ref_) 
->result_tensors()[0]; } diff --git a/ark/ops/ops_identity.cpp b/ark/ops/ops_identity.cpp index 065cd9a52..dd398d8a5 100644 --- a/ark/ops/ops_identity.cpp +++ b/ark/ops/ops_identity.cpp @@ -31,7 +31,7 @@ Tensor Model::identity(Tensor input, const std::vector &deps, for (auto &dep : deps) { deps_ref.emplace_back(dep.ref_); } - return impl_->create_op(name, input.ref_, deps_ref) + return impl_->create_op("", name, input.ref_, deps_ref) ->result_tensors()[0]; } diff --git a/ark/ops/ops_math.cpp b/ark/ops/ops_math.cpp index 1067c561a..b2833dcca 100644 --- a/ark/ops/ops_math.cpp +++ b/ark/ops/ops_math.cpp @@ -24,48 +24,55 @@ ModelOpMath::ModelOpMath(const std::string &type_name, ModelTensorRef input, ModelOpExp::ModelOpExp(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Exp", input, output) {} -Tensor Model::exp(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::exp(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpGelu::ModelOpGelu(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Gelu", input, output) {} -Tensor Model::gelu(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::gelu(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpRelu::ModelOpRelu(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Relu", input, output) {} -Tensor Model::relu(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::relu(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpRsqrt::ModelOpRsqrt(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Rsqrt", input, output) {} -Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpSigmoid::ModelOpSigmoid(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Sigmoid", input, output) {} -Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_ + ->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpSqrt::ModelOpSqrt(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Sqrt", input, output) {} -Tensor Model::sqrt(Tensor input, Tensor output, const std::string &name) { - return impl_->create_op(name, input.ref_, output.ref_) +Tensor Model::sqrt(Tensor input, Tensor output, const std::string &config, + const std::string &name) { + return impl_->create_op(config, name, input.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index a24b95d72..1976699a1 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -255,10 +255,10 @@ Json 
ModelOpMatmul::default_config(const ArchRef arch) const { Tensor Model::matmul(Tensor input, Tensor other, Tensor output, bool trans_input, bool trans_other, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref(), other.ref(), output.ref(), - trans_input, trans_other) + ->create_op(config, name, input.ref(), other.ref(), + output.ref(), trans_input, trans_other) ->result_tensors()[0]; } diff --git a/ark/ops/ops_noop.cpp b/ark/ops/ops_noop.cpp index 894ab29be..42fe5fdf5 100644 --- a/ark/ops/ops_noop.cpp +++ b/ark/ops/ops_noop.cpp @@ -30,7 +30,7 @@ Json ModelOpNoop::default_config([[maybe_unused]] const ArchRef arch) const { } void Model::noop(Tensor input, const std::string &name) { - impl_->create_op(name, input.ref_); + impl_->create_op("", name, input.ref_); } } // namespace ark diff --git a/ark/ops/ops_reduce.cpp b/ark/ops/ops_reduce.cpp index 1c91a2f0b..dadd049d2 100644 --- a/ark/ops/ops_reduce.cpp +++ b/ark/ops/ops_reduce.cpp @@ -128,25 +128,25 @@ Json ModelOpReduce::default_config([[maybe_unused]] const ArchRef arch) const { } Tensor Model::reduce_max(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, axis, keepdims, + ->create_op(config, name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } Tensor Model::reduce_mean(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, axis, keepdims, + ->create_op(config, name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } Tensor Model::reduce_sum(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, axis, keepdims, + ->create_op(config, name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_refer.cpp b/ark/ops/ops_refer.cpp index 782d6708c..68c61b30f 100644 --- a/ark/ops/ops_refer.cpp +++ b/ark/ops/ops_refer.cpp @@ -20,7 +20,7 @@ Tensor Model::refer(Tensor input, const Dims &shape, const Dims &strides, const Dims &offsets, const Dims &padded_shape, const std::string &name) { return impl_ - ->create_op(name, input.ref_, shape, strides, offsets, + ->create_op("", name, input.ref_, shape, strides, offsets, padded_shape) ->result_tensors()[0]; } diff --git a/ark/ops/ops_reshape.cpp b/ark/ops/ops_reshape.cpp index c4e192908..6ecbba466 100644 --- a/ark/ops/ops_reshape.cpp +++ b/ark/ops/ops_reshape.cpp @@ -199,8 +199,8 @@ Tensor Model::reshape(Tensor input, const Dims &shape, bool allowzero, reshape_helper(input.ref_, Dims{inferred_shape}, allowzero, new_shape, new_strides, new_offs); return impl_ - ->create_op(name, input.ref_, new_shape, new_strides, - new_offs) + ->create_op("", name, input.ref_, new_shape, + new_strides, new_offs) ->result_tensors()[0]; } diff --git a/ark/ops/ops_rope.cpp b/ark/ops/ops_rope.cpp index 06c1c915e..36015aae5 100644 --- a/ark/ops/ops_rope.cpp +++ b/ark/ops/ops_rope.cpp @@ -12,9 +12,10 @@ ModelOpRope::ModelOpRope(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Rope", input, other, output) {} Tensor Model::rope(Tensor input, Tensor other, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, 
input.ref_, other.ref_, output.ref_) + ->create_op(config, name, input.ref_, other.ref_, + output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_scalar.cpp b/ark/ops/ops_scalar.cpp index 944a7247c..b5c10f1c3 100644 --- a/ark/ops/ops_scalar.cpp +++ b/ark/ops/ops_scalar.cpp @@ -115,20 +115,21 @@ std::vector ModelOpScalarMul::impl_args([ Tensor Model::constant(float val, const Dims &shape, DataType data_type, const std::string &name) { return impl_ - ->create_op(name, val, shape, data_type.ref(), + ->create_op("", name, val, shape, data_type.ref(), nullptr) ->result_tensors()[0]; } -Tensor Model::copy(float val, Tensor output, const std::string &name) { +Tensor Model::copy(float val, Tensor output, const std::string &config, + const std::string &name) { if (output == NullTensor) { return impl_ - ->create_op(name, val, Dims{1}, FP32.ref(), - output.ref()) + ->create_op(config, name, val, Dims{1}, + FP32.ref(), output.ref()) ->result_tensors()[0]; } else { return impl_ - ->create_op(name, val, output.shape(), + ->create_op(config, name, val, output.shape(), output.data_type().ref(), output.ref()) ->result_tensors()[0]; @@ -136,30 +137,34 @@ Tensor Model::copy(float val, Tensor output, const std::string &name) { } Tensor Model::add(Tensor input, float value, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, value, output.ref_) + ->create_op(config, name, input.ref_, value, + output.ref_) ->result_tensors()[0]; } Tensor Model::sub(Tensor input, float value, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, -value, output.ref_) + ->create_op(config, name, input.ref_, -value, + output.ref_) ->result_tensors()[0]; } Tensor Model::mul(Tensor input, float value, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, value, output.ref_) + ->create_op(config, name, input.ref_, value, + output.ref_) ->result_tensors()[0]; } Tensor Model::div(Tensor input, float value, Tensor output, - const std::string &name) { + const std::string &config, const std::string &name) { return impl_ - ->create_op(name, input.ref_, 1 / value, output.ref_) + ->create_op(config, name, input.ref_, 1 / value, + output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_tensor.cpp b/ark/ops/ops_tensor.cpp index 0279ab311..77091fa57 100644 --- a/ark/ops/ops_tensor.cpp +++ b/ark/ops/ops_tensor.cpp @@ -27,7 +27,7 @@ Tensor Model::tensor(const Dims &shape, const DataType &data_type, const Dims &strides, const Dims &offsets, const Dims &padded_shape, const std::string &name) { return impl_ - ->create_op(name, nullptr, shape, data_type.ref(), + ->create_op("", name, nullptr, shape, data_type.ref(), strides, offsets, padded_shape) ->result_tensors()[0]; } diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp index 3f0ed0131..f099c7fb7 100644 --- a/ark/ops/ops_transpose.cpp +++ b/ark/ops/ops_transpose.cpp @@ -124,9 +124,10 @@ Json ModelOpTranspose::default_config([ } Tensor Model::transpose(Tensor input, const std::vector &permutation, - Tensor output, const std::string &name) { + Tensor output, const std::string &config, + const std::string &name) { return impl_ - ->create_op(name, input.ref_, permutation, + ->create_op(config, name, input.ref_, permutation, output.ref_) ->result_tensors()[0]; } diff --git a/arkprof.py b/arkprof.py index 
9e67c2dfc..5fb62e118 100644 --- a/arkprof.py +++ b/arkprof.py @@ -1,6 +1,7 @@ import ark import sys +ark.init() ark.Profiler(ark.Plan.from_file(sys.argv[1])).run( iter=1000, profile_processor_groups=False ) diff --git a/examples/tutorial/context_tutorial.py b/examples/tutorial/context_tutorial.py new file mode 100644 index 000000000..fb01f0a0c --- /dev/null +++ b/examples/tutorial/context_tutorial.py @@ -0,0 +1,117 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import ark +import time +import torch +import torch.nn.functional as F + + +class VanillaSoftmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + max = ark.reduce_max(input, axis=-1) + output = ark.sub(input, max) + output = ark.exp(output) + sum = ark.reduce_sum(output, axis=-1) + output = ark.div(output, sum) + return output + + +class Softmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + with ark.ContextManager( + processor_range=[0, 304], + warp_range=[0, 8], + sram_range=[0, 0], + task_id=0, + ): + max = ark.reduce_max( + input, + axis=-1, + config={ + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 0, + "NumTasks": 65536, + }, + ) + output = ark.sub( + input, + max, + config={ + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1, 2048], + "NumTasks": 65536, + }, + ) + output = ark.exp( + output, + config={ + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1, 2048], + "NumTasks": 65536, + }, + ) + sum = ark.reduce_sum( + output, + axis=-1, + config={ + "NumWarps": 1, + "ImplType": "WarpWise", + "SramBytes": 0, + "NumTasks": 65536, + }, + ) + output = ark.div( + output, + sum, + config={ + "NumWarps": 1, + "SramBytes": 0, + "Tile": [1, 2048], + "NumTasks": 65536, + }, + ) + return output + + +def eval(tensor: ark.Tensor): + with ark.Runtime() as rt: + rt.launch() + rt.run() + return tensor.to_torch() + + +def perf(): + with ark.Runtime() as rt: + rt.launch() + + start = time.time() + rt.run(iter=1000) + end = time.time() + return (end - start) / 1000 + + +if __name__ == "__main__": + ark.init() + + shape = (32, 2048, 2048) + + input = torch.randn(*shape).to("cuda:0") + + output = Softmax()(ark.Tensor.from_torch(input)) + + if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): + print("Correct result") + else: + print("Incorrect result") + + print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/python/ark/__init__.py b/python/ark/__init__.py index e96972906..00370e683 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import sys import os if os.environ.get("ARK_ROOT", None) is None: @@ -102,3 +101,4 @@ def set_world_size(world_size): ) from .planner import DefaultPlanner, Plan from .profiler import Profiler +from .context_manager import ContextManager diff --git a/python/ark/context_manager.py b/python/ark/context_manager.py new file mode 100644 index 000000000..443e1ca5d --- /dev/null +++ b/python/ark/context_manager.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
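(A minimal usage sketch of the ContextManager defined just below, mirroring the tutorial added above; the input tensor x is hypothetical:)

import ark

with ark.ContextManager(
    processor_range=[0, 304], warp_range=[0, 8], sram_range=[0, 0], task_id=0
):
    # Each keyword value is json.dumps-encoded and handed to _ContextManager.
    y = ark.exp(x, config={"NumWarps": 1, "SramBytes": 0, "Tile": [1, 2048], "NumTasks": 65536})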
+ +import json +from .model import Model +from ._ark_core import _ContextManager + + +class ContextManager(_ContextManager): + def __init__(self, **kwargs): + context_map = {key: json.dumps(value) for key, value in kwargs.items()} + super().__init__(Model.get_model(), context_map) + + def __enter__(self) -> "ContextManager": + """ + Enter the context manager. + """ + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + """ + Exit the context manager. + """ + del self diff --git a/python/ark/ops.py b/python/ark/ops.py index 86b021aef..509e3c891 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -1,7 +1,8 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from typing import List, Iterable, Union +import json +from typing import Any, Dict, List, Iterable, Union from .tensor import Dims, Tensor, Parameter, NullTensor from .data_type import DataType, fp32 @@ -12,6 +13,12 @@ def _is_list_or_tuple(obj): return isinstance(obj, list) or isinstance(obj, tuple) +def _config_to_str(config: Union[str, Dict[str, Any]]) -> str: + if isinstance(config, str): + return config + return json.dumps(config) + + def _tensor( shape: Iterable[int], dtype: DataType = fp32, @@ -50,6 +57,7 @@ def add( input: Union[Tensor, float], other: Union[Tensor, float], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "add", ) -> Union[Tensor, float]: """ @@ -73,12 +81,15 @@ def add( return input + other else: return Tensor( - Model.get_model().copy(input + other, output._tensor, name) + Model.get_model().copy( + input + other, output._tensor, _config_to_str(config), name + ) ) if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().add(a, b, output, name), runtime_id=input.runtime_id + Model.get_model().add(a, b, output, _config_to_str(config), name), + runtime_id=input.runtime_id, ) @@ -86,13 +97,16 @@ def cast( input: Tensor, dtype: DataType, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "cast", ) -> Tensor: """Type casting.""" if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().cast(input._tensor, dtype.ctype(), output, name), + Model.get_model().cast( + input._tensor, dtype.ctype(), output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -112,7 +126,10 @@ def constant( def copy( - input: Union[Tensor, float], output: Tensor = NullTensor, name: str = "copy" + input: Union[Tensor, float], + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "copy", ) -> Tensor: """Data copy.""" if output is not NullTensor: output = output._tensor if isinstance(input, Tensor): input = input._tensor return Tensor( - Model.get_model().copy(input, output, name), + Model.get_model().copy(input, output, _config_to_str(config), name), runtime_id=input.runtime_id, ) @@ -129,6 +146,7 @@ def div( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "div", ) -> Tensor: """ @@ -144,7 +162,9 @@ def div( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().div(input._tensor, other, output, name), + Model.get_model().div( + input._tensor, other, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -153,6 +173,7 @@ def embedding( input: Tensor, weight: Tensor, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str =
"embedding", ) -> Tensor: """Embedding layer.""" @@ -162,14 +183,17 @@ def embedding( output = output._tensor return Tensor( Model.get_model().embedding( - input._tensor, weight._tensor, output, name + input._tensor, weight._tensor, output, _config_to_str(config), name ), runtime_id=input.runtime_id, ) def exp( - input: Tensor, output: Tensor = NullTensor, name: str = "exp" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "exp", ) -> Tensor: """ Calculates the exponential of the `input` tensor, element-wise. @@ -179,13 +203,18 @@ def exp( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().exp(input._tensor, output, name), + Model.get_model().exp( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) def gelu( - input: Tensor, output: Tensor = NullTensor, name: str = "gelu" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "gelu", ) -> Tensor: """ Applies the Gaussian Error Linear Unit (GELU) activation @@ -198,7 +227,9 @@ def gelu( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().gelu(input._tensor, output, name), + Model.get_model().gelu( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -230,6 +261,7 @@ def matmul( output: Tensor = NullTensor, transpose_input: bool = False, transpose_other: bool = False, + config: Union[str, Dict[str, Any]] = "", name: str = "matmul", ) -> Tensor: """ @@ -252,6 +284,7 @@ def matmul( output, transpose_input, transpose_other, + _config_to_str(config), name, ), runtime_id=input.runtime_id, @@ -262,6 +295,7 @@ def mul( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "mul", ) -> Tensor: """ @@ -277,7 +311,9 @@ def mul( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().mul(input._tensor, other, output, name), + Model.get_model().mul( + input._tensor, other, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -294,6 +330,7 @@ def reduce_max( axis: int, keepdims: bool = True, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "reduce_max", ) -> Tensor: """ @@ -306,7 +343,7 @@ def reduce_max( output = output._tensor return Tensor( Model.get_model().reduce_max( - input._tensor, axis, keepdims, output, name + input._tensor, axis, keepdims, output, _config_to_str(config), name ), runtime_id=input.runtime_id, ) @@ -317,6 +354,7 @@ def reduce_mean( axis: int, keepdims: bool = True, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "reduce_mean", ) -> Tensor: """ @@ -329,7 +367,7 @@ def reduce_mean( output = output._tensor return Tensor( Model.get_model().reduce_mean( - input._tensor, axis, keepdims, output, name + input._tensor, axis, keepdims, output, _config_to_str(config), name ), runtime_id=input.runtime_id, ) @@ -340,6 +378,7 @@ def reduce_sum( axis: int, keepdims: bool = True, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "reduce_sum", ) -> Tensor: """ @@ -354,14 +393,17 @@ def reduce_sum( output = output._tensor return Tensor( Model.get_model().reduce_sum( - input._tensor, axis, keepdims, output, name + input._tensor, axis, keepdims, output, _config_to_str(config), name ), runtime_id=input.runtime_id, ) def relu( - input: Tensor, 
output: Tensor = NullTensor, name: str = "relu" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "relu", ) -> Tensor: """ Applies the ReLU activation function to the `input` tensor, @@ -372,7 +414,9 @@ def relu( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().relu(input._tensor, output, name), + Model.get_model().relu( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -411,6 +455,7 @@ def rope( input: Tensor, other: Tensor, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "rope", ) -> Tensor: """ @@ -423,13 +468,18 @@ def rope( if input.runtime_id != other.runtime_id: raise ValueError("Tensors must be on the same runtime") return Tensor( - Model.get_model().rope(input._tensor, other._tensor, output, name), + Model.get_model().rope( + input._tensor, other._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) def rsqrt( - input: Tensor, output: Tensor = NullTensor, name: str = "rsqrt" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "rsqrt", ) -> Tensor: """ Calculates the reciprocal of the square root of the `input` tensor, element-wise. @@ -439,7 +489,9 @@ def rsqrt( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().rsqrt(input._tensor, output, name), + Model.get_model().rsqrt( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -465,7 +517,10 @@ def sharding( def sigmoid( - input: Tensor, output: Tensor = NullTensor, name: str = "sigmoid" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "sigmoid", ) -> Tensor: """ Applies the Sigmoid activation function to the `input` tensor, @@ -476,13 +531,18 @@ def sigmoid( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().sigmoid(input._tensor, output, name), + Model.get_model().sigmoid( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) def sqrt( - input: Tensor, output: Tensor = NullTensor, name: str = "sqrt" + input: Tensor, + output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", + name: str = "sqrt", ) -> Tensor: """ Calculates the square root of the `input` tensor, element-wise.
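Every element-wise op in this file gains the same optional `config` argument; `_config_to_str` forwards a string unchanged and `json.dumps`-encodes a dict, so the two call forms below are equivalent (a hedged sketch; the tensor x is hypothetical):

y = ark.sqrt(x, config={"NumWarps": 1, "SramBytes": 0, "Tile": [1, 2048], "NumTasks": 65536})
y = ark.sqrt(x, config='{"NumWarps": 1, "SramBytes": 0, "Tile": [1, 2048], "NumTasks": 65536}')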
@@ -492,7 +552,9 @@ def sqrt( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().sqrt(input._tensor, output, name), + Model.get_model().sqrt( + input._tensor, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -501,6 +563,7 @@ def sub( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "sub", ) -> Tensor: """ @@ -516,7 +579,9 @@ def sub( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().sub(input._tensor, other, output, name), + Model.get_model().sub( + input._tensor, other, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -546,6 +611,7 @@ def transpose( input: Tensor, perm: Iterable[int], output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "transpose", ) -> Tensor: """ @@ -565,7 +631,9 @@ def transpose( if len(perm) > 4: raise ValueError("Only support perm up to 4 dimensions") return Tensor( - Model.get_model().transpose(input._tensor, perm, output, name), + Model.get_model().transpose( + input._tensor, perm, output, _config_to_str(config), name + ), runtime_id=input.runtime_id, ) @@ -578,10 +646,11 @@ def mean( axis: int, keepdims: bool = True, output: Tensor = NullTensor, + config: Union[str, Dict[str, Any]] = "", name: str = "mean", ) -> Tensor: """Alias of reduce_mean.""" - return reduce_mean(input, axis, keepdims, output, name) + return reduce_mean(input, axis, keepdims, output, config, name) def ones( diff --git a/python/ark/profiler.py b/python/ark/profiler.py index c161b24e6..e47f5b7aa 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -21,8 +21,15 @@ class Profiler: def __init__(self, plan: Plan): self.plan = plan - def run(self, iter: int = 1000, loop_mode: bool = True, profile_processor_groups: bool = False): - sys.stderr.write(f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n") + def run( + self, + iter: int = 1000, + loop_mode: bool = True, + profile_processor_groups: bool = False, + ): + sys.stderr.write( + f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n" + ) if not profile_processor_groups: return diff --git a/python/ark_py.cpp b/python/ark_py.cpp index 1bc4255d6..7acd4ad1a 100644 --- a/python/ark_py.cpp +++ b/python/ark_py.cpp @@ -7,6 +7,7 @@ namespace py = pybind11; +extern void register_context_manager(py::module &m); extern void register_data_type(py::module &m); extern void register_dims(py::module &m); extern void register_error(py::module &m); @@ -22,6 +23,7 @@ extern void register_version(py::module &m); PYBIND11_MODULE(_ark_core, m) { m.doc() = "Bind ARK C++ APIs to Python"; + register_context_manager(m); register_data_type(m); register_dims(m); register_error(m); diff --git a/python/context_manager_py.cpp b/python/context_manager_py.cpp new file mode 100644 index 000000000..3d703a4bc --- /dev/null +++ b/python/context_manager_py.cpp @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
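// Sketch of the binding this file defines; the pybind11 template arguments are
// reconstructed here as an assumption, matching the (Model, per-key JSON string
// map) pair that context_manager.py passes in:
//   py::class_<ark::ContextManager>(m, "_ContextManager")
//       .def(py::init<ark::Model &, const std::map<std::string, std::string> &>());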
+ +#include +#include +#include + +#include + +namespace py = pybind11; + +void register_context_manager(py::module &m) { + py::class_(m, "_ContextManager") + .def(py::init&>()); +} diff --git a/python/model_py.cpp b/python/model_py.cpp index 2d1e5f634..ba17251d8 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -15,97 +15,109 @@ void register_model(py::module &m) { .def(py::init(), py::arg("rank"), py::arg("world_size")) .def("rank", &ark::Model::rank) .def("world_size", &ark::Model::world_size) - .def("compress", &ark::Model::compress) + .def("compress", &ark::Model::compress, py::arg("merge_nodes") = false) .def("add", py::overload_cast(&ark::Model::add), + const std::string &, const std::string &>( + &ark::Model::add), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("add", py::overload_cast(&ark::Model::add), + const std::string &, const std::string &>( + &ark::Model::add), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("cast", &ark::Model::cast, py::arg("input"), py::arg("data_type"), - py::arg("output"), py::arg("name")) + py::arg("output"), py::arg("config"), py::arg("name")) .def("constant", &ark::Model::constant, py::arg("value"), py::arg("shape"), py::arg("data_type"), py::arg("name")) .def("copy", - py::overload_cast( - &ark::Model::copy), - py::arg("input"), py::arg("output"), py::arg("name")) + py::overload_cast(&ark::Model::copy), + py::arg("input"), py::arg("output"), py::arg("config"), + py::arg("name")) .def("copy", - py::overload_cast( - &ark::Model::copy), - py::arg("input"), py::arg("output"), py::arg("name")) + py::overload_cast(&ark::Model::copy), + py::arg("input"), py::arg("output"), py::arg("config"), + py::arg("name")) .def("div", py::overload_cast(&ark::Model::div), + const std::string &, const std::string &>( + &ark::Model::div), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("div", py::overload_cast(&ark::Model::div), + const std::string &, const std::string &>( + &ark::Model::div), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("embedding", &ark::Model::embedding, py::arg("input"), - py::arg("weight"), py::arg("output"), py::arg("name")) - .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"), + py::arg("weight"), py::arg("output"), py::arg("config"), py::arg("name")) + .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"), + py::arg("config"), py::arg("name")) .def("gelu", &ark::Model::gelu, py::arg("input"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("identity", &ark::Model::identity, py::arg("input"), py::arg("deps"), py::arg("name")) .def("matmul", &ark::Model::matmul, py::arg("input"), py::arg("other"), py::arg("output"), py::arg("trans_input"), py::arg("trans_other"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("mul", py::overload_cast(&ark::Model::mul), + const std::string &, const std::string &>( + &ark::Model::mul), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("mul", py::overload_cast(&ark::Model::mul), + const std::string &, const std::string &>( + &ark::Model::mul), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("noop", &ark::Model::noop, 
py::arg("input"), py::arg("name")) .def("reduce_max", &ark::Model::reduce_max, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("reduce_mean", &ark::Model::reduce_mean, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("reduce_sum", &ark::Model::reduce_sum, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("relu", &ark::Model::relu, py::arg("input"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("reshape", &ark::Model::reshape, py::arg("input"), py::arg("shape"), py::arg("allowzero"), py::arg("name")) .def("rope", &ark::Model::rope, py::arg("input"), py::arg("other"), - py::arg("output"), py::arg("name")) + py::arg("output"), py::arg("config"), py::arg("name")) .def("rsqrt", &ark::Model::rsqrt, py::arg("input"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("sharding", &ark::Model::sharding, py::arg("input"), py::arg("axis"), py::arg("dim_per_shard"), py::arg("name")) .def("sigmoid", &ark::Model::sigmoid, py::arg("input"), - py::arg("output"), py::arg("name")) + py::arg("output"), py::arg("config"), py::arg("name")) .def("sqrt", &ark::Model::sqrt, py::arg("input"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("sub", py::overload_cast(&ark::Model::sub), + const std::string &, const std::string &>( + &ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("sub", py::overload_cast(&ark::Model::sub), + const std::string &, const std::string &>( + &ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("name")) + py::arg("config"), py::arg("name")) .def("tensor", &ark::Model::tensor, py::arg("shape"), py::arg("data_type"), py::arg("strides"), py::arg("offsets"), py::arg("padded_shape"), py::arg("name")) .def("transpose", &ark::Model::transpose, py::arg("input"), - py::arg("permutation"), py::arg("output"), py::arg("name")); + py::arg("permutation"), py::arg("output"), py::arg("config"), + py::arg("name")); } From ef3bb84e8ebb3bb86e256767802401e39d617a85 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 29 Jul 2024 20:31:14 +0000 Subject: [PATCH 041/106] plan manager --- ark/api/context_manager_test.cpp | 1 - ark/api/model.cpp | 9 ++ ark/api/plan_manager.cpp | 97 ++++++++++++++++ ark/api/plan_manager_test.cpp | 58 ++++++++++ ark/api/planner.cpp | 125 +++++++++++++++------ ark/include/ark/model.hpp | 9 +- ark/include/ark/model_graph.hpp | 1 + ark/include/ark/plan_manager.hpp | 25 +++++ ark/model/model_graph_impl.cpp | 16 ++- ark/model/model_graph_impl.hpp | 6 +- examples/tutorial/context_tutorial.py | 117 ------------------- examples/tutorial/plan_manager_tutorial.py | 82 ++++++++++++++ python/ark/__init__.py | 2 +- python/ark/context_manager.py | 24 ---- python/ark/plan_manager.py | 34 ++++++ python/ark_py.cpp | 4 +- python/context_manager_py.cpp | 15 --- python/plan_manager_py.cpp | 15 +++ 18 files changed, 440 insertions(+), 200 deletions(-) create mode 100644 ark/api/plan_manager.cpp create mode 100644 ark/api/plan_manager_test.cpp create mode 100644 ark/include/ark/plan_manager.hpp delete mode 100644 examples/tutorial/context_tutorial.py create mode 100644 examples/tutorial/plan_manager_tutorial.py delete mode 100644 
python/ark/context_manager.py create mode 100644 python/ark/plan_manager.py delete mode 100644 python/context_manager_py.cpp create mode 100644 python/plan_manager_py.cpp diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp index ff60b43bf..5fff94f34 100644 --- a/ark/api/context_manager_test.cpp +++ b/ark/api/context_manager_test.cpp @@ -1,7 +1,6 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include "ark/model.hpp" #include "ark/context_manager.hpp" #include "model/model_node.hpp" diff --git a/ark/api/model.cpp b/ark/api/model.cpp index a5a258f71..e9604c341 100644 --- a/ark/api/model.cpp +++ b/ark/api/model.cpp @@ -9,6 +9,15 @@ namespace ark { +Model::Model(int rank, int world_size) : ModelGraph(rank, world_size) { + static size_t next_id = 0; + id_ = next_id++; +} + +Model::Model(const Model &other) : ModelGraph(other), id_(other.id()) {} + +size_t Model::id() const { return id_; } + Model Model::compress(bool merge_nodes) const { Model model(*this); model.compress_nodes(merge_nodes); diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp new file mode 100644 index 000000000..aee8d4f7b --- /dev/null +++ b/ark/api/plan_manager.cpp @@ -0,0 +1,97 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/plan_manager.hpp" + +#include "logging.h" +#include "model/model_json.hpp" +#include "model/model_graph_impl.hpp" + +namespace ark { + +class PlanManagerState { + public: + PlanManagerState() : sync(true) {} + bool sync; +}; + +static std::map gPlanManagerStates; + +PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_id_(model.id()), stop_sync_(false) { + auto ctx = Json::parse(plan_context); + if (!ctx.is_object()) { + ERR(ModelError, "plan context must be a JSON object"); + } + if (gPlanManagerStates.find(model_id_) == gPlanManagerStates.end()) { + gPlanManagerStates.emplace(model_id_, PlanManagerState()); + } + auto& state = gPlanManagerStates[model_id_]; + bool async = !state.sync; + std::map context_map; + for (const auto& [key, value] : ctx.items()) { + if (key == "sync") { + if (!value.is_boolean()) { + ERR(ModelError, "sync must be a boolean"); + } + if (state.sync && !value.get()) { + stop_sync_ = true; + state.sync = false; + context_map["AppendTask"] = "true"; + } else if (!state.sync) { + context_map["AppendTask"] = "true"; + } + } else if (key == "processor_range") { + if (!value.is_array()) { + ERR(ModelError, "processor_range must be an array"); + } + if (async) { + LOG(WARN, "Ignoring processor_range under sync=false context"); + continue; + } + context_map["ProcessorRange"] = value.dump(); + } else if (key == "warp_range") { + if (!value.is_array()) { + ERR(ModelError, "warp_range must be an array"); + } + if (async) { + LOG(WARN, "Ignoring warp_range under sync=false context"); + continue; + } + context_map["WarpRange"] = value.dump(); + } else if (key == "sram_range") { + if (!value.is_array()) { + ERR(ModelError, "sram_range must be an array"); + } + if (async) { + LOG(WARN, "Ignoring sram_range under sync=false context"); + continue; + } + context_map["SramRange"] = value.dump(); + } else if (key == "config") { + if (!value.is_object()) { + ERR(ModelError, "config must be an object"); + } + auto cfg = model.impl_->get_context("Config"); + if (cfg.empty()) { + context_map["Config"] = value.dump(); + } else { + auto cfg_obj = Json::parse(cfg); + for (const auto& [k, v] : value.items()) { + cfg_obj[k] = v; + } + context_map["Config"] = 
cfg_obj.dump(); + } + } else { + LOG(WARN, "Ignoring unknown plan context key: ", key); + } + } + context_manager_ = std::make_shared(model, context_map); +} + +PlanManager::~PlanManager() { + if (stop_sync_) { + gPlanManagerStates[model_id_].sync = true; + } +} + +} // namespace ark diff --git a/ark/api/plan_manager_test.cpp b/ark/api/plan_manager_test.cpp new file mode 100644 index 000000000..78f5d4cb8 --- /dev/null +++ b/ark/api/plan_manager_test.cpp @@ -0,0 +1,58 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/plan_manager.hpp" +#include "ark/planner.hpp" + +#include "model/model_json.hpp" +#include "unittest/unittest_utils.h" + +ark::unittest::State test_plan_manager() { + ark::Model model; + ark::Tensor t0 = model.tensor({1}, ark::FP32); + ark::Tensor t1 = model.tensor({1}, ark::FP32); + ark::Tensor t2 = model.add(t0, t1); + + ark::Tensor t3; + ark::Tensor t4; + ark::Tensor t5; + ark::Tensor t6; + { + ark::PlanManager pm_0(model, ark::Json({ + {"processor_range", {0, 2}}, + {"warp_range", {0, 4}}, + {"sram_range", {0, 0}}, + {"sync", false} + }).dump()); + t3 = model.relu(t2); + t4 = model.sqrt(t3); + } + { + ark::PlanManager pm_0(model, ark::Json({ + {"processor_range", {2, 4}}, + {"warp_range", {0, 4}}, + {"sram_range", {0, 0}} + }).dump()); + t5 = model.exp(t2); + + ark::PlanManager pm_1(model, ark::Json({ + {"processor_range", {2, 3}} + }).dump()); + t6 = model.rsqrt(t5); + } + + UNITTEST_TRUE(model.verify()); + + ark::DefaultPlanner planner(model, 0); + auto plan_str = planner.plan(); + ark::Json plan = ark::Json::parse(plan_str); + + UNITTEST_LOG(plan_str); + + return ark::unittest::SUCCESS; +} + +int main() { + UNITTEST(test_plan_manager); + return 0; +} diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index dba149a1e..1c40e5301 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -58,19 +58,35 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { size_t num_sm = gpu_info.num_sm; Json task_infos = Json::array(); Json processor_groups = Json::array(); - size_t max_num_warps = 1; - size_t max_num_processors = 1; - size_t next_node_id = 0; + size_t max_processor_id = 1; + size_t max_warp_id = 1; + size_t next_task_id = 0; + bool prev_append_task = false; + bool first_op = true; + + auto get_context = [&](const ModelNodeRef &node, + const std::string &key) -> Json { + if (node->context.find(key) != node->context.end()) { + return Json::parse(node->context.at(key)); + } + return Json(); + }; + for (const auto &node : model_.nodes()) { + std::string context = ""; + for (const auto &[key, value] : node->context) { + context += key + "=" + value + ","; + } + context += "prev_append_task=" + std::to_string(prev_append_task); + LOG(INFO, context); + for (const auto &op : node->ops) { if (op->is_virtual()) continue; - Json task_info; - task_info["Id"] = next_node_id++; - + auto ctx_config = get_context(node, "Config"); Json config; - if (!op->config().empty()) { - config = op->config(); + if (!ctx_config.empty()) { + config = ctx_config; } else if (!config_rules_.empty()) { const std::string op_str = op->serialize().dump(); for (auto &rule : config_rules_) { @@ -90,31 +106,70 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { size_t num_warps = config["NumWarps"]; size_t num_tasks = config["NumTasks"]; size_t sram_bytes = config["SramBytes"]; - task_info["NumWarps"] = num_warps; - task_info["SramBytes"] = sram_bytes; - - max_num_warps = std::max(max_num_warps, num_warps); - - task_info["Ops"] = 
Json::array(); - task_info["Ops"].push_back(op->serialize()); - task_info["Ops"][0]["Config"] = config; - task_infos.push_back(task_info); - - Json resource_group; - size_t num_processors = std::min(num_sm, num_tasks); - max_num_processors = std::max(max_num_processors, num_processors); - resource_group["ProcessorRange"] = {0, num_processors}; - resource_group["WarpRange"] = {0, num_warps}; - resource_group["SramRange"] = {0, sram_bytes}; - resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]}, - {"TaskRange", {0, num_tasks}}, - {"Granularity", 1}}}; - - Json processor_group; - processor_group["ProcessorRange"] = {0, num_processors}; - processor_group["ResourceGroups"] = Json::array(); - processor_group["ResourceGroups"].push_back(resource_group); - processor_groups.push_back(processor_group); + + auto ctx_append_task = get_context(node, "AppendTask"); + if (!ctx_append_task.empty() && ctx_append_task.get() && + prev_append_task) { + auto &task_info = task_infos.back(); + task_info["NumWarps"] = + std::max(task_info["NumWarps"].get(), num_warps); + task_info["SramBytes"] = + std::max(task_info["SramBytes"].get(), sram_bytes); + task_info["Ops"].push_back(op->serialize()); + task_info["Ops"].back()["Config"] = config; + } else { + Json task_info; + task_info["Id"] = first_op ? next_task_id : ++next_task_id; + task_info["NumWarps"] = num_warps; + task_info["SramBytes"] = sram_bytes; + task_info["Ops"] = Json::array(); + task_info["Ops"].push_back(op->serialize()); + task_info["Ops"][0]["Config"] = config; + task_infos.push_back(task_info); + + auto ctx_processor_range = get_context(node, "ProcessorRange"); + auto ctx_warp_range = get_context(node, "WarpRange"); + auto ctx_sram_range = get_context(node, "SramRange"); + + Json processor_group; + if (!ctx_processor_range.empty()) { + processor_group["ProcessorRange"] = ctx_processor_range; + max_processor_id = std::max( + max_processor_id, ctx_processor_range[1].get()); + } else { + size_t num_processors = std::min(num_sm, num_tasks); + processor_group["ProcessorRange"] = {0, num_processors}; + max_processor_id = + std::max(max_processor_id, num_processors); + } + + Json resource_group; + resource_group["ProcessorRange"] = + processor_group["ProcessorRange"]; + if (!ctx_warp_range.empty()) { + resource_group["WarpRange"] = ctx_warp_range; + max_warp_id = + std::max(max_warp_id, ctx_warp_range[1].get()); + } else { + resource_group["WarpRange"] = {0, num_warps}; + max_warp_id = std::max(max_warp_id, num_warps); + } + if (!ctx_sram_range.empty()) { + resource_group["SramRange"] = ctx_sram_range; + } else { + resource_group["SramRange"] = {0, sram_bytes}; + } + resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]}, + {"TaskRange", {0, num_tasks}}, + {"Granularity", 1}}}; + + processor_group["ResourceGroups"] = Json::array(); + processor_group["ResourceGroups"].push_back(resource_group); + processor_groups.push_back(processor_group); + } + prev_append_task = + !ctx_append_task.empty() && ctx_append_task.get(); + first_op = false; } } @@ -122,8 +177,8 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { plan["Rank"] = model_.rank(); plan["WorldSize"] = model_.world_size(); plan["Architecture"] = gpu_info.arch->name(); - plan["NumProcessors"] = max_num_processors; - plan["NumWarpsPerProcessor"] = max_num_warps; + plan["NumProcessors"] = max_processor_id; + plan["NumWarpsPerProcessor"] = max_warp_id; plan["TaskInfos"] = task_infos; plan["ProcessorGroups"] = processor_groups; diff --git a/ark/include/ark/model.hpp 
b/ark/include/ark/model.hpp index 35efe53d5..e0b17be52 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -17,15 +17,20 @@ namespace ark { class Model : public ModelGraph { private: + size_t id_; std::set tags_; public: - Model(int rank = 0, int world_size = 1) : ModelGraph(rank, world_size) {} - Model(const Model &other) : ModelGraph(other) {} + Model(int rank = 0, int world_size = 1); + + Model(const Model &other); + ~Model() {} Model &operator=(const Model &other) = default; + size_t id() const; + Model compress(bool merge_nodes = false) const; int unique_tag(); diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp index f6390a2a9..c53c98c3a 100644 --- a/ark/include/ark/model_graph.hpp +++ b/ark/include/ark/model_graph.hpp @@ -38,6 +38,7 @@ class ModelGraph { protected: friend class Model; + friend class PlanManager; friend class ContextManager; class Impl; diff --git a/ark/include/ark/plan_manager.hpp b/ark/include/ark/plan_manager.hpp new file mode 100644 index 000000000..3952a1c06 --- /dev/null +++ b/ark/include/ark/plan_manager.hpp @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_PLAN_MANAGER_HPP +#define ARK_PLAN_MANAGER_HPP + +#include + +namespace ark { + +class PlanManager { + public: + PlanManager(Model& model, const std::string& plan_context); + + ~PlanManager(); + + private: + size_t model_id_; + bool stop_sync_; + std::shared_ptr context_manager_; +}; + +} // namespace ark + +#endif // ARK_PLAN_MANAGER_HPP diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp index 53a7fa851..385424e57 100644 --- a/ark/model/model_graph_impl.cpp +++ b/ark/model/model_graph_impl.cpp @@ -37,7 +37,15 @@ void ModelGraphContextStack::pop(const std::string &key) { it->second.pop_back(); } -std::map ModelGraphContextStack::current_context() const { +std::string ModelGraphContextStack::get_context(const std::string &key) const { + if (this->storage_.find(key) == this->storage_.end() || + this->storage_.at(key).empty()) { + return ""; + } + return *this->storage_.at(key).back(); +} + +std::map ModelGraphContextStack::get_context_all() const { std::map cur; for (const auto &pair : this->storage_) { if (!pair.second.empty()) { @@ -167,6 +175,10 @@ bool ModelGraph::Impl::verify() const { return true; } +std::string ModelGraph::Impl::get_context(const std::string &key) const { + return context_stack_->get_context(key); +} + ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) { for (auto &tns : op->input_tensors()) { if (tensor_to_producer_op_.find(tns) == tensor_to_producer_op_.end()) { @@ -205,7 +217,7 @@ ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) { producer->consumers.push_back(node); } - node->context = context_stack_->current_context(); + node->context = context_stack_->get_context_all(); nodes_.push_back(node); return node; diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp index fbfc54c7e..ec255423e 100644 --- a/ark/model/model_graph_impl.hpp +++ b/ark/model/model_graph_impl.hpp @@ -34,7 +34,9 @@ class ModelGraphContextStack { void pop(const std::string &key); - std::map current_context() const; + std::string get_context(const std::string &key) const; + + std::map get_context_all() const; }; class ModelGraph::Impl { @@ -80,6 +82,8 @@ class ModelGraph::Impl { bool verify() const; + std::string get_context(const std::string &key) const; + std::string serialize(bool pretty = true) const; std::vector nodes() const; diff --git 
a/examples/tutorial/context_tutorial.py b/examples/tutorial/context_tutorial.py deleted file mode 100644 index fb01f0a0c..000000000 --- a/examples/tutorial/context_tutorial.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import ark -import time -import torch -import torch.nn.functional as F - - -class VanillaSoftmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - max = ark.reduce_max(input, axis=-1) - output = ark.sub(input, max) - output = ark.exp(output) - sum = ark.reduce_sum(output, axis=-1) - output = ark.div(output, sum) - return output - - -class Softmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - with ark.ContextManager( - processor_range=[0, 304], - warp_range=[0, 8], - sram_range=[0, 0], - task_id=0, - ): - max = ark.reduce_max( - input, - axis=-1, - config={ - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536, - }, - ) - output = ark.sub( - input, - max, - config={ - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1, 2048], - "NumTasks": 65536, - }, - ) - output = ark.exp( - output, - config={ - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1, 2048], - "NumTasks": 65536, - }, - ) - sum = ark.reduce_sum( - output, - axis=-1, - config={ - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536, - }, - ) - output = ark.div( - output, - sum, - config={ - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1, 2048], - "NumTasks": 65536, - }, - ) - return output - - -def eval(tensor: ark.Tensor): - with ark.Runtime() as rt: - rt.launch() - rt.run() - return tensor.to_torch() - - -def perf(): - with ark.Runtime() as rt: - rt.launch() - - start = time.time() - rt.run(iter=1000) - end = time.time() - return (end - start) / 1000 - - -if __name__ == "__main__": - ark.init() - - shape = (32, 2048, 2048) - - input = torch.randn(*shape).to("cuda:0") - - output = Softmax()(ark.Tensor.from_torch(input)) - - if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): - print("Correct result") - else: - print("Incorrect result") - - print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py new file mode 100644 index 000000000..25aca7af6 --- /dev/null +++ b/examples/tutorial/plan_manager_tutorial.py @@ -0,0 +1,82 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
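Compared with the deleted ContextManager tutorial, the rewrite below relies on two PlanManager behaviors from plan_manager.cpp: nested managers merge their `config` dicts with inner keys overriding outer ones, and `sync=False` groups the enclosed operators into one fused task group. A minimal sketch of the nesting (tensor x hypothetical):

with ark.PlanManager(sync=False, config={"NumWarps": 1, "SramBytes": 0, "NumTasks": 65536}):
    with ark.PlanManager(config={"ImplType": "WarpWise"}):
        # Effective config here: NumWarps/SramBytes/NumTasks from the outer
        # manager plus ImplType from the inner one.
        m = ark.reduce_max(x, axis=-1)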
+ +import ark +import time +import torch +import torch.nn.functional as F + + +class VanillaSoftmax(ark.Module): + def __init__(self): + super(VanillaSoftmax, self).__init__() + + def forward(self, input): + max = ark.reduce_max(input, axis=-1) + output = ark.sub(input, max) + output = ark.exp(output) + sum = ark.reduce_sum(output, axis=-1) + output = ark.div(output, sum) + return output + + +class Softmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + with ark.PlanManager( + processor_range=[0, 304], + warp_range=[0, 8], + sram_range=[0, 0], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + "NumTasks": 65536, + } + ): + with ark.PlanManager(config={"ImplType": "WarpWise"}): + max = ark.reduce_max(input, axis=-1) + with ark.PlanManager(config={"Tile": [1, 2048]}): + output = ark.sub(input, max) + output = ark.exp(output) + with ark.PlanManager(config={"ImplType": "WarpWise"}): + sum = ark.reduce_sum(output, axis=-1) + with ark.PlanManager(config={"Tile": [1, 2048]}): + output = ark.div(output, sum) + return output + + +def eval(tensor: ark.Tensor): + with ark.Runtime() as rt: + rt.launch() + rt.run() + return tensor.to_torch() + + +def perf(): + with ark.Runtime() as rt: + rt.launch() + + start = time.time() + rt.run(iter=1000) + end = time.time() + return (end - start) / 1000 + + +if __name__ == "__main__": + ark.init() + + shape = (32, 2048, 2048) + + input = torch.randn(*shape).to("cuda:0") + + output = Softmax()(ark.Tensor.from_torch(input)) + + if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): + print("Correct result") + else: + print("Incorrect result") + + print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 00370e683..db19b59d4 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -101,4 +101,4 @@ def set_world_size(world_size): ) from .planner import DefaultPlanner, Plan from .profiler import Profiler -from .context_manager import ContextManager +from .plan_manager import PlanManager diff --git a/python/ark/context_manager.py b/python/ark/context_manager.py deleted file mode 100644 index 443e1ca5d..000000000 --- a/python/ark/context_manager.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -from .model import Model -from ._ark_core import _ContextManager - - -class ContextManager(_ContextManager): - def __init__(self, **kwargs): - context_map = {key: json.dumps(value) for key, value in kwargs.items()} - super().__init__(Model.get_model(), context_map) - - def __enter__(self) -> "ContextManager": - """ - Enter the context manager. - """ - return self - - def __exit__(self, exc_type, exc_value, exc_tb): - """ - Exit the context manager. - """ - del self diff --git a/python/ark/plan_manager.py b/python/ark/plan_manager.py new file mode 100644 index 000000000..80e615ab8 --- /dev/null +++ b/python/ark/plan_manager.py @@ -0,0 +1,34 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import json +from typing import List, Dict, Any +from .model import Model +from ._ark_core import _PlanManager + + +class PlanManager(_PlanManager): + def __init__(self, **kwargs): + """ + Plan manager for specifying the parallelization and tiling configuration of the operators in the context. + + Args: + processor_range (List[int], optional): The range of processors to be used. Defaults to None.
+ warp_range (List[int], optional): The range of warps to be used. Defaults to None. + sram_range (List[int], optional): The range of SRAMs to be used. Defaults to None. + sync (bool, optional): Whether to synchronize the execution. Defaults to True. + config (Dict[str, Any], optional): The configuration for the operators. Defaults to None. + """ + super().__init__(Model.get_model(), json.dumps(kwargs)) + + def __enter__(self) -> "PlanManager": + """ + Enter the plan manager. + """ + return self + + def __exit__(self, exc_type, exc_value, exc_tb): + """ + Exit the plan manager. + """ + del self diff --git a/python/ark_py.cpp b/python/ark_py.cpp index 7acd4ad1a..75788ba55 100644 --- a/python/ark_py.cpp +++ b/python/ark_py.cpp @@ -7,7 +7,7 @@ namespace py = pybind11; -extern void register_context_manager(py::module &m); +extern void register_plan_manager(py::module &m); extern void register_data_type(py::module &m); extern void register_dims(py::module &m); extern void register_error(py::module &m); @@ -23,7 +23,7 @@ extern void register_version(py::module &m); PYBIND11_MODULE(_ark_core, m) { m.doc() = "Bind ARK C++ APIs to Python"; - register_context_manager(m); + register_plan_manager(m); register_data_type(m); register_dims(m); register_error(m); diff --git a/python/context_manager_py.cpp b/python/context_manager_py.cpp deleted file mode 100644 index 3d703a4bc..000000000 --- a/python/context_manager_py.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include -#include -#include - -#include - -namespace py = pybind11; - -void register_context_manager(py::module &m) { - py::class_(m, "_ContextManager") - .def(py::init&>()); -} diff --git a/python/plan_manager_py.cpp b/python/plan_manager_py.cpp new file mode 100644 index 000000000..34aa0b77c --- /dev/null +++ b/python/plan_manager_py.cpp @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
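// Sketch of the binding below, template arguments again reconstructed as an
// assumption. Unlike _ContextManager, which took a per-key map of pre-encoded
// JSON strings, _PlanManager receives the whole keyword set as one JSON object
// string that PlanManager's constructor parses and validates:
//   py::class_<ark::PlanManager>(m, "_PlanManager")
//       .def(py::init<ark::Model &, const std::string &>());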
+ +#include +#include +#include + +#include + +namespace py = pybind11; + +void register_plan_manager(py::module &m) { + py::class_(m, "_PlanManager") + .def(py::init()); +} From 7a7f70e43d3e6e327abf5fe835fad1902c803ca0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 30 Jul 2024 04:45:27 +0000 Subject: [PATCH 042/106] fix --- ark/api/plan_manager.cpp | 8 ++++---- ark/api/planner.cpp | 22 ++++++++-------------- examples/tutorial/plan_manager_tutorial.py | 3 +-- python/ark/tensor.py | 7 +++++-- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp index aee8d4f7b..8cb1940b1 100644 --- a/ark/api/plan_manager.cpp +++ b/ark/api/plan_manager.cpp @@ -17,7 +17,9 @@ class PlanManagerState { static std::map gPlanManagerStates; -PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_id_(model.id()), stop_sync_(false) { +PlanManager::PlanManager(Model& model, const std::string& plan_context) + : model_id_(model.id()), stop_sync_(false) { + static int task_group_id = 0; auto ctx = Json::parse(plan_context); if (!ctx.is_object()) { ERR(ModelError, "plan context must be a JSON object"); @@ -36,9 +38,7 @@ PlanManager::PlanManager(Model& model, const std::string& plan_context) : model_ if (state.sync && !value.get()) { stop_sync_ = true; state.sync = false; - context_map["AppendTask"] = "true"; - } else if (!state.sync) { - context_map["AppendTask"] = "true"; + context_map["TaskGroupId"] = std::to_string(task_group_id++); } } else if (key == "processor_range") { if (!value.is_array()) { diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index 1c40e5301..032be0d6f 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -61,7 +61,7 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { size_t max_processor_id = 1; size_t max_warp_id = 1; size_t next_task_id = 0; - bool prev_append_task = false; + int prev_task_group_id = -1; bool first_op = true; auto get_context = [&](const ModelNodeRef &node, @@ -73,13 +73,6 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { }; for (const auto &node : model_.nodes()) { - std::string context = ""; - for (const auto &[key, value] : node->context) { - context += key + "=" + value + ","; - } - context += "prev_append_task=" + std::to_string(prev_append_task); - LOG(INFO, context); - for (const auto &op : node->ops) { if (op->is_virtual()) continue; @@ -106,10 +99,12 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const { size_t num_warps = config["NumWarps"]; size_t num_tasks = config["NumTasks"]; size_t sram_bytes = config["SramBytes"]; + size_t granularity = config.value("Granularity", 1); - auto ctx_append_task = get_context(node, "AppendTask"); - if (!ctx_append_task.empty() && ctx_append_task.get() && - prev_append_task) { + auto ctx_task_group_id = get_context(node, "TaskGroupId"); + int task_group_id = + ctx_task_group_id.empty() ? 
-1 : ctx_task_group_id.get<int>();
+            if (task_group_id != -1 && task_group_id == prev_task_group_id) {
                 auto &task_info = task_infos.back();
                 task_info["NumWarps"] =
                     std::max(task_info["NumWarps"].get<size_t>(), num_warps);
@@ -161,14 +156,13 @@ std::string DefaultPlanner::Impl::plan(bool pretty) const {
             }
             resource_group["TaskGroups"] = {{{"TaskId", task_info["Id"]},
                                              {"TaskRange", {0, num_tasks}},
-                                             {"Granularity", 1}}};
+                                             {"Granularity", granularity}}};
             processor_group["ResourceGroups"] = Json::array();
             processor_group["ResourceGroups"].push_back(resource_group);
             processor_groups.push_back(processor_group);
         }
 
-        prev_append_task =
-            !ctx_append_task.empty() && ctx_append_task.get<bool>();
+        prev_task_group_id = task_group_id;
         first_op = false;
     }
 }
diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py
index 25aca7af6..c840ce0c0 100644
--- a/examples/tutorial/plan_manager_tutorial.py
+++ b/examples/tutorial/plan_manager_tutorial.py
@@ -26,7 +26,6 @@ def __init__(self):
 
     def forward(self, input):
         with ark.PlanManager(
-            processor_range=[0, 304],
             warp_range=[0, 8],
             sram_range=[0, 0],
             sync=False,
@@ -34,7 +33,7 @@ def forward(self, input):
                 "NumWarps": 1,
                 "SramBytes": 0,
                 "NumTasks": 65536,
-            }
+            },
         ):
             with ark.PlanManager(config={"ImplType": "WarpWise"}):
                 max = ark.reduce_max(input, axis=-1)
diff --git a/python/ark/tensor.py b/python/ark/tensor.py
index 657da1065..eed7a4259 100644
--- a/python/ark/tensor.py
+++ b/python/ark/tensor.py
@@ -193,7 +193,9 @@ def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor":
         ark_tensor = _Tensor(dl_capsule, ark_dtype.ctype())
         return Tensor(ark_tensor, runtime_id=runtime_id)
 
-    def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
+    def copy(
+        self, data: Union[np.ndarray, torch.Tensor], stream: int = 0
+    ) -> "Tensor":
         """
         Copies data into this tensor. The data type may differ,
         but the size must match.
@@ -214,6 +216,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
                 self._tensor,
                 data.data_ptr(),
                 tensor_bytes,
+                stream,
                 data.device.type == "cuda",
             )
         elif isinstance(data, np.ndarray):
@@ -221,7 +224,7 @@ def copy(self, data: Union[np.ndarray, torch.Tensor]) -> "Tensor":
                 data = np.ascontiguousarray(data)
             if data.nbytes != tensor_bytes:
                 raise ValueError("data size does not match the tensor")
-            rt.executor.tensor_write(self._tensor, data)
+            rt.executor.tensor_write(self._tensor, data, stream)
         else:
             raise ValueError("data must be a numpy array or a torch tensor")
         return self

From a77a2ea6b864562f4e916dbaaf30f82e080aad93 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 30 Jul 2024 05:48:00 +0000
Subject: [PATCH 043/106] llama example

---
 examples/llama/model_7b_b1_s2048.py | 704 ++++++++++++++++++++++++++++
 examples/llama/model_test.py        |   6 +-
 2 files changed, 708 insertions(+), 2 deletions(-)
 create mode 100644 examples/llama/model_7b_b1_s2048.py

diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py
new file mode 100644
index 000000000..f41304e85
--- /dev/null
+++ b/examples/llama/model_7b_b1_s2048.py
@@ -0,0 +1,704 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+"""LLaMA 2 Transformer model.
+ Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py +""" + +import ark +import math +from dataclasses import dataclass +from typing import Optional +import os + + +@dataclass +class ModelArgs: + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = ( + 256 # make SwiGLU hidden layer size multiple of large power of 2 + ) + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 2048 + + +@dataclass +class ModelArgs7B(ModelArgs): + dim: int = 4096 + n_layers: int = 32 + n_heads: int = 32 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = ( + 256 # make SwiGLU hidden layer size multiple of large power of 2 + ) + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 2048 + + +@dataclass +class ModelArgs13B(ModelArgs): + dim: int = 5120 + n_layers: int = 40 + n_heads: int = 40 + n_kv_heads: Optional[int] = None + vocab_size: int = -1 # defined later by tokenizer + multiple_of: int = ( + 256 # make SwiGLU hidden layer size multiple of large power of 2 + ) + ffn_dim_multiplier: Optional[float] = None + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 2048 + + +@dataclass +class ModelArgs70B(ModelArgs): + dim: int = 8192 + n_layers: int = 80 + n_heads: int = 64 + n_kv_heads: Optional[int] = 8 + vocab_size: int = -1 + multiple_of: int = ( + 4096 # make SwiGLU hidden layer size multiple of large power of 2 + ) + ffn_dim_multiplier: Optional[float] = 1.3 + norm_eps: float = 1e-5 + max_batch_size: int = 32 + max_seq_len: int = 4096 + + +class RMSNorm(ark.Module): + """ + Root mean square layer normalization (RMSNorm). + """ + + def __init__( + self, dim: int, eps: float = 1e-6, dtype: ark.DataType = ark.fp16 + ): + super().__init__() + self.eps = eps + self.dtype = dtype + self.weight = ark.parameter([1, 1, dim], ark.fp32) + + def forward(self, x): + with ark.PlanManager( + warp_range=[0, 8], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + "NumTasks": 2048, + "Granularity": 7, + }, + ): + with ark.PlanManager(config={"Tile": [1, 4096]}): + x = ark.cast(x, ark.fp32) + x2 = ark.mul(x, x) + with ark.PlanManager(config={"ImplType": "WarpWise"}): + mean = ark.reduce_mean(x2, axis=-1) + with ark.PlanManager( + config={ + "NumWarps": 1, + "SramBytes": 0, + "Tile": [64, 1], + "NumTasks": 32, + } + ): + rrms = ark.rsqrt(mean) + with ark.PlanManager( + warp_range=[0, 8], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + "NumTasks": 2048, + "Tile": [1, 4096], + "Granularity": 7, + }, + ): + x = ark.mul(x, rrms) + x = ark.mul(x, self.weight, x) + return ark.cast(x, self.dtype) + + +class ColumnParallelLinear(ark.Module): + """Linear layer with column parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its second dimension as A = [A_1, ..., A_p]. + Here the weight = A^T, so we need to partition the weight matrix along + its first dimension. 
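+
+    For example, with world_size=2 each rank holds a [out_dim // 2, in_dim]
+    weight shard and computes a [..., out_dim // 2] partial output; with
+    gather_output=True these partials are all-gathered into the full
+    [..., out_dim] result.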
+ + """ + + def __init__( + self, + in_dim: int, + out_dim: int, + dtype: ark.DataType = ark.fp16, + gather_output: bool = True, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.dtype = dtype + self.local_rank = local_rank + self.world_size = world_size + self.gather_output = gather_output + + self.weight = ark.parameter([out_dim // world_size, in_dim], dtype) + + def forward(self, x): + if self.world_size == 1 or self.gather_output == False: + return ark.matmul(x, self.weight, transpose_other=True) + # We need to concat the output_tensor_shards along the last dimension + output_tensor = ark.tensor( + [x.shape()[0], x.shape()[1], self.out_dim], self.dtype + ) + output_tensor_shards = ark.sharding( + output_tensor, + axis=2, + dim_per_shard=self.out_dim // self.world_size, + ) + local_result = ark.identity( + output_tensor_shards[self.local_rank], deps=output_tensor_shards + ) + # (batch_size, seq_len, out_dim // world_size) + local_result = ark.matmul( + x, self.weight, local_result, transpose_other=True + ) + gather_input = ark.identity(output_tensor, deps=[local_result]) + # return gather_input + gather_reshape = ark.reshape( + gather_input, [x.shape()[0] * x.shape()[1], self.out_dim] + ) + gather_out = ark.local_all_gather( + gather_reshape, self.local_rank, self.world_size, 1 + ) + return ark.reshape( + gather_out, [x.shape()[0], x.shape()[1], self.out_dim] + ) + + +class RowParallelLinear(ark.Module): + """Linear layer with row parallelism. + + The linear layer is defined as Y = XA + b. A is parallelized along + its first dimension and X along its second dimension as: + - - + | A_1 | + | . | + A = | . | X = [X_1, ..., X_p] + | . | + | A_p | + - - + + Here the weight = A^T, so we need to partition the weight matrix along + its second dimension. 
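+
+    With input_is_parallel=True the input is assumed to be already split
+    along its last dimension, so forward() skips the sharding step and goes
+    straight to the local matmul followed by an all-reduce.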
+ """ + + def __init__( + self, + in_dim: int, + out_dim: int, + dtype: ark.DataType = ark.fp16, + input_is_parallel: bool = False, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.dtype = dtype + self.local_rank = local_rank + self.world_size = world_size + self.input_is_parallel = input_is_parallel + + self.weight = ark.parameter([out_dim, in_dim // world_size], dtype) + + def forward(self, x): + if self.world_size == 1: + return ark.matmul(x, self.weight, transpose_other=True) + x_ndims = len(x.shape()) + if self.input_is_parallel: + input_parallel = x + else: + x_shards = ark.sharding( + x, x_ndims - 1, self.in_dim // self.world_size + ) + input_parallel = x_shards[self.local_rank] + local_result = ark.matmul( + input_parallel, self.weight, transpose_other=True + ) + reduced_result = ark.local_all_reduce( + local_result, self.local_rank, self.world_size + ) + return reduced_result + + +class ParallelEmbedding(ark.Module): + """Embedding layer.""" + + def __init__( + self, + vocab_size: int, + dim: int, + dtype: ark.DataType, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.vocab_size = vocab_size + self.dim = dim + self.weight = ark.parameter([vocab_size, dim // world_size], dtype) + self.out_dim = dim + self.dtype = dtype + self.world_size = world_size + self.local_rank = local_rank + + def forward(self, x): + if self.world_size == 1: + return ark.embedding(x, self.weight) + + output_tensor = ark.tensor( + [x.shape()[0], x.shape()[1], self.out_dim], self.dtype + ) + output_tensor_shards = ark.sharding( + output_tensor, axis=2, dim_per_shard=self.out_dim // self.world_size + ) + local_result = ark.identity( + output_tensor_shards[self.local_rank], deps=output_tensor_shards + ) + local_result = ark.embedding(x, self.weight, local_result) + gather_input = ark.identity(output_tensor, deps=[local_result]) + gather_reshape = ark.reshape( + gather_input, [x.shape()[0] * x.shape()[1], self.out_dim] + ) + gather_out = ark.local_all_gather( + gather_reshape, self.local_rank, self.world_size, 1 + ) + return ark.reshape( + gather_out, [x.shape()[0], x.shape()[1], self.out_dim] + ) + + +class Linear(ark.Module): + """ + Linear layer module with weights and no bias. 
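+
+    Computes y = x @ W^T with a weight W of shape [out_dim, in_dim].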
+ """ + + def __init__( + self, in_dim: int, out_dim: int, dtype: ark.DataType = ark.fp16 + ): + super().__init__() + self.dtype = dtype + self.weight = ark.parameter([out_dim, in_dim], dtype) + + def forward(self, x): + return ark.matmul(x, self.weight, transpose_other=True) + + +class Silu(ark.Module): + """ + Silu activation function, silu(x) = x * sigmoid(x) + """ + + def __init__(self): + super().__init__() + + def forward(self, x: ark.Tensor): + # We need to specify output tensor so that the sigmoid op will not be an in-place operator + output = ark.tensor(x.shape(), x.dtype()) + x1 = ark.sigmoid(x, output) + return ark.mul(x, x1) + + +class FeedForward(ark.Module): + def __init__( + self, + dim: int, + hidden_dim: int, + multiple_of: int, + ffn_dim_multiplier: Optional[float], + dtype: ark.DataType = ark.fp16, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + hidden_dim = int(ffn_dim_multiplier * hidden_dim) + hidden_dim = multiple_of * ( + (hidden_dim + multiple_of - 1) // multiple_of + ) + + self.w1 = ColumnParallelLinear( + dim, hidden_dim, dtype, False, local_rank, world_size + ) + self.w2 = RowParallelLinear( + hidden_dim, dim, dtype, True, local_rank, world_size + ) + self.w3 = ColumnParallelLinear( + dim, hidden_dim, dtype, False, local_rank, world_size + ) + + def forward(self, x): + # self.w2(F.silu(self.w1(x)) * self.w3(x)) + with ark.PlanManager( + warp_range=[0, 8], + sram_range=[0, 49344], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 688, + }, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + x1 = self.w1(x) + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + x1 = Silu()(x1) + with ark.PlanManager( + warp_range=[0, 8], + sram_range=[0, 49344], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 688, + }, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + x2 = self.w3(x) + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + x3 = ark.mul(x1, x2) + x4 = self.w2(x3) + return x4 + + +def apply_rotary_emb(xq, xk, freqs_cis): + """ + Apply rotary embeddings to xq and xk. 
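+
+    Both tensors are rotated with the same precomputed frequency tensor
+    (freqs_cis) via ark.rope.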
+ """ + xq_out = ark.rope(xq, freqs_cis) + xk_out = ark.rope(xk, freqs_cis) + return xq_out, xk_out + + +class Softmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + with ark.PlanManager( + warp_range=[0, 8], + sram_range=[0, 0], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + "NumTasks": 65536, + }, + ): + with ark.PlanManager(config={"ImplType": "WarpWise"}): + max = ark.reduce_max(input, axis=-1) + with ark.PlanManager(config={"Tile": [1, 2048]}): + output = ark.sub(input, max) + output = ark.exp(output) + with ark.PlanManager(config={"ImplType": "WarpWise"}): + sum = ark.reduce_sum(output, axis=-1) + with ark.PlanManager(config={"Tile": [1, 2048]}): + output = ark.div(output, sum) + return output + + +class Attention(ark.Module): + def __init__( + self, + args: ModelArgs, + dtype: ark.DataType = ark.fp16, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.n_kv_heads = ( + args.n_heads if args.n_kv_heads is None else args.n_kv_heads + ) + model_parallel_size = world_size + self.dtype = dtype + self.n_local_heads = args.n_heads // model_parallel_size + self.n_local_kv_heads = self.n_kv_heads // model_parallel_size + self.n_rep = self.n_local_heads // self.n_local_kv_heads + self.head_dim = args.dim // args.n_heads + self.wq = ColumnParallelLinear( + args.dim, + args.n_heads * self.head_dim, + dtype, + False, + local_rank, + world_size, + ) + self.wk = ColumnParallelLinear( + args.dim, + self.n_kv_heads * self.head_dim, + dtype, + False, + local_rank, + world_size, + ) + self.wv = ColumnParallelLinear( + args.dim, + self.n_kv_heads * self.head_dim, + dtype, + False, + local_rank, + world_size, + ) + self.wo = RowParallelLinear( + args.n_heads * self.head_dim, + args.dim, + dtype, + True, + local_rank, + world_size, + ) + + def forward( + self, + x: ark.Tensor, + start_pos: int, + freqs_cis: ark.Tensor, + mask: Optional[ark.Tensor], + ): + bsz, seqlen, _ = x.shape() + + with ark.PlanManager( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={"NumWarps": 4, "NumTasks": 256}, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + xq = self.wq(x) + xq = ark.reshape( + xq, [bsz, seqlen, self.n_local_heads, self.head_dim] + ) + with ark.PlanManager( + config={"SramBytes": 0, "Tile": [256, 1, 128]} + ): + if freqs_cis is not None: + xq = ark.rope(xq, freqs_cis) + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + xq = ark.transpose(xq, [0, 2, 1, 3]) + + with ark.PlanManager( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={"NumWarps": 4, "NumTasks": 256}, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + xk = self.wk(x) + xk = ark.reshape( + xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + with ark.PlanManager( + config={"SramBytes": 0, "Tile": [256, 1, 128]} + ): + if freqs_cis is not None: + xk = ark.rope(xk, freqs_cis) + keys = xk + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + keys = ark.transpose(keys, [0, 2, 1, 3]) + + with ark.PlanManager( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 256, + "SramBytes": 24672, + "TileShapeMNK": [256, 128, 32], + }, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + xv = self.wv(x) + xv = ark.reshape( + xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + values = xv + 
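+        # values: [bsz, seqlen, n_local_kv_heads, head_dim]; the transpose
+        # below moves the head axis forward for the batched matmul.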
with ark.PlanManager( + config={"SramBytes": 0, "Tile": [256, 1, 128]} + ): + values = ark.transpose(values, [0, 2, 1, 3]) + + with ark.PlanManager( + warp_range=[0, 8], + sram_range=[0, 49344], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 4096, + "Granularity": 2, + }, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + scores = ark.matmul(xq, keys, transpose_other=True) + with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) + + if mask is not None: + scores = ark.add(scores, mask) + + scores = Softmax()(scores) + + with ark.PlanManager( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={ + "NumWarps": 4, + "NumTasks": 256, + }, + ): + with ark.PlanManager( + config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} + ): + output = ark.matmul(scores, values) + with ark.PlanManager( + config={"SramBytes": 0, "Tile": [256, 1, 128]} + ): + output = ark.transpose(output, [0, 2, 1, 3]) + output = ark.reshape( + output, [bsz, seqlen, self.head_dim * self.n_local_heads] + ) + return self.wo(output) + + +class TransformerBlock(ark.Module): + def __init__( + self, + layer_id: int, + args: ModelArgs, + dtype: ark.DataType = ark.fp16, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.n_heads = args.n_heads + self.dim = args.dim + self.head_dim = args.dim // args.n_heads + self.attention = Attention(args, dtype, local_rank, world_size) + self.feed_forward = FeedForward( + dim=args.dim, + hidden_dim=4 * args.dim, + multiple_of=args.multiple_of, + ffn_dim_multiplier=args.ffn_dim_multiplier, + dtype=dtype, + local_rank=local_rank, + world_size=world_size, + ) + self.layer_id = layer_id + self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype) + self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype) + + def forward( + self, + x: ark.Tensor, + start_pos: int, + freqs_cis: ark.Tensor, + mask: Optional[ark.Tensor], + ): + attention_norm_x = self.attention_norm(x) + h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask) + with ark.PlanManager( + warp_range=[0, 4], + config={ + "NumWarps": 4, + "Tile": [256, 128], + "NumTasks": 256, + "SramBytes": 0, + }, + ): + h = ark.add(x, h) + ff = self.feed_forward(self.ffn_norm(h)) + with ark.PlanManager( + warp_range=[0, 4], + config={ + "NumWarps": 4, + "Tile": [256, 128], + "NumTasks": 256, + "SramBytes": 0, + }, + ): + out = ark.add(h, ff) + return out + + +class Transformer(ark.Module): + def __init__( + self, + params: ModelArgs, + dtype: ark.DataType = ark.fp16, + local_rank: int = 0, + world_size: int = 1, + ): + super().__init__() + self.params = params + self.vocab_size = params.vocab_size + self.n_layers = params.n_layers + + self.tok_embeddings = ParallelEmbedding( + params.vocab_size, params.dim, dtype, local_rank, world_size + ) + + self.layers = [] + for layer_id in range(self.n_layers): + self.layers.append( + TransformerBlock( + layer_id, params, dtype, local_rank, world_size + ) + ) + self.register_module(f"layers.{layer_id}", self.layers[layer_id]) + self.norm = RMSNorm(params.dim, eps=params.norm_eps, dtype=dtype) + self.output = ColumnParallelLinear( + params.dim, params.vocab_size, dtype, True, local_rank, world_size + ) + + def forward( + self, + tokens: ark.Tensor, + start_pos: int, + freqs_cis: ark.Tensor, + mask: Optional[ark.Tensor], + ): + h = self.tok_embeddings(tokens) + + for layer in self.layers: + h = layer(h, 
start_pos, freqs_cis, mask) + h = self.norm(h) + output = self.output(h) + return output diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 19c680854..f559a826b 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -59,8 +59,10 @@ def run_ark( output = module(*module_inputs) with ark.Runtime() as rt: - plan = ark.Plan.from_file("plan_llama2_7b_b1_s2048.json") - rt.launch(plan) + plan = ark.DefaultPlanner().plan() + with open("plan.json", "w") as f: + f.write(str(plan)) + rt.launch(plan=plan) # Load model parameters if state_dict: From 78ac0dacb70e26ef5dc8704c0bb69c7c47240cbd Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 08:06:32 +0000 Subject: [PATCH 044/106] fix merge --- ark/include/ark/executor.hpp | 2 +- ark/ops/ops_test_common.cpp | 2 +- ark/ops/ops_test_common.hpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index f0a108a1f..3744c33db 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -95,7 +95,7 @@ class DefaultExecutor : public Executor { public: DefaultExecutor( const Model &model, int device_id = -1, Stream stream = nullptr, - const std::vector &config_rules = {}, + const std::vector &config_rules = {}, const std::string &name = "DefaultExecutor", bool loop_mode = true); }; diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index 2bd9ce2e7..4e94d06a7 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -35,7 +35,7 @@ OpsTestResult op_test( const std::string &test_name_prefix, const Model &model, const std::vector &inputs, const std::vector &outputs, OpsTestBaseline baseline, const std::vector &inputs_data, - const std::vector &config_rules, + const std::vector &config_rules, bool print_on_error) { DefaultExecutor exe(model, -1, nullptr, config_rules); exe.compile(); diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp index c5d640f3b..3848773e6 100644 --- a/ark/ops/ops_test_common.hpp +++ b/ark/ops/ops_test_common.hpp @@ -171,7 +171,7 @@ OpsTestResult op_test( const std::string &test_name_prefix, const Model &model, const std::vector &inputs, const std::vector &outputs, OpsTestBaseline baseline, const std::vector &inputs_data = {}, - const std::vector &config_rules = {}, + const std::vector &config_rules = {}, bool print_on_error = false); OpsTestGpuMem to_gpu(void *host_ptr, size_t size); From afb518a7622363b000e9fc1d21c4cf8178c3461d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 08:09:48 +0000 Subject: [PATCH 045/106] fix merge --- ark/api/executor.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 58d058d25..42ed45128 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -233,8 +233,6 @@ void Executor::Impl::init(const PlanJson &plan_json) { } auto gpu_manager = GpuManager::get_instance(device_id_); - - auto gpu_manager = GpuManager::get_instance(gpu_id_); if (!gpu_manager->info().arch->belongs_to( Arch::from_name(plan_json.at("Architecture")))) { LOG(WARN, "Architecture name of the plan `", @@ -779,7 +777,7 @@ void Executor::Impl::barrier() { uintptr_t Executor::Impl::tensor_address(const Tensor tensor) const { size_t buffer_id = tensor.ref()->buffer()->id(); if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { - ERR(NotFoundError, "Invalid buffer ID: ", buffer_id); + ERR(InternalError, "Invalid buffer ID: ", 
buffer_id); } size_t offset = buffer_id_to_offset_.at(buffer_id); return reinterpret_cast(buffer_->ref(offset)); From 762bf4aa439510dbc04e4f9ee83da84c7a32a03a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 16:30:57 +0000 Subject: [PATCH 046/106] fix merge --- ark/ops/ops_all_reduce_test.cpp | 15 +++++++-------- ark/ops/ops_communication_test.cpp | 2 +- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ark/ops/ops_all_reduce_test.cpp b/ark/ops/ops_all_reduce_test.cpp index 90814d036..8cf68b085 100644 --- a/ark/ops/ops_all_reduce_test.cpp +++ b/ark/ops/ops_all_reduce_test.cpp @@ -125,10 +125,9 @@ void test_all_reduce_packet_internal(ark::DimType nelem) { std::vector ones_vec(ones.shape().nelems(), ark::half_t(1.0f)); - auto result = - ark::op_test("all_reduce_packet", m, {ones}, {output}, - baseline_all_reduce, - {ones_vec.data()}, false, gpu_id, NumGpus); + auto result = ark::op_test( + "all_reduce_packet", m, {ones}, {output}, + baseline_all_reduce, {ones_vec.data()}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; @@ -232,10 +231,10 @@ void test_all_reduce_sm_internal(ark::DimType nelem) { std::vector ones_vec(ones.shape().nelems(), ark::half_t(1.0f)); - auto result = ark::op_test( - "all_reduce_sm", m, {ones}, {output}, - baseline_all_reduce, {ones_vec.data()}, - false, gpu_id, NumGpus, config_rule); + auto result = + ark::op_test("all_reduce_sm", m, {ones}, {output}, + baseline_all_reduce, + {ones_vec.data()}, {config_rule}); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index db384c1f4..8cdad41b2 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -433,7 +433,7 @@ ark::unittest::State test_communication_send_recv_reduce() { ark::Planner planner(model, gpu_id); planner.install_config_rule(config_rule); - ark::Executor exe(gpu_id, 2, gpu_id, "Executor", planner.plan()); + ark::Executor exe(gpu_id, nullptr, "Executor", planner.plan()); exe.compile(); std::vector data(1024); From f654f0b08d48931acd5645c16300c1a6f3ebe88e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 16:34:21 +0000 Subject: [PATCH 047/106] add a python method --- python/executor_py.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/python/executor_py.cpp b/python/executor_py.cpp index e782a99fe..a3f2a078b 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -63,6 +63,7 @@ void register_executor(py::module &m) { .def("barrier", &ark::Executor::barrier) .def("destroy", &ark::Executor::destroy) .def("destroyed", &ark::Executor::destroyed) + .def("tensor_address", &ark::Executor::tensor_address) .def("tensor_read", py::overload_cast(&tensor_read), From 498926c6242a35a38ffd6a8c406b4f3cf1ff84c6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 16:35:28 +0000 Subject: [PATCH 048/106] submodule update --- third_party/mscclpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/mscclpp b/third_party/mscclpp index cddffbc8b..40cb19655 160000 --- a/third_party/mscclpp +++ b/third_party/mscclpp @@ -1 +1 @@ -Subproject commit cddffbc8b6dfa6facf7c64c1b7d73acf30e600b3 +Subproject commit 40cb1965538ab98fea3cc9fe004f730e23e84829 From 3e331a2e2f5487502daccc32890ef49c5d86eb12 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 17:12:15 +0000 Subject: [PATCH 049/106] fix --- ark/model/model_json.cpp | 1 + 1 
file changed, 1 insertion(+) diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp index b82f9e484..c2099e2c9 100644 --- a/ark/model/model_json.cpp +++ b/ark/model/model_json.cpp @@ -287,6 +287,7 @@ PlanJson::PlanJson(const Json &json) : Json((json != nullptr) ? json : Json{{"Rank", 0}, {"WorldSize", 1}, + {"Architecture", "ANY"}, {"NumProcessors", 1}, {"NumWarpsPerProcessor", 1}, {"TaskInfos", Json::array()}, From 10bfa75dbd40a96ffca69fb22e89127e1839b940 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 17:14:47 +0000 Subject: [PATCH 050/106] Rename CMake environments --- .github/workflows/codeql.yml | 4 ++-- .github/workflows/ut-cuda.yml | 2 +- CMakeLists.txt | 32 ++++++++++++++++---------------- ark/CMakeLists.txt | 10 +++++----- pyproject.toml | 2 +- third_party/CMakeLists.txt | 9 +++++++-- third_party/mscclpp | 2 +- 7 files changed, 33 insertions(+), 28 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0d7094c36..272cb8ebe 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -49,7 +49,7 @@ jobs: - name: Build run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON -DBUILD_TESTS=OFF .. + cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. make -j build ark_py - name: Perform CodeQL Analysis @@ -95,7 +95,7 @@ jobs: - name: Build run: | mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON -DBYPASS_GPU_CHECK=ON -DUSE_ROCM=ON -DBUILD_TESTS=OFF .. + CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF .. make -j build ark_py - name: Perform CodeQL Analysis diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index 4e573adfb..c2e8e7c50 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -44,7 +44,7 @@ jobs: - name: Build run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_PYTHON=ON .. + cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON .. make -j ut ark_py - name: Run C++ UT diff --git a/CMakeLists.txt b/CMakeLists.txt index ee1e3566e..2e80ea1e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,16 +13,16 @@ enable_language(CXX) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") -option(USE_CUDA "Use NVIDIA/CUDA." OFF) -option(USE_ROCM "Use AMD/ROCm." OFF) -option(BYPASS_GPU_CHECK "Bypass GPU check." OFF) -option(BUILD_TESTS "Build unit tests." ON) +option(ARK_USE_CUDA "Use NVIDIA/CUDA." OFF) +option(ARK_USE_ROCM "Use AMD/ROCm." OFF) +option(ARK_BYPASS_GPU_CHECK "Bypass GPU check." OFF) +option(ARK_BUILD_TESTS "Build unit tests." 
ON) -if(BYPASS_GPU_CHECK) - if(USE_CUDA) +if(ARK_BYPASS_GPU_CHECK) + if(ARK_USE_CUDA) message("Bypassing GPU check: using NVIDIA/CUDA.") find_package(CUDAToolkit REQUIRED) - elseif(USE_ROCM) + elseif(ARK_USE_ROCM) message("Bypassing GPU check: using AMD/ROCm.") set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}") find_package(hip REQUIRED) @@ -35,16 +35,16 @@ else() include(CheckAmdGpu) if(NVIDIA_FOUND AND AMD_FOUND) message("Detected NVIDIA/CUDA and AMD/ROCm: prioritizing NVIDIA/CUDA.") - set(USE_CUDA ON) - set(USE_ROCM OFF) + set(ARK_USE_CUDA ON) + set(ARK_USE_ROCM OFF) elseif(NVIDIA_FOUND) message("Detected NVIDIA/CUDA.") - set(USE_CUDA ON) - set(USE_ROCM OFF) + set(ARK_USE_CUDA ON) + set(ARK_USE_ROCM OFF) elseif(AMD_FOUND) message("Detected AMD/ROCm.") - set(USE_CUDA OFF) - set(USE_ROCM ON) + set(ARK_USE_CUDA OFF) + set(ARK_USE_ROCM ON) else() message(FATAL_ERROR "Neither NVIDIA/CUDA nor AMD/ROCm is found.") endif() @@ -53,7 +53,7 @@ endif() # Declare project set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -Wno-deprecated-declarations") -if(USE_CUDA) +if(ARK_USE_CUDA) set(CMAKE_CUDA_STANDARD 17) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra") project(ark LANGUAGES CXX CUDA) @@ -72,7 +72,7 @@ if(USE_CUDA) if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12) set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90) endif() -else() # USE_ROCM +else() # ARK_USE_ROCM set(CMAKE_HIP_STANDARD 17) set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra") project(ark LANGUAGES CXX HIP) @@ -145,7 +145,7 @@ add_custom_target(ut) # Details add_subdirectory(ark) -if(BUILD_PYTHON) +if(ARK_BUILD_PYTHON) # Install Python module add_subdirectory(python) add_dependencies(ark_py build) diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt index 4457d3c0b..208d9f9cb 100644 --- a/ark/CMakeLists.txt +++ b/ark/CMakeLists.txt @@ -6,7 +6,7 @@ file(GLOB_RECURSE UT_SOURCES CONFIGURE_DEPENDS *_test.cpp) file(GLOB_RECURSE UT_COMMON_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/unittest/*.cpp) list(REMOVE_ITEM SOURCES ${UT_SOURCES} ${UT_COMMON_SOURCES}) -if(USE_ROCM) +if(ARK_USE_ROCM) file(GLOB_RECURSE CU_SOURCES CONFIGURE_DEPENDS *.cu) set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX) endif() @@ -23,7 +23,7 @@ target_include_directories(ark_obj SYSTEM PRIVATE ${NUMA_INCLUDE_DIRS} ) -if(USE_CUDA) +if(ARK_USE_CUDA) list(APPEND COMMON_LIBS CUDA::cuda_driver) target_include_directories(ark_obj SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -32,7 +32,7 @@ if(USE_CUDA) target_compile_definitions(ark_obj PUBLIC ARK_CUDA) endif() -if(USE_ROCM) +if(ARK_USE_ROCM) list(APPEND COMMON_LIBS hip::host) target_include_directories(ark_obj SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/third_party/cutlass/include @@ -45,7 +45,7 @@ target_sources(ark_obj PRIVATE ${SOURCES}) target_link_libraries(ark_obj PUBLIC mscclpp_static PRIVATE ${COMMON_LIBS}) # ARK unit tests -if(BUILD_TESTS) +if(ARK_BUILD_TESTS) foreach(ut_source IN ITEMS ${UT_SOURCES}) get_filename_component(exe_name ${ut_source} NAME_WE) add_executable(${exe_name} ${ut_source} ${UT_COMMON_SOURCES}) @@ -58,7 +58,7 @@ if(BUILD_TESTS) ${NUMA_INCLUDE_DIRS} ) - if(USE_CUDA) + if(ARK_USE_CUDA) target_link_libraries(${exe_name} PRIVATE ark_obj ${COMMON_LIBS} CUDA::cudart CUDA::cublas) target_include_directories(${exe_name} SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS} diff --git a/pyproject.toml b/pyproject.toml index 1f9386c73..d9fb4502e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,7 
+18,7 @@ install.strip = true build-dir = "build/{wheel_tag}" [tool.scikit-build.cmake.define] -BUILD_PYTHON = "ON" +ARK_BUILD_PYTHON = "ON" [tool.black] line-length = 80 diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt index 12ae74298..96e442289 100644 --- a/third_party/CMakeLists.txt +++ b/third_party/CMakeLists.txt @@ -14,7 +14,12 @@ FetchContent_Declare( GIT_TAG v0.5.2 SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp ) +set(BUILD_TESTS OFF CACHE BOOL "" FORCE) set(BUILD_PYTHON_BINDINGS OFF CACHE BOOL "" FORCE) +set(BUILD_APPS_NCCL OFF CACHE BOOL "" FORCE) +set(USE_CUDA ${ARK_USE_CUDA} CACHE BOOL "" FORCE) +set(USE_ROCM ${ARK_USE_ROCM} CACHE BOOL "" FORCE) +set(BYPASS_GPU_CHECK ON CACHE BOOL "" FORCE) set(INSTALL_PREFIX "ark") FetchContent_GetProperties(mscclpp) if (NOT mscclpp_POPULATED) @@ -35,7 +40,7 @@ if (NOT json_POPULATED) endif() set(JSON_INCLUDE_DIRS ${json_SOURCE_DIR}/include PARENT_SCOPE) -if(USE_CUDA) +if(ARK_USE_CUDA) # Configure CUTLASS FetchContent_Declare( cutlass @@ -58,7 +63,7 @@ if(USE_CUDA) endif() -if(USE_ROCM) +if(ARK_USE_ROCM) # Configure CK FetchContent_Declare( ck diff --git a/third_party/mscclpp b/third_party/mscclpp index cddffbc8b..40cb19655 160000 --- a/third_party/mscclpp +++ b/third_party/mscclpp @@ -1 +1 @@ -Subproject commit cddffbc8b6dfa6facf7c64c1b7d73acf30e600b3 +Subproject commit 40cb1965538ab98fea3cc9fe004f730e23e84829 From 3dda44a8dc310560333de0cf9090d7da0013e21f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 18:15:09 +0000 Subject: [PATCH 051/106] A few fixes & improved coverage --- ark/api/executor.cpp | 21 +++-- ark/api/executor_test.cpp | 150 +++++++++++++++++++++++++++++++++++ ark/include/ark/executor.hpp | 2 +- python/executor_py.cpp | 2 +- 4 files changed, 161 insertions(+), 14 deletions(-) create mode 100644 ark/api/executor_test.cpp diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 42ed45128..16d369bc8 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -141,7 +141,7 @@ static size_t tensor_stride_bytes(const Json &tensor) { class Executor::Impl { public: Impl(int device_id, Stream stream, const std::string &name, bool loop_mode); - ~Impl() = default; + ~Impl(); void init(const PlanJson& plan); @@ -152,7 +152,7 @@ class Executor::Impl { std::string plan() const { return plan_json_.dump_pretty(); } void compile(); - void launch(int64_t max_spin_count); + void launch(); void run(int iter); void wait(int64_t max_spin_count); float stop(int64_t max_spin_count); @@ -219,6 +219,10 @@ Executor::Impl::Impl(int device_id, Stream stream, const std::string &name, } } +Executor::Impl::~Impl() { + if (is_launched_) stop(-1); +} + void Executor::Impl::init(const PlanJson &plan_json) { plan_json_ = plan_json; rank_ = plan_json_["Rank"].get(); @@ -620,13 +624,12 @@ void Executor::Impl::init_channels(const std::set &remote_ranks) { void Executor::Impl::compile() { kernel_->compile(); } -void Executor::Impl::launch(int64_t max_spin_count) { +void Executor::Impl::launch() { if (!kernel_->is_compiled()) { ERR(InvalidUsageError, "Need to compile first before initialization."); } if (is_launched_) { - // Wait until previous works finish. 
-        this->wait(max_spin_count);
+        LOG(WARN, "Ignore launching twice.");
         return;
     }
     auto get_global_rt = [&](const std::string &symbol) {
@@ -674,12 +677,6 @@
     }
     elapsed_msec_ = -1;
 
-    if (!kernel_->is_compiled()) {
-        ERR(InvalidUsageError, "Need to compile first before initialization.");
-    } else if (is_launched_) {
-        LOG(WARN, "Ignore launching twice.");
-        return;
-    }
     timer_begin_->record(stream_raw_);
 
     if (world_size_ > 1) {
@@ -911,7 +908,7 @@ std::string Executor::plan() const { return impl_->plan(); }
 
 void Executor::compile() { impl_->compile(); }
 
-void Executor::launch(int64_t max_spin_count) { impl_->launch(max_spin_count); }
+void Executor::launch() { impl_->launch(); }
 
 void Executor::run(int iter) { impl_->run(iter); }

diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
new file mode 100644
index 000000000..b0b398ac9
--- /dev/null
+++ b/ark/api/executor_test.cpp
@@ -0,0 +1,150 @@
+// Copyright (c) Microsoft Corporation.
+// Licensed under the MIT license.
+
+#include "ark/executor.hpp"
+
+#include "gpu/gpu.hpp"
+#include "model/model_json.hpp"
+#include "unittest/unittest_utils.h"
+
+template <bool LoopMode>
+ark::unittest::State test_executor() {
+    ark::gpuStream stream;
+    UNITTEST_EQ(
+        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
+        ark::gpuSuccess);
+
+    ark::Model empty;
+    {
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        UNITTEST_EQ(executor.device_id(), 0);
+        UNITTEST_EQ(executor.stream(), stream);
+
+        executor.compile();
+        executor.launch();
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+        executor.destroy();
+    }
+    {
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        executor.compile();
+        executor.launch();
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+
+        executor.launch();
+        executor.run(1);
+        executor.wait();
+        executor.stop();
+
+        executor.destroy();
+    }
+    {
+        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
+        UNITTEST_THROW(executor.launch(), ark::InvalidUsageError);
+
+        executor.compile();
+        executor.launch();
+        executor.launch();  // Will be ignored with a warning.
+        executor.run(1);
+        executor.wait();
+        executor.wait();  // nothing to do
+
+        // Stop & destroy automatically.
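+        // (Executor::Impl::~Impl() calls stop() while still launched, so
+        // letting the executor go out of scope here is safe.)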
+    }
+
+    UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);
+    return ark::unittest::SUCCESS;
+}
+
+ark::unittest::State test_executor_loop() { return test_executor<true>(); }
+
+ark::unittest::State test_executor_no_loop() { return test_executor<false>(); }
+
+ark::unittest::State test_executor_tensor_read_write() {
+    // Alloc CPU array
+    std::vector<float> host_data(1024);
+    void *host_ptr = host_data.data();
+    for (size_t i = 0; i < host_data.size(); ++i) {
+        host_data[i] = static_cast<float>(i);
+    }
+
+    // Alloc GPU array
+    void *dev_ptr;
+    UNITTEST_EQ(ark::gpuMalloc(&dev_ptr, 1024 * sizeof(float)),
+                ark::gpuSuccess);
+
+    // Create an ARK tensor
+    ark::Model m;
+    auto tensor = m.tensor({1024}, ark::FP32);
+    m.noop(tensor);
+
+    ark::DefaultExecutor executor(m, 0);
+    executor.compile();
+    executor.launch();
+
+    // Copy data from CPU array to ARK tensor
+    executor.tensor_write(tensor, host_ptr, 1024 * sizeof(float));
+
+    // Copy data from ARK tensor to GPU array
+    executor.tensor_read(tensor, dev_ptr, 1024 * sizeof(float), nullptr, true);
+
+    // Check the data
+    std::vector<float> dev_data(1024);
+    executor.tensor_read(tensor, dev_data.data(), 1024 * sizeof(float));
+    for (size_t i = 0; i < dev_data.size(); ++i) {
+        UNITTEST_EQ(dev_data[i], static_cast<float>(i));
+        dev_data[i] = -1;
+    }
+
+    UNITTEST_EQ(ark::gpuMemcpy(dev_data.data(), dev_ptr, 1024 * sizeof(float),
+                               ark::gpuMemcpyDeviceToHost),
+                ark::gpuSuccess);
+    for (size_t i = 0; i < dev_data.size(); ++i) {
+        UNITTEST_EQ(dev_data[i], static_cast<float>(i));
+        dev_data[i] = -1;
+    }
+
+    // Copy -1s back to GPU array
+    UNITTEST_EQ(ark::gpuMemcpy(dev_ptr, dev_data.data(), 1024 * sizeof(float),
+                               ark::gpuMemcpyHostToDevice),
+                ark::gpuSuccess);
+
+    // Copy data from GPU array to ARK tensor
+    executor.tensor_write(tensor, dev_ptr, 1024 * sizeof(float), nullptr, true);
+
+    // Copy data from ARK tensor to CPU array
+    executor.tensor_read(tensor, host_ptr, 1024 * sizeof(float));
+
+    // Check the data
+    for (size_t i = 0; i < host_data.size(); ++i) {
+        UNITTEST_EQ(host_data[i], -1);
+    }
+
+    return ark::unittest::SUCCESS;
+}
+
+ark::unittest::State test_executor_invalid() {
+    // Invalid device ID.
+    UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""),
+                   ark::InvalidUsageError);
+
+    // Invalid rank.
+    ark::PlanJson plan;
+    plan["Rank"] = 1;
+    UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan.dump(), true),
+                   ark::InvalidUsageError);
+
+    return ark::unittest::SUCCESS;
+}
+
+int main() {
+    UNITTEST(test_executor_loop);
+    UNITTEST(test_executor_no_loop);
+    UNITTEST(test_executor_tensor_read_write);
+    UNITTEST(test_executor_invalid);
+    return 0;
+}
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 3744c33db..7f30f39ed 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -39,7 +39,7 @@ class Executor {
 
     /// Launch the model (not running yet). This must be called after
     /// `compile()`.
-    void launch(int64_t max_spin_count = -1);
+    void launch();
 
     /// Run the model for `iter` iterations.
void run(int iter); diff --git a/python/executor_py.cpp b/python/executor_py.cpp index a3f2a078b..36e1c435e 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -56,7 +56,7 @@ void register_executor(py::module &m) { }) .def("plan", &ark::Executor::plan) .def("compile", &ark::Executor::compile) - .def("launch", &ark::Executor::launch, py::arg("max_spin_count") = -1) + .def("launch", &ark::Executor::launch) .def("run", &ark::Executor::run, py::arg("iter")) .def("wait", &ark::Executor::wait, py::arg("max_spin_count") = -1) .def("stop", &ark::Executor::stop, py::arg("max_spin_count") = -1) From 4971601b09880e29adc85ab305a739edf55ccbb0 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 19:03:08 +0000 Subject: [PATCH 052/106] fix merge --- ark/api/context_manager.cpp | 42 ---------- ark/api/context_manager_test.cpp | 53 ------------ ark/api/executor.cpp | 8 -- ark/api/model.cpp | 2 +- ark/api/model_graph.cpp | 4 +- ark/api/plan_manager.cpp | 97 ---------------------- ark/api/plan_manager_test.cpp | 58 ------------- ark/codegen.cpp | 1 - ark/include/ark/context_manager.hpp | 24 ------ ark/include/ark/error.hpp | 15 +++- ark/include/ark/model.hpp | 57 +++++-------- ark/include/ark/model_graph.hpp | 2 +- ark/include/ark/plan_manager.hpp | 25 ------ ark/model/model_graph_impl.cpp | 6 +- ark/model/model_graph_impl.hpp | 8 +- ark/model/model_op.cpp | 11 --- ark/model/model_op.hpp | 9 +- ark/ops/ops_arithmetic.cpp | 20 ++--- ark/ops/ops_arithmetic_test.cpp | 48 ++++------- ark/ops/ops_cast.cpp | 10 +-- ark/ops/ops_communication.cpp | 14 ++-- ark/ops/ops_copy.cpp | 5 +- ark/ops/ops_embedding.cpp | 4 +- ark/ops/ops_identity.cpp | 2 +- ark/ops/ops_math.cpp | 31 +++---- ark/ops/ops_matmul.cpp | 6 +- ark/ops/ops_noop.cpp | 2 +- ark/ops/ops_reduce.cpp | 12 +-- ark/ops/ops_refer.cpp | 2 +- ark/ops/ops_reshape.cpp | 4 +- ark/ops/ops_rope.cpp | 5 +- ark/ops/ops_scalar.cpp | 31 +++---- ark/ops/ops_transpose.cpp | 5 +- examples/llama/model_7b_b1_s2048.py | 70 ++++++++-------- examples/tutorial/plan_manager_tutorial.py | 81 ------------------ python/ark/plan_manager.py | 34 -------- python/ark/runtime.py | 1 + python/model_py.cpp | 79 ++++++++---------- python/plan_manager_py.cpp | 15 ---- 39 files changed, 195 insertions(+), 708 deletions(-) delete mode 100644 ark/api/context_manager.cpp delete mode 100644 ark/api/context_manager_test.cpp delete mode 100644 ark/api/plan_manager.cpp delete mode 100644 ark/api/plan_manager_test.cpp delete mode 100644 ark/include/ark/context_manager.hpp delete mode 100644 ark/include/ark/plan_manager.hpp delete mode 100644 examples/tutorial/plan_manager_tutorial.py delete mode 100644 python/ark/plan_manager.py delete mode 100644 python/plan_manager_py.cpp diff --git a/ark/api/context_manager.cpp b/ark/api/context_manager.cpp deleted file mode 100644 index 6d16d9e79..000000000 --- a/ark/api/context_manager.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ark/context_manager.hpp" - -#include "model/model_graph_impl.hpp" - -namespace ark { - -class ContextManager::Impl { - public: - Impl(std::shared_ptr context_stack, - const std::map& context_map); - - ~Impl(); - - private: - std::shared_ptr context_stack_; - std::vector keys_; -}; - -ContextManager::Impl::Impl( - std::shared_ptr context_stack, - const std::map& context_map) - : context_stack_(context_stack) { - for (const auto& [key, value] : context_map) { - context_stack_->push(key, value); - keys_.push_back(key); - } -} - -ContextManager::Impl::~Impl() { - for (auto it = keys_.rbegin(); it != keys_.rend(); ++it) { - context_stack_->pop(*it); - } -} - -ContextManager::ContextManager( - Model& model, const std::map& context_map) - : impl_(std::make_shared(model.impl_->context_stack_, context_map)) {} - -} // namespace ark diff --git a/ark/api/context_manager_test.cpp b/ark/api/context_manager_test.cpp deleted file mode 100644 index 5fff94f34..000000000 --- a/ark/api/context_manager_test.cpp +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include "ark/context_manager.hpp" - -#include "model/model_node.hpp" -#include "unittest/unittest_utils.h" - -ark::unittest::State test_context_manager() { - ark::Model model; - ark::Tensor t0 = model.tensor({1}, ark::FP32); - ark::Tensor t1 = model.tensor({1}, ark::FP32); - ark::Tensor t2 = model.add(t0, t1); - - ark::Tensor t3; - ark::Tensor t4; - ark::Tensor t5; - { - ark::ContextManager cm0_1(model, {{"key0", "val1"}}); - t3 = model.relu(t2); - - ark::ContextManager cm1_1(model, {{"key1", "val2"}}); - t4 = model.sqrt(t3); - } - { - ark::ContextManager cm0_2(model, {{"key0", "val3"}}); - t5 = model.exp(t2); - } - - UNITTEST_TRUE(model.verify()); - - auto compressed = model.compress(false); - UNITTEST_TRUE(compressed.verify()); - - auto nodes = compressed.nodes(); - UNITTEST_EQ(nodes.size(), 4); - - UNITTEST_EQ(nodes[0]->context.size(), 0); - UNITTEST_EQ(nodes[1]->context.size(), 1); - UNITTEST_EQ(nodes[1]->context.at("key0"), "val1"); - UNITTEST_EQ(nodes[2]->context.size(), 2); - UNITTEST_EQ(nodes[2]->context.at("key0"), "val1"); - UNITTEST_EQ(nodes[2]->context.at("key1"), "val2"); - UNITTEST_EQ(nodes[3]->context.size(), 1); - UNITTEST_EQ(nodes[3]->context.at("key0"), "val3"); - - return ark::unittest::SUCCESS; -} - -int main() { - UNITTEST(test_context_manager); - return 0; -} diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 6fb2b5f2e..17d579763 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -250,14 +250,6 @@ void Executor::Impl::init(const PlanJson &plan_json) { gpu_manager->info().arch->name(), "`."); } - if (!gpu_manager->info().arch->belongs_to( - Arch::from_name(plan_json_.at("Architecture")))) { - LOG(WARN, "Architecture name of the plan `", - plan_json_.at("Architecture").get(), - "` is not compatible with the GPU architecture `", - gpu_manager->info().arch->name(), "`."); - } - buffer_id_to_offset_ = init_buffers(plan_json_); std::string buffer_id_to_offset_str; diff --git a/ark/api/model.cpp b/ark/api/model.cpp index 8227ea848..dcbd4940e 100644 --- a/ark/api/model.cpp +++ b/ark/api/model.cpp @@ -20,7 +20,7 @@ size_t Model::id() const { return id_; } Model Model::compress() const { Model model(*this); - model.compress_nodes(merge_nodes); + model.compress_nodes(); return model; } diff --git a/ark/api/model_graph.cpp b/ark/api/model_graph.cpp index a4477b8e6..e07565141 100644 --- a/ark/api/model_graph.cpp +++ b/ark/api/model_graph.cpp @@ 
-33,9 +33,7 @@ int ModelGraph::rank() const { return impl_->rank(); } int ModelGraph::world_size() const { return impl_->world_size(); } -void ModelGraph::compress_nodes(bool merge_nodes) { - impl_->compress_nodes(merge_nodes); -} +void ModelGraph::compress_nodes() { impl_->compress_nodes(); } bool ModelGraph::compressed() const { return impl_->compressed(); } diff --git a/ark/api/plan_manager.cpp b/ark/api/plan_manager.cpp deleted file mode 100644 index 8cb1940b1..000000000 --- a/ark/api/plan_manager.cpp +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#include "ark/plan_manager.hpp" - -#include "logging.h" -#include "model/model_json.hpp" -#include "model/model_graph_impl.hpp" - -namespace ark { - -class PlanManagerState { - public: - PlanManagerState() : sync(true) {} - bool sync; -}; - -static std::map gPlanManagerStates; - -PlanManager::PlanManager(Model& model, const std::string& plan_context) - : model_id_(model.id()), stop_sync_(false) { - static int task_group_id = 0; - auto ctx = Json::parse(plan_context); - if (!ctx.is_object()) { - ERR(ModelError, "plan context must be a JSON object"); - } - if (gPlanManagerStates.find(model_id_) == gPlanManagerStates.end()) { - gPlanManagerStates.emplace(model_id_, PlanManagerState()); - } - auto& state = gPlanManagerStates[model_id_]; - bool async = !state.sync; - std::map context_map; - for (const auto& [key, value] : ctx.items()) { - if (key == "sync") { - if (!value.is_boolean()) { - ERR(ModelError, "sync must be a boolean"); - } - if (state.sync && !value.get()) { - stop_sync_ = true; - state.sync = false; - context_map["TaskGroupId"] = std::to_string(task_group_id++); - } - } else if (key == "processor_range") { - if (!value.is_array()) { - ERR(ModelError, "processor_range must be an array"); - } - if (async) { - LOG(WARN, "Ignoring processor_range under sync=false context"); - continue; - } - context_map["ProcessorRange"] = value.dump(); - } else if (key == "warp_range") { - if (!value.is_array()) { - ERR(ModelError, "warp_range must be an array"); - } - if (async) { - LOG(WARN, "Ignoring warp_range under sync=false context"); - continue; - } - context_map["WarpRange"] = value.dump(); - } else if (key == "sram_range") { - if (!value.is_array()) { - ERR(ModelError, "sram_range must be an array"); - } - if (async) { - LOG(WARN, "Ignoring sram_range under sync=false context"); - continue; - } - context_map["SramRange"] = value.dump(); - } else if (key == "config") { - if (!value.is_object()) { - ERR(ModelError, "config must be an object"); - } - auto cfg = model.impl_->get_context("Config"); - if (cfg.empty()) { - context_map["Config"] = value.dump(); - } else { - auto cfg_obj = Json::parse(cfg); - for (const auto& [k, v] : value.items()) { - cfg_obj[k] = v; - } - context_map["Config"] = cfg_obj.dump(); - } - } else { - LOG(WARN, "Ignoring unknown plan context key: ", key); - } - } - context_manager_ = std::make_shared(model, context_map); -} - -PlanManager::~PlanManager() { - if (stop_sync_) { - gPlanManagerStates[model_id_].sync = true; - } -} - -} // namespace ark diff --git a/ark/api/plan_manager_test.cpp b/ark/api/plan_manager_test.cpp deleted file mode 100644 index 78f5d4cb8..000000000 --- a/ark/api/plan_manager_test.cpp +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "ark/plan_manager.hpp" -#include "ark/planner.hpp" - -#include "model/model_json.hpp" -#include "unittest/unittest_utils.h" - -ark::unittest::State test_plan_manager() { - ark::Model model; - ark::Tensor t0 = model.tensor({1}, ark::FP32); - ark::Tensor t1 = model.tensor({1}, ark::FP32); - ark::Tensor t2 = model.add(t0, t1); - - ark::Tensor t3; - ark::Tensor t4; - ark::Tensor t5; - ark::Tensor t6; - { - ark::PlanManager pm_0(model, ark::Json({ - {"processor_range", {0, 2}}, - {"warp_range", {0, 4}}, - {"sram_range", {0, 0}}, - {"sync", false} - }).dump()); - t3 = model.relu(t2); - t4 = model.sqrt(t3); - } - { - ark::PlanManager pm_0(model, ark::Json({ - {"processor_range", {2, 4}}, - {"warp_range", {0, 4}}, - {"sram_range", {0, 0}} - }).dump()); - t5 = model.exp(t2); - - ark::PlanManager pm_1(model, ark::Json({ - {"processor_range", {2, 3}} - }).dump()); - t6 = model.rsqrt(t5); - } - - UNITTEST_TRUE(model.verify()); - - ark::DefaultPlanner planner(model, 0); - auto plan_str = planner.plan(); - ark::Json plan = ark::Json::parse(plan_str); - - UNITTEST_LOG(plan_str); - - return ark::unittest::SUCCESS; -} - -int main() { - UNITTEST(test_plan_manager); - return 0; -} diff --git a/ark/codegen.cpp b/ark/codegen.cpp index bc43584cb..1619b863f 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -87,7 +87,6 @@ CodeGenerator::Impl::Impl(const PlanJson &plan, num_warps_per_proc_ = plan.at("NumWarpsPerProcessor"); std::stringstream definitions_ss; - for (auto &task_json : plan.at("TaskInfos")) { definitions_ss << this->def_task(task_json); } diff --git a/ark/include/ark/context_manager.hpp b/ark/include/ark/context_manager.hpp deleted file mode 100644 index 58271ea8c..000000000 --- a/ark/include/ark/context_manager.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_CONTEXT_MANAGER_HPP -#define ARK_CONTEXT_MANAGER_HPP - -#include -#include - -namespace ark { - -class ContextManager { - public: - ContextManager(Model& model, - const std::map& context_map); - - private: - class Impl; - std::shared_ptr impl_; -}; - -} // namespace ark - -#endif // ARK_CONTEXT_MANAGER_HPP diff --git a/ark/include/ark/error.hpp b/ark/include/ark/error.hpp index 78d02cab3..965b1c0bc 100644 --- a/ark/include/ark/error.hpp +++ b/ark/include/ark/error.hpp @@ -9,6 +9,7 @@ namespace ark { +/// Base class for all ARK errors. class BaseError : public std::exception { private: std::string msg_; @@ -24,15 +25,21 @@ class BaseError : public std::exception { _name(const std::string &msg) : BaseError(msg) {} \ }; +/// Internal error in ARK, likely a bug. REGISTER_ERROR_TYPE(InternalError) +/// Invalid usage of ARK API. REGISTER_ERROR_TYPE(InvalidUsageError) -REGISTER_ERROR_TYPE(NotFoundError) +/// Invalid ARK model definition or usage. REGISTER_ERROR_TYPE(ModelError) -REGISTER_ERROR_TYPE(SchedulerError) -REGISTER_ERROR_TYPE(ExecutorError) +/// Invalid ARK plan definition or usage. +REGISTER_ERROR_TYPE(PlanError) +/// Unsupported feature triggered. +REGISTER_ERROR_TYPE(UnsupportedError) +/// Error from invalid system state such as a system call failure. REGISTER_ERROR_TYPE(SystemError) +/// Error from a CUDA/HIP API call. REGISTER_ERROR_TYPE(GpuError) -REGISTER_ERROR_TYPE(RuntimeError) +/// Error from a unit test. 
REGISTER_ERROR_TYPE(UnitTestError) } // namespace ark diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp index cbbff7f95..3c4f22e22 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -103,29 +103,23 @@ class Model : public ModelGraph { // result in `output`. // Currently, only reduction along the last dimension is supported. Tensor reduce_sum(Tensor input, int axis, bool keepdims = true, - Tensor output = NullTensor, - const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); Tensor reduce_mean(Tensor input, int axis, bool keepdims = true, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); Tensor reduce_max(Tensor input, int axis, bool keepdims = true, - Tensor output = NullTensor, - const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); // Transposes the `input` tensor according to the given `permutation`. // For example, transpose(input, {0, 1 ,3, 2}) will swap the last two // dimensions of the input tensor. Currently, only 4D tensors are supported. Tensor transpose(Tensor input, const std::vector &permutation, - Tensor output = NullTensor, const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); // Performs matrix multiplication between the `input` tensor and another // `other` tensor, storing the result in `output`. Tensor matmul(Tensor input, Tensor other, Tensor output = NullTensor, bool trans_input = false, bool trans_other = false, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Implements the 'im2col' method for 2D convolution layers, which takes an // `input` tensor and reshapes it to a 2D matrix by extracting image patches // from the input tensor based on the provided parameters. @@ -142,66 +136,63 @@ class Model : public ModelGraph { Tensor output = NullTensor, const std::string &name = ""); // Calculates the exponential of the `input` tensor, element-wise. Tensor exp(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Calculates the square root of the `input` tensor, element-wise. Tensor sqrt(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Calculates the reverse square root of the `input` tensor, element-wise. Tensor rsqrt(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // ReLU activation Tensor relu(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Copy the `input` tensor to `output` tensor Tensor copy(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor copy(float val, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Applies the Gaussian Error Linear Unit (GELU) activation function to the // `input` tensor, element-wise. GELU is a smooth approximation of the // rectifier function and is widely used in deep learning models. 
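    // A common closed form is the tanh approximation:
    // gelu(x) ~= 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3))).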
Tensor gelu(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Sigmoid activation Tensor sigmoid(Tensor input, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); // Performs rotary position embedding (RoPE) on the `input` tensor Tensor rope(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Performs an element-wise addition operator between the `input` tensor // and the `other` tensor Tensor add(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor add(Tensor input, float value, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Performs an element-wise subtraction operator between the `input` tensor // and the `other` tensor Tensor sub(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor sub(Tensor input, float value, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Performs an element-wise multiplication operator between the `input` // tensor and the `other` tensor, Tensor mul(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor mul(Tensor input, float value, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); // Performs an element-wise division operator between the `input` // tensor and the `other` tensor, Tensor div(Tensor input, Tensor other, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor div(Tensor input, float value, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); + const std::string &name = ""); Tensor send(Tensor input, int remote_rank, int tag, - Tensor output = NullTensor, const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); // Blocks the execution until the corresponding 'send' operator with the // specified `id` is completed. - Tensor send_done(Tensor input, const std::string &config = "", - const std::string &name = ""); + Tensor send_done(Tensor input, const std::string &name = ""); // Receives a tensor from a source rank (@p src_rank), identified by the // `id` parameter. Blocks the execution until the corresponding 'recv' // operator is completed. @@ -238,12 +229,10 @@ class Model : public ModelGraph { const std::string &name = ""); /// Embedding layer. Tensor embedding(Tensor input, Tensor weight, Tensor output = NullTensor, - const std::string &config = "", const std::string &name = ""); /// Tensor type casting. 
Tensor cast(Tensor input, const DataType &data_type, - Tensor output = NullTensor, const std::string &config = "", - const std::string &name = ""); + Tensor output = NullTensor, const std::string &name = ""); // sync across multi devices Tensor device_sync(Tensor input, int rank, int rank_num, diff --git a/ark/include/ark/model_graph.hpp b/ark/include/ark/model_graph.hpp index 598bf343a..29074630c 100644 --- a/ark/include/ark/model_graph.hpp +++ b/ark/include/ark/model_graph.hpp @@ -25,7 +25,7 @@ class ModelGraph { int world_size() const; - void compress_nodes(bool merge_nodes = false); + void compress_nodes(); bool compressed() const; diff --git a/ark/include/ark/plan_manager.hpp b/ark/include/ark/plan_manager.hpp deleted file mode 100644 index 3952a1c06..000000000 --- a/ark/include/ark/plan_manager.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_PLAN_MANAGER_HPP -#define ARK_PLAN_MANAGER_HPP - -#include - -namespace ark { - -class PlanManager { - public: - PlanManager(Model& model, const std::string& plan_context); - - ~PlanManager(); - - private: - size_t model_id_; - bool stop_sync_; - std::shared_ptr context_manager_; -}; - -} // namespace ark - -#endif // ARK_PLAN_MANAGER_HPP diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp index 81359439a..7c1ea3fb5 100644 --- a/ark/model/model_graph_impl.cpp +++ b/ark/model/model_graph_impl.cpp @@ -112,7 +112,7 @@ ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) { return *this; } -void ModelGraph::Impl::compress_nodes(bool merge_nodes) { +void ModelGraph::Impl::compress_nodes() { if (!compressed_) { this->recursive_remove_virtual_nodes(); compressed_ = true; @@ -178,10 +178,6 @@ bool ModelGraph::Impl::verify() const { return true; } -std::string ModelGraph::Impl::get_context(const std::string &key) const { - return context_stack_->get_context(key); -} - ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) { for (auto &tns : op->input_tensors()) { if (tensor_to_producer_op_.find(tns) == tensor_to_producer_op_.end()) { diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp index c7080ab73..62944f999 100644 --- a/ark/model/model_graph_impl.hpp +++ b/ark/model/model_graph_impl.hpp @@ -54,8 +54,7 @@ class ModelGraph::Impl { Impl &operator=(const Impl &other); template - ModelOpRef create_op(const std::string &config, const std::string &name, - Args &&...args) { + ModelOpRef create_op(const std::string &name, Args &&... 
args) { ModelOpRef op = std::make_shared(std::forward(args)...); std::string name_copy; if (name.empty()) { @@ -68,7 +67,6 @@ class ModelGraph::Impl { if (count > 0) { name_copy += "_" + std::to_string(count); } - op->set_config(config); op->set_name(name_copy); add_op(op); return op; @@ -78,14 +76,12 @@ class ModelGraph::Impl { int world_size() const { return world_size_; } - void compress_nodes(bool merge_nodes = false); + void compress_nodes(); bool compressed() const { return compressed_; } bool verify() const; - std::string get_context(const std::string &key) const; - std::string serialize(bool pretty = true) const; std::vector nodes() const; diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp index dc4906235..5db8576e8 100644 --- a/ark/model/model_op.cpp +++ b/ark/model/model_op.cpp @@ -92,14 +92,6 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) { return it->second; } -void ModelOp::set_config(const std::string &config) { - if (!config.empty()) { - config_ = Json::parse(config); - } else { - config_.clear(); - } -} - std::vector ModelOp::input_tensors() const { // input_tensors = read_tensors || write_tensors std::set input_tensors; @@ -192,9 +184,6 @@ Json ModelOp::serialize() const { for (auto &arg : args_) { j["Args"][arg.first] = arg.second.serialize(); } - if (!config_.empty()) { - j["Config"] = config_; - } return j; } diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp index d048375c2..f7323d6c0 100644 --- a/ark/model/model_op.hpp +++ b/ark/model/model_op.hpp @@ -50,8 +50,8 @@ class ModelOp { return ""; } - virtual std::vector impl_args( - [[maybe_unused]] const Json &config) const { + virtual std::vector impl_args([ + [maybe_unused]] const Json &config) const { return {}; } @@ -60,14 +60,10 @@ class ModelOp { return {{"NumTasks", 0}, {"NumWarps", 0}, {"SramBytes", 0}}; } - void set_config(const std::string &config); - void set_name(const std::string &name) { name_ = name; } ModelOpType type() const { return type_; } - const Json &config() const { return config_; } - const std::string &name() const { return name_; } bool is_virtual() const { return is_virtual_; } @@ -104,7 +100,6 @@ class ModelOp { const std::vector &template_args = {}); ModelOpType type_; - Json config_; std::string name_; bool is_virtual_; std::vector read_tensors_; diff --git a/ark/ops/ops_arithmetic.cpp b/ark/ops/ops_arithmetic.cpp index ef85b5d22..aeece0d77 100644 --- a/ark/ops/ops_arithmetic.cpp +++ b/ark/ops/ops_arithmetic.cpp @@ -12,10 +12,9 @@ ModelOpAdd::ModelOpAdd(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Add", input, other, output) {} Tensor Model::add(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } @@ -24,10 +23,9 @@ ModelOpMul::ModelOpMul(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Mul", input, other, output) {} Tensor Model::mul(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } @@ -36,10 +34,9 @@ ModelOpSub::ModelOpSub(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Sub", input, other, output) {} Tensor Model::sub(Tensor 
input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } @@ -48,10 +45,9 @@ ModelOpDiv::ModelOpDiv(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Div", input, other, output) {} Tensor Model::div(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp index fd6a05b1a..772da3276 100644 --- a/ark/ops/ops_arithmetic_test.cpp +++ b/ark/ops/ops_arithmetic_test.cpp @@ -2,7 +2,6 @@ // Licensed under the MIT license. #include "ops_test_common.hpp" -#include "model/model_json.hpp" template void baseline_add(std::vector &outputs, @@ -143,25 +142,12 @@ ark::unittest::State test_add_fp32() { ark::unittest::State test_add_fp16() { ark::Model m; - ark::Tensor t0 = m.tensor({32, 2048, 2048}, ark::FP16); - ark::Tensor t1 = m.tensor({32, 2048, 2048}, ark::FP16); + ark::Tensor t0 = m.tensor({8192}, ark::FP16); + ark::Tensor t1 = m.tensor({8192}, ark::FP16); ark::Tensor out = m.add(t0, t1); auto result = - ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add, {}, - { - ark::DefaultPlanner::ConfigRule([](const std::string op_str, const std::string) { - auto op = ark::Json::parse(op_str); - ark::Json config; - if (op.at("Type") == "Add") { - config["NumWarps"] = 4; - config["SramBytes"] = 0; - config["Tile"] = {128, 256}; - config["NumTasks"] = 4096; - } - return config.dump(); - }) - }); + ark::op_test("add_fp16", m, {t0, t1}, {out}, baseline_add); UNITTEST_LOG(result); UNITTEST_EQ(result.max_diff[0], 0.0f); return ark::unittest::SUCCESS; @@ -430,20 +416,20 @@ ark::unittest::State test_div_invalid() { int main() { ark::init(); - // UNITTEST(test_add_fp32); + UNITTEST(test_add_fp32); UNITTEST(test_add_fp16); - // UNITTEST(test_add_bf16); - // UNITTEST(test_add_overwrite); - // UNITTEST(test_add_broadcast); - // UNITTEST(test_add_invalid); - // UNITTEST(test_sub_fp32); - // UNITTEST(test_sub_invalid); - // UNITTEST(test_mul_fp32); - // UNITTEST(test_mul_fp16); - // UNITTEST(test_mul_overwrite); - // UNITTEST(test_mul_broadcast); - // UNITTEST(test_mul_invalid); - // UNITTEST(test_div_fp32); - // UNITTEST(test_div_invalid); + UNITTEST(test_add_bf16); + UNITTEST(test_add_overwrite); + UNITTEST(test_add_broadcast); + UNITTEST(test_add_invalid); + UNITTEST(test_sub_fp32); + UNITTEST(test_sub_invalid); + UNITTEST(test_mul_fp32); + UNITTEST(test_mul_fp16); + UNITTEST(test_mul_overwrite); + UNITTEST(test_mul_broadcast); + UNITTEST(test_mul_invalid); + UNITTEST(test_div_fp32); + UNITTEST(test_div_invalid); return ark::unittest::SUCCESS; } diff --git a/ark/ops/ops_cast.cpp b/ark/ops/ops_cast.cpp index 96146217e..e94fec989 100644 --- a/ark/ops/ops_cast.cpp +++ b/ark/ops/ops_cast.cpp @@ -105,7 +105,7 @@ ModelOpByteCast::ModelOpByteCast(ModelTensorRef input, ModelDataType data_type, } Tensor Model::cast(Tensor input, const DataType &data_type, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { check_null(input.ref()); if (output.is_null()) { if (input.data_type() == data_type) { @@ -119,14 +119,14 @@ Tensor 
Model::cast(Tensor input, const DataType &data_type, Tensor output, byte_cast_helper(input.ref(), data_type.ref(), new_shape, new_strides, new_offsets, new_padded_shape); return impl_ - ->create_op( - config, name, input.ref(), data_type.ref(), new_shape, - new_strides, new_offsets, new_padded_shape) + ->create_op(name, input.ref(), data_type.ref(), + new_shape, new_strides, + new_offsets, new_padded_shape) ->result_tensors()[0]; } } return impl_ - ->create_op(config, name, input.ref(), data_type.ref(), + ->create_op(name, input.ref(), data_type.ref(), output.ref()) ->result_tensors()[0]; } diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp index e42c96d9c..baf7aafa2 100644 --- a/ark/ops/ops_communication.cpp +++ b/ark/ops/ops_communication.cpp @@ -589,25 +589,23 @@ Json ModelOpDeviceSync::default_config([[maybe_unused]] const ArchRef arch) cons } Tensor Model::send(Tensor input, int remote_rank, int tag, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { tags_.insert(tag); return impl_ - ->create_op(config, name, input.ref(), remote_rank, tag, + ->create_op(name, input.ref(), remote_rank, tag, output.ref()) ->result_tensors()[0]; } -Tensor Model::send_done(Tensor input, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref()) +Tensor Model::send_done(Tensor input, const std::string &name) { + return impl_->create_op(name, input.ref()) ->result_tensors()[0]; } Tensor Model::recv(Tensor output, int remote_rank, int tag, - const std::string &config, const std::string &name) { + const std::string &name) { tags_.insert(tag); - return impl_ - ->create_op(config, name, output.ref(), remote_rank, tag) + return impl_->create_op(name, output.ref(), remote_rank, tag) ->result_tensors()[0]; } diff --git a/ark/ops/ops_copy.cpp b/ark/ops/ops_copy.cpp index 4914c34a4..4f32966b8 100644 --- a/ark/ops/ops_copy.cpp +++ b/ark/ops/ops_copy.cpp @@ -20,9 +20,8 @@ ModelOpCopy::ModelOpCopy(ModelTensorRef input, ModelTensorRef output) verify(); } -Tensor Model::copy(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::copy(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp index 1169c47c3..2e2626d4c 100644 --- a/ark/ops/ops_embedding.cpp +++ b/ark/ops/ops_embedding.cpp @@ -70,9 +70,9 @@ Json ModelOpEmbedding::default_config([ } Tensor Model::embedding(Tensor input, Tensor weight, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, weight.ref_, + ->create_op(name, input.ref_, weight.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_identity.cpp b/ark/ops/ops_identity.cpp index dd398d8a5..065cd9a52 100644 --- a/ark/ops/ops_identity.cpp +++ b/ark/ops/ops_identity.cpp @@ -31,7 +31,7 @@ Tensor Model::identity(Tensor input, const std::vector &deps, for (auto &dep : deps) { deps_ref.emplace_back(dep.ref_); } - return impl_->create_op("", name, input.ref_, deps_ref) + return impl_->create_op(name, input.ref_, deps_ref) ->result_tensors()[0]; } diff --git a/ark/ops/ops_math.cpp b/ark/ops/ops_math.cpp index b2833dcca..1067c561a 100644 --- a/ark/ops/ops_math.cpp +++ b/ark/ops/ops_math.cpp @@ -24,55 +24,48 @@ 
ModelOpMath::ModelOpMath(const std::string &type_name, ModelTensorRef input, ModelOpExp::ModelOpExp(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Exp", input, output) {} -Tensor Model::exp(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::exp(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpGelu::ModelOpGelu(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Gelu", input, output) {} -Tensor Model::gelu(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::gelu(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpRelu::ModelOpRelu(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Relu", input, output) {} -Tensor Model::relu(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::relu(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpRsqrt::ModelOpRsqrt(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Rsqrt", input, output) {} -Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::rsqrt(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpSigmoid::ModelOpSigmoid(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Sigmoid", input, output) {} -Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_ - ->create_op(config, name, input.ref_, output.ref_) +Tensor Model::sigmoid(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } ModelOpSqrt::ModelOpSqrt(ModelTensorRef input, ModelTensorRef output) : ModelOpMath("Sqrt", input, output) {} -Tensor Model::sqrt(Tensor input, Tensor output, const std::string &config, - const std::string &name) { - return impl_->create_op(config, name, input.ref_, output.ref_) +Tensor Model::sqrt(Tensor input, Tensor output, const std::string &name) { + return impl_->create_op(name, input.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index bc94922fc..dca349f44 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -244,10 +244,10 @@ Json ModelOpMatmul::default_config(const ArchRef arch) const { Tensor Model::matmul(Tensor input, Tensor other, Tensor output, bool trans_input, bool trans_other, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref(), other.ref(), - output.ref(), trans_input, trans_other) + ->create_op(name, input.ref(), other.ref(), output.ref(), + trans_input, trans_other) ->result_tensors()[0]; } diff --git a/ark/ops/ops_noop.cpp b/ark/ops/ops_noop.cpp index 42fe5fdf5..894ab29be 100644 --- a/ark/ops/ops_noop.cpp +++ b/ark/ops/ops_noop.cpp @@ -30,7 +30,7 @@ Json 
ModelOpNoop::default_config([[maybe_unused]] const ArchRef arch) const { } void Model::noop(Tensor input, const std::string &name) { - impl_->create_op("", name, input.ref_); + impl_->create_op(name, input.ref_); } } // namespace ark diff --git a/ark/ops/ops_reduce.cpp b/ark/ops/ops_reduce.cpp index 19f70385b..78dd9d7e6 100644 --- a/ark/ops/ops_reduce.cpp +++ b/ark/ops/ops_reduce.cpp @@ -127,25 +127,25 @@ Json ModelOpReduce::default_config([[maybe_unused]] const ArchRef arch) const { } Tensor Model::reduce_max(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, axis, keepdims, + ->create_op(name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } Tensor Model::reduce_mean(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, axis, keepdims, + ->create_op(name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } Tensor Model::reduce_sum(Tensor input, int axis, bool keepdims, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, axis, keepdims, + ->create_op(name, input.ref_, axis, keepdims, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_refer.cpp b/ark/ops/ops_refer.cpp index 68c61b30f..782d6708c 100644 --- a/ark/ops/ops_refer.cpp +++ b/ark/ops/ops_refer.cpp @@ -20,7 +20,7 @@ Tensor Model::refer(Tensor input, const Dims &shape, const Dims &strides, const Dims &offsets, const Dims &padded_shape, const std::string &name) { return impl_ - ->create_op("", name, input.ref_, shape, strides, offsets, + ->create_op(name, input.ref_, shape, strides, offsets, padded_shape) ->result_tensors()[0]; } diff --git a/ark/ops/ops_reshape.cpp b/ark/ops/ops_reshape.cpp index 8ed3ac247..aac22b71a 100644 --- a/ark/ops/ops_reshape.cpp +++ b/ark/ops/ops_reshape.cpp @@ -199,8 +199,8 @@ Tensor Model::reshape(Tensor input, const Dims &shape, bool allowzero, reshape_helper(input.ref_, Dims{inferred_shape}, allowzero, new_shape, new_strides, new_offs); return impl_ - ->create_op("", name, input.ref_, new_shape, - new_strides, new_offs) + ->create_op(name, input.ref_, new_shape, new_strides, + new_offs) ->result_tensors()[0]; } diff --git a/ark/ops/ops_rope.cpp b/ark/ops/ops_rope.cpp index 36015aae5..06c1c915e 100644 --- a/ark/ops/ops_rope.cpp +++ b/ark/ops/ops_rope.cpp @@ -12,10 +12,9 @@ ModelOpRope::ModelOpRope(ModelTensorRef input, ModelTensorRef other, : ModelOpBroadcast2("Rope", input, other, output) {} Tensor Model::rope(Tensor input, Tensor other, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, other.ref_, - output.ref_) + ->create_op(name, input.ref_, other.ref_, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_scalar.cpp b/ark/ops/ops_scalar.cpp index b5c10f1c3..944a7247c 100644 --- a/ark/ops/ops_scalar.cpp +++ b/ark/ops/ops_scalar.cpp @@ -115,21 +115,20 @@ std::vector ModelOpScalarMul::impl_args([ Tensor Model::constant(float val, const Dims &shape, DataType data_type, const std::string &name) { return impl_ - ->create_op("", name, val, shape, data_type.ref(), + ->create_op(name, val, shape, data_type.ref(), nullptr) ->result_tensors()[0]; } -Tensor Model::copy(float val, Tensor 
output, const std::string &config, - const std::string &name) { +Tensor Model::copy(float val, Tensor output, const std::string &name) { if (output == NullTensor) { return impl_ - ->create_op(config, name, val, Dims{1}, - FP32.ref(), output.ref()) + ->create_op(name, val, Dims{1}, FP32.ref(), + output.ref()) ->result_tensors()[0]; } else { return impl_ - ->create_op(config, name, val, output.shape(), + ->create_op(name, val, output.shape(), output.data_type().ref(), output.ref()) ->result_tensors()[0]; @@ -137,34 +136,30 @@ Tensor Model::copy(float val, Tensor output, const std::string &config, } Tensor Model::add(Tensor input, float value, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, value, - output.ref_) + ->create_op(name, input.ref_, value, output.ref_) ->result_tensors()[0]; } Tensor Model::sub(Tensor input, float value, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, -value, - output.ref_) + ->create_op(name, input.ref_, -value, output.ref_) ->result_tensors()[0]; } Tensor Model::mul(Tensor input, float value, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, value, - output.ref_) + ->create_op(name, input.ref_, value, output.ref_) ->result_tensors()[0]; } Tensor Model::div(Tensor input, float value, Tensor output, - const std::string &config, const std::string &name) { + const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, 1 / value, - output.ref_) + ->create_op(name, input.ref_, 1 / value, output.ref_) ->result_tensors()[0]; } diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp index c659761d9..d0f7581cc 100644 --- a/ark/ops/ops_transpose.cpp +++ b/ark/ops/ops_transpose.cpp @@ -124,10 +124,9 @@ Json ModelOpTranspose::default_config([ } Tensor Model::transpose(Tensor input, const std::vector &permutation, - Tensor output, const std::string &config, - const std::string &name) { + Tensor output, const std::string &name) { return impl_ - ->create_op(config, name, input.ref_, permutation, + ->create_op(name, input.ref_, permutation, output.ref_) ->result_tensors()[0]; } diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py index f41304e85..d4a080c84 100644 --- a/examples/llama/model_7b_b1_s2048.py +++ b/examples/llama/model_7b_b1_s2048.py @@ -90,7 +90,7 @@ def __init__( self.weight = ark.parameter([1, 1, dim], ark.fp32) def forward(self, x): - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sync=False, config={ @@ -100,12 +100,12 @@ def forward(self, x): "Granularity": 7, }, ): - with ark.PlanManager(config={"Tile": [1, 4096]}): + with ark.PlannerContext(config={"Tile": [1, 4096]}): x = ark.cast(x, ark.fp32) x2 = ark.mul(x, x) - with ark.PlanManager(config={"ImplType": "WarpWise"}): + with ark.PlannerContext(config={"ImplType": "WarpWise"}): mean = ark.reduce_mean(x2, axis=-1) - with ark.PlanManager( + with ark.PlannerContext( config={ "NumWarps": 1, "SramBytes": 0, @@ -114,7 +114,7 @@ def forward(self, x): } ): rrms = ark.rsqrt(mean) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sync=False, config={ @@ -356,7 +356,7 @@ def __init__( def forward(self, x): # self.w2(F.silu(self.w1(x)) * self.w3(x)) - with ark.PlanManager( + with ark.PlannerContext( 
warp_range=[0, 8], sram_range=[0, 49344], sync=False, @@ -365,13 +365,13 @@ def forward(self, x): "NumTasks": 688, }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): x1 = self.w1(x) - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): x1 = Silu()(x1) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sram_range=[0, 49344], sync=False, @@ -380,11 +380,11 @@ def forward(self, x): "NumTasks": 688, }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): x2 = self.w3(x) - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): x3 = ark.mul(x1, x2) x4 = self.w2(x3) return x4 @@ -404,7 +404,7 @@ def __init__(self): super(Softmax, self).__init__() def forward(self, input): - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sram_range=[0, 0], sync=False, @@ -414,14 +414,14 @@ def forward(self, input): "NumTasks": 65536, }, ): - with ark.PlanManager(config={"ImplType": "WarpWise"}): + with ark.PlannerContext(config={"ImplType": "WarpWise"}): max = ark.reduce_max(input, axis=-1) - with ark.PlanManager(config={"Tile": [1, 2048]}): + with ark.PlannerContext(config={"Tile": [1, 2048]}): output = ark.sub(input, max) output = ark.exp(output) - with ark.PlanManager(config={"ImplType": "WarpWise"}): + with ark.PlannerContext(config={"ImplType": "WarpWise"}): sum = ark.reduce_sum(output, axis=-1) - with ark.PlanManager(config={"Tile": [1, 2048]}): + with ark.PlannerContext(config={"Tile": [1, 2048]}): output = ark.div(output, sum) return output @@ -486,50 +486,50 @@ def forward( ): bsz, seqlen, _ = x.shape() - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], sram_range=[0, 24672], sync=False, config={"NumWarps": 4, "NumTasks": 256}, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): xq = self.wq(x) xq = ark.reshape( xq, [bsz, seqlen, self.n_local_heads, self.head_dim] ) - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 0, "Tile": [256, 1, 128]} ): if freqs_cis is not None: xq = ark.rope(xq, freqs_cis) - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): xq = ark.transpose(xq, [0, 2, 1, 3]) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], sram_range=[0, 24672], sync=False, config={"NumWarps": 4, "NumTasks": 256}, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): xk = self.wk(x) xk = ark.reshape( xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] ) - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 0, "Tile": [256, 1, 128]} ): if freqs_cis is not None: xk = ark.rope(xk, freqs_cis) keys = xk - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): keys = ark.transpose(keys, [0, 2, 1, 3]) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], sram_range=[0, 24672], sync=False, @@ -540,7 +540,7 @@ def forward( "TileShapeMNK": [256, 128, 32], }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): xv = self.wv(x) @@ 
-548,12 +548,12 @@ def forward( xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] ) values = xv - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 0, "Tile": [256, 1, 128]} ): values = ark.transpose(values, [0, 2, 1, 3]) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 8], sram_range=[0, 49344], sync=False, @@ -563,11 +563,11 @@ def forward( "Granularity": 2, }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): scores = ark.matmul(xq, keys, transpose_other=True) - with ark.PlanManager(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) if mask is not None: @@ -575,7 +575,7 @@ def forward( scores = Softmax()(scores) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], sram_range=[0, 24672], sync=False, @@ -584,11 +584,11 @@ def forward( "NumTasks": 256, }, ): - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): output = ark.matmul(scores, values) - with ark.PlanManager( + with ark.PlannerContext( config={"SramBytes": 0, "Tile": [256, 1, 128]} ): output = ark.transpose(output, [0, 2, 1, 3]) @@ -634,7 +634,7 @@ def forward( ): attention_norm_x = self.attention_norm(x) h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], config={ "NumWarps": 4, @@ -645,7 +645,7 @@ def forward( ): h = ark.add(x, h) ff = self.feed_forward(self.ffn_norm(h)) - with ark.PlanManager( + with ark.PlannerContext( warp_range=[0, 4], config={ "NumWarps": 4, diff --git a/examples/tutorial/plan_manager_tutorial.py b/examples/tutorial/plan_manager_tutorial.py deleted file mode 100644 index c840ce0c0..000000000 --- a/examples/tutorial/plan_manager_tutorial.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
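# (Aside, not part of the patch: a condensed sketch of the
# PlanManager -> PlannerContext migration applied throughout the hunks
# above. The shapes and config values are illustrative only, and it
# assumes an ARK build that exposes ark.PlannerContext.)
import ark

ark.init()
x = ark.tensor([1, 2048], ark.fp32)
y = ark.tensor([1, 2048], ark.fp32)

# Nested contexts scope warps and per-op config for the ops created
# inside them; the per-op `config=...` argument these patches remove
# no longer exists.
with ark.PlannerContext(warp_range=[0, 8], sync=False):
    with ark.PlannerContext(config={"Tile": [1, 2048]}):
        z = ark.add(x, y)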
- -import ark -import time -import torch -import torch.nn.functional as F - - -class VanillaSoftmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - max = ark.reduce_max(input, axis=-1) - output = ark.sub(input, max) - output = ark.exp(output) - sum = ark.reduce_sum(output, axis=-1) - output = ark.div(output, sum) - return output - - -class Softmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - with ark.PlanManager( - warp_range=[0, 8], - sram_range=[0, 0], - sync=False, - config={ - "NumWarps": 1, - "SramBytes": 0, - "NumTasks": 65536, - }, - ): - with ark.PlanManager(config={"ImplType": "WarpWise"}): - max = ark.reduce_max(input, axis=-1) - with ark.PlanManager(config={"Tile": [1, 2048]}): - output = ark.sub(input, max) - output = ark.exp(output) - with ark.PlanManager(config={"ImplType": "WarpWise"}): - sum = ark.reduce_sum(output, axis=-1) - with ark.PlanManager(config={"Tile": [1, 2048]}): - output = ark.div(output, sum) - return output - - -def eval(tensor: ark.Tensor): - with ark.Runtime() as rt: - rt.launch() - rt.run() - return tensor.to_torch() - - -def perf(): - with ark.Runtime() as rt: - rt.launch() - - start = time.time() - rt.run(iter=1000) - end = time.time() - return (end - start) / 1000 - - -if __name__ == "__main__": - ark.init() - - shape = (32, 2048, 2048) - - input = torch.randn(*shape).to("cuda:0") - - output = Softmax()(ark.Tensor.from_torch(input)) - - if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): - print("Correct result") - else: - print("Incorrect result") - - print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/python/ark/plan_manager.py b/python/ark/plan_manager.py deleted file mode 100644 index 80e615ab8..000000000 --- a/python/ark/plan_manager.py +++ /dev/null @@ -1,34 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import json -from typing import List, Dict, Any -from .model import Model -from ._ark_core import _PlanManager - - -class PlanManager(_PlanManager): - def __init__(self, **kwargs): - """ - Plan manager for specifying the parallelization and tiling configuration of the operators in the context. - - Args: - processor_range (List[int], optional): The range of processors to be used. Defaults to None. - warp_range (List[int], optional): The range of warps to be used. Defaults to None. - sram_range (List[int], optional): The range of SRAMs to be used. Defaults to None. - sync (bool, optional): Whether to synchronize the execution. Defaults to True. - config (Dict[str, Any], optional): The configuration for the operators. Defaults to None. - """ - super().__init__(Model.get_model(), json.dumps(kwargs)) - - def __enter__(self) -> "PlanManager": - """ - Enter the plan manager. - """ - return self - - def __exit__(self, exc_type, exc_value, exc_tb): - """ - Exit the plan manager. 
- """ - del self diff --git a/python/ark/runtime.py b/python/ark/runtime.py index f064a5988..960223c64 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -3,6 +3,7 @@ import logging from enum import Enum +from typing import Dict, List from _ark_core import _Executor from .planner import Planner, Plan diff --git a/python/model_py.cpp b/python/model_py.cpp index 5a22d6a18..c224a3d5b 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -19,100 +19,89 @@ void register_model(py::module &m) { .def("compress", &ark::Model::compress) .def("add", py::overload_cast( - &ark::Model::add), + const std::string &>(&ark::Model::add), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("add", py::overload_cast( - &ark::Model::add), + const std::string &>(&ark::Model::add), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("cast", &ark::Model::cast, py::arg("input"), py::arg("data_type"), - py::arg("output"), py::arg("config"), py::arg("name")) + py::arg("output"), py::arg("name")) .def("constant", &ark::Model::constant, py::arg("value"), py::arg("shape"), py::arg("data_type"), py::arg("name")) .def("copy", - py::overload_cast(&ark::Model::copy), - py::arg("input"), py::arg("output"), py::arg("config"), - py::arg("name")) + py::overload_cast( + &ark::Model::copy), + py::arg("input"), py::arg("output"), py::arg("name")) .def("copy", - py::overload_cast(&ark::Model::copy), - py::arg("input"), py::arg("output"), py::arg("config"), - py::arg("name")) + py::overload_cast( + &ark::Model::copy), + py::arg("input"), py::arg("output"), py::arg("name")) .def("div", py::overload_cast( - &ark::Model::div), + const std::string &>(&ark::Model::div), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("div", py::overload_cast( - &ark::Model::div), + const std::string &>(&ark::Model::div), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) - .def("embedding", &ark::Model::embedding, py::arg("input"), - py::arg("weight"), py::arg("output"), py::arg("config"), py::arg("name")) + .def("embedding", &ark::Model::embedding, py::arg("input"), + py::arg("weight"), py::arg("output"), py::arg("name")) .def("exp", &ark::Model::exp, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("gelu", &ark::Model::gelu, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("identity", &ark::Model::identity, py::arg("input"), py::arg("deps"), py::arg("name")) .def("matmul", &ark::Model::matmul, py::arg("input"), py::arg("other"), py::arg("output"), py::arg("trans_input"), py::arg("trans_other"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("mul", py::overload_cast( - &ark::Model::mul), + const std::string &>(&ark::Model::mul), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("mul", py::overload_cast( - &ark::Model::mul), + const std::string &>(&ark::Model::mul), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("noop", &ark::Model::noop, py::arg("input"), py::arg("name")) .def("reduce_max", &ark::Model::reduce_max, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) 
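// (Aside, not part of this binding chain — the pattern used throughout this
// file, written out once with its template arguments for clarity; the exact
// overload list here is inferred from the signatures above, not taken from
// the patch:
//
//     .def("mul",
//          py::overload_cast<ark::Tensor, float, ark::Tensor,
//                            const std::string &>(&ark::Model::mul),
//          py::arg("input"), py::arg("other"), py::arg("output"),
//          py::arg("name"))
//
// Each overload is disambiguated with py::overload_cast over the new
// config-free signature, and the former py::arg("config") is dropped.)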
.def("reduce_mean", &ark::Model::reduce_mean, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("reduce_sum", &ark::Model::reduce_sum, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("relu", &ark::Model::relu, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("reshape", &ark::Model::reshape, py::arg("input"), py::arg("shape"), py::arg("allowzero"), py::arg("name")) .def("rope", &ark::Model::rope, py::arg("input"), py::arg("other"), - py::arg("output"), py::arg("config"), py::arg("name")) + py::arg("output"), py::arg("name")) .def("rsqrt", &ark::Model::rsqrt, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("sharding", &ark::Model::sharding, py::arg("input"), py::arg("axis"), py::arg("dim_per_shard"), py::arg("name")) .def("sigmoid", &ark::Model::sigmoid, py::arg("input"), - py::arg("output"), py::arg("config"), py::arg("name")) + py::arg("output"), py::arg("name")) .def("sqrt", &ark::Model::sqrt, py::arg("input"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("sub", py::overload_cast( - &ark::Model::sub), + const std::string &>(&ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), - py::arg("config"), py::arg("name")) + py::arg("name")) .def("sub", py::overload_cast( - &ark::Model::sub), + const std::string &>(&ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), py::arg("name")) .def("tensor", diff --git a/python/plan_manager_py.cpp b/python/plan_manager_py.cpp deleted file mode 100644 index 34aa0b77c..000000000 --- a/python/plan_manager_py.cpp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include -#include -#include - -#include - -namespace py = pybind11; - -void register_plan_manager(py::module &m) { - py::class_(m, "_PlanManager") - .def(py::init()); -} From 28b83953ae26b8554fc8b822df8e96dd8bf04091 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Tue, 6 Aug 2024 14:33:23 -0700 Subject: [PATCH 053/106] Update runtime.py --- python/ark/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 96c6f470a..e40750260 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -98,7 +98,7 @@ def launch( _RuntimeState.executor.destroy() _RuntimeState.executor = Executor( - gpu_id, + device_id, stream, "ArkRuntime", plan, From 11901c4a3f49469ede51e992b8b1d2fc1f2c1e3b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 7 Aug 2024 09:36:45 +0000 Subject: [PATCH 054/106] fix --- python/ark/runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index e40750260..495fc1c24 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -101,7 +101,7 @@ def launch( device_id, stream, "ArkRuntime", - plan, + str(plan), loop_mode, ) self.executor = _RuntimeState.executor From c0cbf19c4181cb697d5e3edc9db1198160bde788 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 8 Aug 2024 08:06:34 +0000 Subject: [PATCH 055/106] lint --- examples/llama/model_7b_b1_s2048.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py index d4a080c84..73d349ccc 100644 --- a/examples/llama/model_7b_b1_s2048.py +++ b/examples/llama/model_7b_b1_s2048.py @@ -369,7 +369,9 @@ def forward(self, x): config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): x1 = self.w1(x) - with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext( + config={"SramBytes": 0, "Tile": [256, 128]} + ): x1 = Silu()(x1) with ark.PlannerContext( warp_range=[0, 8], @@ -384,7 +386,9 @@ def forward(self, x): config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): x2 = self.w3(x) - with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext( + config={"SramBytes": 0, "Tile": [256, 128]} + ): x3 = ark.mul(x1, x2) x4 = self.w2(x3) return x4 @@ -504,7 +508,9 @@ def forward( ): if freqs_cis is not None: xq = ark.rope(xq, freqs_cis) - with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext( + config={"SramBytes": 0, "Tile": [256, 128]} + ): xq = ark.transpose(xq, [0, 2, 1, 3]) with ark.PlannerContext( @@ -526,7 +532,9 @@ def forward( if freqs_cis is not None: xk = ark.rope(xk, freqs_cis) keys = xk - with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext( + config={"SramBytes": 0, "Tile": [256, 128]} + ): keys = ark.transpose(keys, [0, 2, 1, 3]) with ark.PlannerContext( @@ -567,7 +575,9 @@ def forward( config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} ): scores = ark.matmul(xq, keys, transpose_other=True) - with ark.PlannerContext(config={"SramBytes": 0, "Tile": [256, 128]}): + with ark.PlannerContext( + config={"SramBytes": 0, "Tile": [256, 128]} + ): scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) if mask is not None: From 8583d1bfd24699ec65e8c7933e9cb564de08844d Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 10 Aug 2024 09:17:15 +0000 Subject: [PATCH 056/106] updates --- ark/api/executor.cpp | 
4 +
 ark/include/ark/executor.hpp          |   5 +
 examples/tutorial/planner_tutorial.py |  13 +-
 python/CMakeLists.txt                 |  13 ++
 python/ark/__init__.py                |   5 +-
 python/ark/data_type.py               |   2 +-
 python/ark/init.py                    |   2 +-
 python/ark/model.py                   |   2 +-
 python/ark/module.py                  |   4 +-
 python/ark/ops.py                     |  97 +++----------
 python/ark/tensor.py                  |  98 +++++--------
 python/ark_py.cpp                     |   2 -
 python/executor_py.cpp                | 188 ++++++++++++++-----------
 python/tensor_py.cpp                  |  33 ++++-
 python/unittest/test.py               |   2 +-
 python/unittest/test_runtime.py       | 192 +++++++++++++-------------
 python/unittest/test_tensor.py        |  23 +++
 17 files changed, 349 insertions(+), 336 deletions(-)
 create mode 100644 python/unittest/test_tensor.py

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index e77eada96..50686c434 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -154,6 +154,8 @@ class Executor::Impl {
 
     Stream stream() const { return reinterpret_cast<Stream>(stream_raw_); }
 
+    std::shared_ptr<GpuMemory> buffer() const { return buffer_; }
+
     std::string plan() const { return plan_json_.dump_pretty(); }
 
     void compile();
@@ -934,6 +936,8 @@ int Executor::device_id() const { return impl_->device_id(); }
 
 Stream Executor::stream() const { return impl_->stream(); }
 
+std::shared_ptr<GpuMemory> Executor::buffer() const { return impl_->buffer(); }
+
 std::string Executor::plan() const { return impl_->plan(); }
 
 void Executor::compile() { impl_->compile(); }
diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp
index 14ca87618..02a67cd26 100644
--- a/ark/include/ark/executor.hpp
+++ b/ark/include/ark/executor.hpp
@@ -15,6 +15,8 @@ namespace ark {
 
 using Stream = void *;
 
+class GpuMemory;
+
 /// Convenience class for executing a model.
 class Executor {
    public:
@@ -31,6 +33,9 @@ class Executor {
     /// Return the stream of the executor.
     Stream stream() const;
 
+    /// Return the buffer of the executor.
+    std::shared_ptr<GpuMemory> buffer() const;
+
     /// Return the plan string.
std::string plan() const; diff --git a/examples/tutorial/planner_tutorial.py b/examples/tutorial/planner_tutorial.py index 1f6c3ac58..6153aaf8e 100644 --- a/examples/tutorial/planner_tutorial.py +++ b/examples/tutorial/planner_tutorial.py @@ -69,14 +69,13 @@ def perf(): shape = (32, 2048, 2048) - # input = torch.randn(*shape).to("cuda:0") - input = ark.tensor(shape) + input = torch.randn(*shape).to("cuda:0") - output = Softmax()(input) + output = Softmax()(ark.Tensor.from_torch(input)) - # if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): - # print("Correct result") - # else: - # print("Incorrect result") + if torch.allclose(eval(output), F.softmax(input, dim=-1), atol=1e-5): + print("Correct result") + else: + print("Incorrect result") print(f"Performance: {(perf() * 1e3):.3f} ms/iter") diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index bd25d01e6..2e160f8d1 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -21,3 +21,16 @@ pybind11_add_module(ark_py ${BIND_SOURCES}) set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core) target_link_libraries(ark_py PRIVATE ark_static) target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS}) +target_include_directories(ark_py PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../ark) + +if(ARK_USE_CUDA) + target_include_directories(ark_py SYSTEM PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ) +endif() + +if(ARK_USE_ROCM) + target_include_directories(ark_py SYSTEM PRIVATE + /opt/rocm/include + ) +endif() diff --git a/python/ark/__init__.py b/python/ark/__init__.py index c20b50b8c..68b03ab29 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -1,12 +1,15 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. +import sys import os if os.environ.get("ARK_ROOT", None) is None: os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__)) -from . import _ark_core +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +import _ark_core from .model import Model diff --git a/python/ark/data_type.py b/python/ark/data_type.py index 8ab982106..41c4201c3 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. import numpy -from . import _ark_core +import _ark_core try: import torch diff --git a/python/ark/init.py b/python/ark/init.py index 32f530791..dbf7c1569 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from . import _ark_core +import _ark_core from .model import Model from .runtime import _RuntimeState diff --git a/python/ark/model.py b/python/ark/model.py index 87af88f49..e6208fc16 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. 
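# (Aside, not part of this hunk: a condensed sketch of the import scheme
# the hunks above converge on. python/ark/__init__.py puts its own
# directory on sys.path first,
#
#     sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
#     import _ark_core
#
# so sibling modules such as data_type.py, init.py, and model.py here can
# use absolute imports like `from _ark_core import _Model` in place of the
# earlier relative `from . import _ark_core` form.)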
from typing import NewType -from ._ark_core import _Model +from _ark_core import _Model _ModelState = NewType("_ModelState", None) diff --git a/python/ark/module.py b/python/ark/module.py index faeeea40d..d797da72c 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -5,7 +5,7 @@ import numpy as np from typing import Any, Dict, List, Union from .tensor import Tensor, Parameter -from .runtime import Runtime, DefaultPlanner +from .runtime import Runtime, Planner from .ops import tensor from .data_type import DataType @@ -183,7 +183,7 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: self.built_forward = True with Runtime.get_runtime() as rt: - rt.launch(plan=DefaultPlanner().plan()) + rt.launch(plan=Planner().plan()) for tns, arg in zip(self.forward_input_tensor_args, args): tns.copy(arg) for key, value in self.forward_input_tensor_kwargs.items(): diff --git a/python/ark/ops.py b/python/ark/ops.py index f890e5d1b..7d98f51c2 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import json from typing import Any, Dict, List, Iterable, Union from .tensor import Dims, Tensor, Parameter, NullTensor @@ -13,12 +12,6 @@ def _is_list_or_tuple(obj): return isinstance(obj, list) or isinstance(obj, tuple) -def _config_to_str(config: Union[str, Dict[str, Any]]) -> str: - if isinstance(config, str): - return config - return json.dumps(config) - - def _tensor( shape: Iterable[int], dtype: DataType = fp32, @@ -59,7 +52,6 @@ def add( input: Union[Tensor, float], other: Union[Tensor, float], output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "add", ) -> Union[Tensor, float]: """ @@ -83,14 +75,12 @@ def add( return input + other else: return Tensor( - Model.get_model().copy( - input + other, output._tensor, _config_to_str(config), name - ) + Model.get_model().copy(input + other, output._tensor, name) ) if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().add(a, b, output, _config_to_str(config), name), + Model.get_model().add(a, b, output, name), runtime_id=input.runtime_id, ) @@ -99,16 +89,13 @@ def cast( input: Tensor, dtype: DataType, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "cast", ) -> Tensor: """Type casting.""" if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().cast( - input._tensor, dtype.ctype(), output, _config_to_str(config), name - ), + Model.get_model().cast(input._tensor, dtype.ctype(), output, name), runtime_id=input.runtime_id, ) @@ -130,7 +117,6 @@ def constant( def copy( input: Union[Tensor, float], output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "copy", ) -> Tensor: """Data caopy.""" @@ -139,7 +125,7 @@ def copy( if isinstance(input, Tensor): intput = intput._tensor return Tensor( - Model.get_model().copy(intput, output, _config_to_str(config), name), + Model.get_model().copy(intput, output, name), runtime_id=input.runtime_id, ) @@ -148,7 +134,6 @@ def div( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "div", ) -> Tensor: """ @@ -164,9 +149,7 @@ def div( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().div( - input._tensor, other, output, _config_to_str(config), name - ), + Model.get_model().div(input._tensor, other, output, name), runtime_id=input.runtime_id, 
) @@ -175,7 +158,6 @@ def embedding( input: Tensor, weight: Tensor, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "embedding", ) -> Tensor: """Embedding layer.""" @@ -185,7 +167,7 @@ def embedding( output = output._tensor return Tensor( Model.get_model().embedding( - input._tensor, weight._tensor, output, _config_to_str(config), name + input._tensor, weight._tensor, output, name ), runtime_id=input.runtime_id, ) @@ -194,7 +176,6 @@ def embedding( def exp( input: Tensor, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "exp", ) -> Tensor: """ @@ -205,9 +186,7 @@ def exp( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().exp( - input._tensor, output, _config_to_str(config), name - ), + Model.get_model().exp(input._tensor, output, name), runtime_id=input.runtime_id, ) @@ -215,7 +194,6 @@ def exp( def gelu( input: Tensor, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "gelu", ) -> Tensor: """ @@ -229,9 +207,7 @@ def gelu( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().gelu( - input._tensor, output, _config_to_str(config), name - ), + Model.get_model().gelu(input._tensor, output, name), runtime_id=input.runtime_id, ) @@ -263,7 +239,6 @@ def matmul( output: Tensor = NullTensor, transpose_input: bool = False, transpose_other: bool = False, - config: Union[str, Dict[str, Any]] = "", name: str = "matmul", ) -> Tensor: """ @@ -286,7 +261,6 @@ def matmul( output, transpose_input, transpose_other, - _config_to_str(config), name, ), runtime_id=input.runtime_id, @@ -297,7 +271,6 @@ def mul( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "mul", ) -> Tensor: """ @@ -313,9 +286,7 @@ def mul( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().mul( - input._tensor, other, output, _config_to_str(config), name - ), + Model.get_model().mul(input._tensor, other, output, name), runtime_id=input.runtime_id, ) @@ -332,7 +303,6 @@ def reduce_max( axis: int, keepdims: bool = True, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "reduce_max", ) -> Tensor: """ @@ -345,7 +315,7 @@ def reduce_max( output = output._tensor return Tensor( Model.get_model().reduce_max( - input._tensor, axis, keepdims, output, _config_to_str(config), name + input._tensor, axis, keepdims, output, name ), runtime_id=input.runtime_id, ) @@ -356,7 +326,6 @@ def reduce_mean( axis: int, keepdims: bool = True, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "reduce_mean", ) -> Tensor: """ @@ -369,7 +338,7 @@ def reduce_mean( output = output._tensor return Tensor( Model.get_model().reduce_mean( - input._tensor, axis, keepdims, output, _config_to_str(config), name + input._tensor, axis, keepdims, output, name ), runtime_id=input.runtime_id, ) @@ -380,7 +349,6 @@ def reduce_sum( axis: int, keepdims: bool = True, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "reduce_sum", ) -> Tensor: """ @@ -395,7 +363,7 @@ def reduce_sum( output = output._tensor return Tensor( Model.get_model().reduce_sum( - input._tensor, axis, keepdims, output, _config_to_str(config), name + input._tensor, axis, keepdims, output, name ), runtime_id=input.runtime_id, ) @@ -404,7 +372,6 @@ def reduce_sum( def relu( input: Tensor, output: Tensor = 
NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "relu", ) -> Tensor: """ @@ -416,9 +383,7 @@ def relu( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().relu( - input._tensor, output, _config_to_str(config), name - ), + Model.get_model().relu(input._tensor, output, name), runtime_id=input.runtime_id, ) @@ -457,7 +422,6 @@ def rope( input: Tensor, other: Tensor, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "rope", ) -> Tensor: """ @@ -470,9 +434,7 @@ def rope( if input.runtime_id != other.runtime_id: raise ValueError("Tensors must be on the same runtime") return Tensor( - Model.get_model().rope( - input._tensor, other._tensor, output, _config_to_str(config), name - ), + Model.get_model().rope(input._tensor, other._tensor, output, name), runtime_id=input.runtime_id, ) @@ -480,7 +442,6 @@ def rope( def rsqrt( input: Tensor, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "rsqrt", ) -> Tensor: """ @@ -491,9 +452,7 @@ def rsqrt( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().rsqrt( - input._tensor, output, _config_to_str(config), name - ), + Model.get_model().rsqrt(input._tensor, output, name), runtime_id=input.runtime_id, ) @@ -521,7 +480,6 @@ def sharding( def sigmoid( input: Tensor, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "sigmoid", ) -> Tensor: """ @@ -533,9 +491,7 @@ def sigmoid( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().sigmoid( - input._tensor, output, _config_to_str(config), name - ), + Model.get_model().sigmoid(input._tensor, output, name), runtime_id=input.runtime_id, ) @@ -543,7 +499,6 @@ def sigmoid( def sqrt( input: Tensor, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "sqrt", ) -> Tensor: """ @@ -554,9 +509,7 @@ def sqrt( if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().sqrt( - input._tensor, output, _config_to_str(config), name - ), + Model.get_model().sqrt(input._tensor, output, name), runtime_id=input.runtime_id, ) @@ -565,7 +518,6 @@ def sub( input: Tensor, other: Union[Tensor, float], output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "sub", ) -> Tensor: """ @@ -581,9 +533,7 @@ def sub( raise ValueError("Tensors must be on the same runtime") other = other._tensor return Tensor( - Model.get_model().sub( - input._tensor, other, output, _config_to_str(config), name - ), + Model.get_model().sub(input._tensor, other, output, name), runtime_id=input.runtime_id, ) @@ -613,7 +563,6 @@ def transpose( input: Tensor, perm: Iterable[int], output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "transpose", ) -> Tensor: """ @@ -633,9 +582,7 @@ def transpose( if len(perm) > 4: raise ValueError("Only support perm up to 4 dimensions") return Tensor( - Model.get_model().transpose( - input._tensor, perm, output, _config_to_str(config), name - ), + Model.get_model().transpose(input._tensor, perm, output, name), runtime_id=input.runtime_id, ) @@ -648,7 +595,6 @@ def mean( axis: int, keepdims: bool = True, output: Tensor = NullTensor, - config: Union[str, Dict[str, Any]] = "", name: str = "mean", ) -> Tensor: """Alias of reduce_mean.""" @@ -764,9 +710,10 @@ def all_reduce( "reshape", "identity", "sharding", - "reduce_sum", - "reduce_mean", + "noop", "reduce_max", + "reduce_mean", + "reduce_sum", 
"layernorm", "softmax", "transpose", diff --git a/python/ark/tensor.py b/python/ark/tensor.py index eed7a4259..089d3eaed 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -4,7 +4,7 @@ import numpy as np from typing import Callable, List, Union, Type -from ._ark_core import _Dims, _Tensor, _NullTensor +from _ark_core import _Dims, _Tensor, _NullTensor from .data_type import DataType from .runtime import Runtime from .model import Model @@ -102,63 +102,6 @@ def to_numpy( rt.executor.tensor_read(self._tensor, ndarray, stream) return ndarray - def to_torch( - self, tensor: torch.Tensor = None, stream: int = 0 - ) -> torch.Tensor: - """ """ - if _no_torch: - raise ImportError("torch is not available") - rt = Runtime.get_runtime(self.runtime_id) - if not rt.launched(): - raise RuntimeError( - "Tensor is not allocated yet. `Tensor.to_torch()` is " - "usable only after you call `Runtime.launch()`." - ) - torch_type = self.dtype().to_torch() - if tensor is None: - dev_name = f"cuda:{rt.executor.device_id()}" - tensor = torch.zeros( - self.shape(), dtype=torch_type, device=torch.device(dev_name) - ) - elif list(tensor.shape) != self.shape(): - raise ValueError( - f"torch tensor shape {list(tensor.shape)} " - f"does not match the tensor {self.shape()}" - ) - elif tensor.dtype != torch_type: - raise ValueError( - f"torch tensor dtype {tensor.dtype} " - f"does not match the tensor {torch_type}" - ) - elif not tensor.is_contiguous(): - raise ValueError("torch tensor is not contiguous in memory") - elif tensor.numel() != self.nelems(): - raise ValueError( - f"torch tensor size {tensor.numel()} " - f"does not match the tensor {self.nelems()}" - ) - tensor_bytes = self.nelems() * self.dtype().element_size() - rt.executor.tensor_read( - self._tensor, tensor.data_ptr(), tensor_bytes, stream, True - ) - return tensor - - def get_torch_view(self) -> torch.Tensor: - """ - Returns a torch tensor that shares the same memory with the device tensor. - """ - if _no_torch: - raise ImportError("torch is not available") - rt = Runtime.get_runtime(self.runtime_id) - if not rt.launched(): - raise RuntimeError( - "Tensor is not allocated yet. `Tensor.get_torch_view()` is " - "usable only after you call `Runtime.launch()`." - ) - dl_tensor = rt.executor.get_dl_tensor(self._tensor) - torch_view = torch.utils.dlpack.from_dlpack(dl_tensor) - return torch_view - def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor": """ Copies the tensor from a host numpy array to the device. @@ -177,6 +120,37 @@ def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor": rt.executor.tensor_write(self._tensor, ndarray, stream) return self + def to_dlpack(self): + """ + Returns a DLPack tensor that shares the same memory with the device tensor. + """ + rt = Runtime.get_runtime(self.runtime_id) + if not rt.launched(): + raise RuntimeError( + "Tensor is not allocated yet. `Tensor.to_dlpack()` is " + "usable only after you call `Runtime.launch()`." + ) + return rt.executor.tensor_to_dlpack(self._tensor) + + @staticmethod + def from_dlpack(ext_tensor, runtime_id: int = -1) -> "Tensor": + """ + Copies the tensor from a DLPack tensor to the device. + """ + return Tensor(_Tensor(ext_tensor), runtime_id=runtime_id) + + def to_torch(self) -> torch.Tensor: + """ + Returns a torch tensor that shares the same memory with the device tensor. 
+ """ + if _no_torch: + raise ImportError("torch is not available") + dl_capsule = self.to_dlpack() + torch_view = torch.utils.dlpack.from_dlpack(dl_capsule) + # Keep dl_capsule alive not to free the memory + torch_view.__ark_buffer__ = dl_capsule + return torch_view + @staticmethod def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor": """ @@ -188,10 +162,10 @@ def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor": raise ValueError("Torch tensor must be contiguous.") elif tensor.device.type == "cpu": raise ValueError("Torch tensor must be on a device.") - ark_dtype = DataType.from_torch(tensor.dtype) - dl_capsule = torch.utils.dlpack.to_dlpack(tensor) - ark_tensor = _Tensor(dl_capsule, ark_dtype.ctype()) - return Tensor(ark_tensor, runtime_id=runtime_id) + return Tensor.from_dlpack( + torch.utils.dlpack.to_dlpack(tensor), + runtime_id=runtime_id, + ) def copy( self, data: Union[np.ndarray, torch.Tensor], stream: int = 0 diff --git a/python/ark_py.cpp b/python/ark_py.cpp index 75788ba55..1bc4255d6 100644 --- a/python/ark_py.cpp +++ b/python/ark_py.cpp @@ -7,7 +7,6 @@ namespace py = pybind11; -extern void register_plan_manager(py::module &m); extern void register_data_type(py::module &m); extern void register_dims(py::module &m); extern void register_error(py::module &m); @@ -23,7 +22,6 @@ extern void register_version(py::module &m); PYBIND11_MODULE(_ark_core, m) { m.doc() = "Bind ARK C++ APIs to Python"; - register_plan_manager(m); register_data_type(m); register_dims(m); register_error(m); diff --git a/python/executor_py.cpp b/python/executor_py.cpp index 126970d89..d90825e21 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -8,8 +8,10 @@ #include #include -#include -#include + +#include "gpu/gpu_memory.hpp" +#include "logging.hpp" + namespace py = pybind11; static void tensor_write(ark::Executor *exe, const ark::Tensor &tensor, @@ -42,37 +44,37 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, reinterpret_cast(stream), is_d2d); } -static DLDataType get_dl_dtype(const ark::DataType &ark_data_type) { - DLDataType dl_data_type; - dl_data_type.lanes = 1; - if (ark_data_type == ark::FP32) { - dl_data_type.code = kDLFloat; - dl_data_type.bits = 32; - } else if (ark_data_type == ark::FP16) { - dl_data_type.code = kDLFloat; - dl_data_type.bits = 16; - } else if (ark_data_type == ark::BF16) { - dl_data_type.code = kDLBfloat; - dl_data_type.bits = 16; - } else if (ark_data_type == ark::INT32) { - dl_data_type.code = kDLInt; - dl_data_type.bits = 32; - } else if (ark_data_type == ark::UINT32) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 32; - } else if (ark_data_type == ark::INT8) { - dl_data_type.code = kDLInt; - dl_data_type.bits = 8; - } else if (ark_data_type == ark::UINT8) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 8; - } else if (ark_data_type == ark::BYTE) { - dl_data_type.code = kDLUInt; - dl_data_type.bits = 8; +static DLDataType to_dl_dtype(const ark::DataType &ark_dtype) { + DLDataType dl_dtype; + dl_dtype.lanes = 1; + if (ark_dtype == ark::FP32) { + dl_dtype.code = kDLFloat; + dl_dtype.bits = 32; + } else if (ark_dtype == ark::FP16) { + dl_dtype.code = kDLFloat; + dl_dtype.bits = 16; + } else if (ark_dtype == ark::BF16) { + dl_dtype.code = kDLBfloat; + dl_dtype.bits = 16; + } else if (ark_dtype == ark::INT32) { + dl_dtype.code = kDLInt; + dl_dtype.bits = 32; + } else if (ark_dtype == ark::UINT32) { + dl_dtype.code = kDLUInt; + dl_dtype.bits = 32; + } else if (ark_dtype == ark::INT8) { + 
dl_dtype.code = kDLInt; + dl_dtype.bits = 8; + } else if (ark_dtype == ark::UINT8) { + dl_dtype.code = kDLUInt; + dl_dtype.bits = 8; + } else if (ark_dtype == ark::BYTE) { + dl_dtype.code = kDLUInt; + dl_dtype.bits = 8; } else { - throw std::runtime_error("unexpected error"); + ERR(ark::InternalError, "unexpected"); } - return dl_data_type; + return dl_dtype; } static DLDeviceType get_device_type() { @@ -85,66 +87,84 @@ static DLDeviceType get_device_type() { #endif } -static DLManagedTensor *to_dlpack(ark::Executor &exe, - const ark::Tensor &tensor) { - DLTensor dl_tensor; - dl_tensor.data = reinterpret_cast(exe.tensor_address(tensor)); - size_t offset_in_elements = - tensor.offsets().is_no_dim() ? 0 : tensor.offsets().vector()[0]; - dl_tensor.byte_offset = offset_in_elements * tensor.data_type().bytes(); - dl_tensor.device.device_type = get_device_type(); - dl_tensor.device.device_id = static_cast(exe.device_id()); - dl_tensor.ndim = static_cast(tensor.shape().ndims()); - dl_tensor.dtype = get_dl_dtype(tensor.data_type()); - - dl_tensor.shape = - tensor.shape().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; - dl_tensor.strides = - tensor.strides().is_no_dim() ? nullptr : new int64_t[dl_tensor.ndim]; - auto shape = tensor.shape(); - if (dl_tensor.shape) { - for (int i = 0; i < dl_tensor.ndim; ++i) { - dl_tensor.shape[i] = shape[i]; - } - } - if (dl_tensor.strides) { - dl_tensor.strides[dl_tensor.ndim - 1] = 1; - for (int i = dl_tensor.ndim - 2; i >= 0; --i) { - dl_tensor.strides[i] = - dl_tensor.shape[i + 1] * dl_tensor.strides[i + 1]; +namespace ark { + +class SharedTensor { + public: + SharedTensor(Executor &exe, const Tensor &tensor); + ~SharedTensor() = default; + + DLTensor dl_tensor() const; + + private: + std::shared_ptr buffer_; + void *data_; + int device_id_; + DataType dtype_; + std::shared_ptr> shape_; + std::shared_ptr> strides_; + std::shared_ptr> offsets_; +}; + +SharedTensor::SharedTensor(Executor &exe, const Tensor &tensor) { + buffer_ = exe.buffer(); + data_ = reinterpret_cast(exe.tensor_address(tensor)); + device_id_ = exe.device_id(); + dtype_ = tensor.data_type(); + shape_ = std::make_shared>(tensor.shape().vector()); + offsets_ = + std::make_shared>(tensor.offsets().vector()); + + strides_ = std::make_shared>(); + if (!shape_->empty()) { + int ndims = static_cast(shape_->size()); + strides_->resize(shape_->size()); + strides_->back() = 1; + auto tmp = tensor.strides().vector(); + for (int i = ndims - 2; i >= 0; --i) { + (*strides_)[i] = (*strides_)[i + 1] * tmp[i + 1]; } } - DLManagedTensor *dl_managed_tensor = new DLManagedTensor(); - dl_managed_tensor->dl_tensor = dl_tensor; - dl_managed_tensor->manager_ctx = nullptr; - dl_managed_tensor->deleter = [](DLManagedTensor *self) { - if (self->dl_tensor.shape) { - delete[] self->dl_tensor.shape; - self->dl_tensor.shape = nullptr; - } - if (self->dl_tensor.strides) { - delete[] self->dl_tensor.strides; - self->dl_tensor.strides = nullptr; - } - }; - return dl_managed_tensor; } -void free_capsule(PyObject *capsule) { - const char *name = PyCapsule_GetName(capsule); - auto *dl_managed_tensor = - static_cast(PyCapsule_GetPointer(capsule, name)); - if (dl_managed_tensor) { - dl_managed_tensor->deleter(dl_managed_tensor); - dl_managed_tensor = nullptr; - } +DLTensor SharedTensor::dl_tensor() const { + DLTensor dl_tensor; + dl_tensor.data = data_; + size_t offset_in_elements = offsets_->empty() ? 
0 : offsets_->at(0); + dl_tensor.byte_offset = offset_in_elements * dtype_.bytes(); + dl_tensor.device.device_type = get_device_type(); + dl_tensor.device.device_id = device_id_; + dl_tensor.ndim = static_cast(shape_->size()); + dl_tensor.dtype = to_dl_dtype(dtype_); + dl_tensor.shape = shape_->data(); + dl_tensor.strides = strides_->data(); + return dl_tensor; } -py::capsule to_dlpack_capsule(ark::Executor &self, const ark::Tensor &tensor) { - DLManagedTensor *dl_managed_tensor = to_dlpack(self, tensor); +} // namespace ark + +static py::capsule tensor_to_dlpack(ark::Executor &self, const ark::Tensor &tensor) { + auto shared_tensor = new ark::SharedTensor(self, tensor); + DLManagedTensor *dl_managed_tensor = new DLManagedTensor(); + dl_managed_tensor->dl_tensor = shared_tensor->dl_tensor(); + dl_managed_tensor->manager_ctx = shared_tensor; + dl_managed_tensor->deleter = [](DLManagedTensor *self) { + if (self->manager_ctx) { + delete static_cast(self->manager_ctx); + self->manager_ctx = nullptr; + } + }; const char *capsule_name = "dltensor"; PyObject *dl_capsule = PyCapsule_New(static_cast(dl_managed_tensor), - capsule_name, free_capsule); + capsule_name, [](PyObject *capsule) { + const char *name = PyCapsule_GetName(capsule); + auto *dl_managed_tensor = static_cast( + PyCapsule_GetPointer(capsule, name)); + if (dl_managed_tensor) { + dl_managed_tensor->deleter(dl_managed_tensor); + dl_managed_tensor = nullptr; + } + }); return py::reinterpret_steal(dl_capsule); } @@ -191,5 +211,5 @@ void register_executor(py::module &m) { size_t, uintptr_t, bool>(&tensor_write), py::arg("tensor"), py::arg("address"), py::arg("bytes"), py::arg("stream"), py::arg("is_d2d")) - .def("get_dl_tensor", &to_dlpack_capsule); + .def("tensor_to_dlpack", &tensor_to_dlpack); } diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp index 16eb03421..e7f06592d 100644 --- a/python/tensor_py.cpp +++ b/python/tensor_py.cpp @@ -8,6 +8,8 @@ #include +#include "logging.hpp" + namespace py = pybind11; struct DLTensorMetadata { @@ -40,12 +42,37 @@ static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor* dl_tensor) { return metadata; } +static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) { + if (dl_dtype.lanes != 1) { + ERR(ark::UnsupportedError, "unsupported data type"); + } + ark::DataType ark_dtype; + if (dl_dtype.code == kDLFloat && dl_dtype.bits == 32) { + ark_dtype = ark::FP32; + } else if (dl_dtype.code == kDLFloat && dl_dtype.bits == 16) { + ark_dtype = ark::FP16; + } else if (dl_dtype.code == kDLBfloat && dl_dtype.bits == 16) { + ark_dtype = ark::BF16; + } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 32) { + ark_dtype = ark::INT32; + } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 32) { + ark_dtype = ark::UINT32; + } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 8) { + ark_dtype = ark::INT8; + } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 8) { + ark_dtype = ark::UINT8; + } else { + ERR(ark::UnsupportedError, "unsupported data type"); + } + return ark_dtype; +} + void register_tensor(py::module& m) { py::class_(m, "_Tensor") - .def(py::init([](py::capsule capsule, const ark::DataType& dtype) { + .def(py::init([](py::capsule capsule) { DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule; if (!dl_tensor) { - throw std::runtime_error( + ERR(ark::InvalidUsageError, "Capsule does not contain a DLManagedTensor"); } DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor); @@ -53,7 +80,7 @@ void register_tensor(py::module& m) { void* data_ptr = metadata.data_ptr; 
auto shape = metadata.shape; - return new ark::Tensor(data_ptr, device_id, shape, dtype); + return ark::Tensor(data_ptr, device_id, shape, from_dl_dtype(metadata.dtype)); })) .def("id", &ark::Tensor::id) .def("shape", &ark::Tensor::shape, py::return_value_policy::reference) diff --git a/python/unittest/test.py b/python/unittest/test.py index 238b16fb6..d56932b83 100644 --- a/python/unittest/test.py +++ b/python/unittest/test.py @@ -10,4 +10,4 @@ from test_error import * from test_model import * from test_runtime import * -from test_conversion import * +from test_tensor import * diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index 8c00b51f8..c3d15d1b9 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -20,99 +20,99 @@ def test_runtime_relaunch(): assert rt.launched() == True -def test_multiple_runtime_launch(): - ark.init() - num_runtimes = 5 - for i in range(num_runtimes): - rt = ark.Runtime.get_runtime(i) - assert rt.launched() == False - rt.launch(plan=empty_plan, device_id=i) - assert rt.launched() == True - for i in range(num_runtimes): - rt = ark.Runtime.get_runtime(i) - assert rt.launched() == True - ark.Runtime.delete_all_runtimes() - - -def test_stop_runtime(): - ark.init() - rt1 = ark.Runtime.get_runtime(1) - rt1.launch(plan=empty_plan, device_id=1) - rt2 = ark.Runtime.get_runtime(2) - rt2.launch(plan=empty_plan, device_id=2) - rt1.stop() - rt1.reset() - assert rt1.state == ark.Runtime.State.Init - assert rt2.state == ark.Runtime.State.LaunchedNotRunning - ark.Runtime.delete_all_runtimes() - - -def test_reset_runtime(): - ark.init() - rt1 = ark.Runtime.get_runtime(0) - rt1.launch(plan=empty_plan, device_id=1) - rt2 = ark.Runtime.get_runtime(1) - rt2.launch(plan=empty_plan, device_id=2) - rt1.reset() - assert rt1.launched() == False - assert rt2.launched() == True - rt1.launch(plan=empty_plan) - assert rt1.launched() == True - ark.Runtime.delete_all_runtimes() - - -def test_multiple_runtimes_complex(): - ark.init() - num_runtimes = 3 - runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] - default_runtime = ark.Runtime.get_runtime() - runtime_list.append(default_runtime) - for i, rt in enumerate(runtime_list): - rt.launch(plan=empty_plan, device_id=i) - assert rt.launched() == True - runtime_list[0].stop() - assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning - for rt in runtime_list[1:]: - assert rt.launched() == True - runtime_list[1].reset() - assert runtime_list[1].state == ark.Runtime.State.Init - assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning - assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning - runtime_list[1].launch(plan=empty_plan, device_id=1) - for rt in runtime_list: - assert rt.launched() == True - ark.Runtime.delete_all_runtimes() - - -def test_runtime_state_after_reset(): - ark.init() - rt = ark.Runtime.get_runtime() - rt.launch(plan=empty_plan) - rt.reset() - assert rt.launched() == False - assert rt.running() == False - ark.Runtime.delete_all_runtimes() - - -def test_see_runtime_statuses(): - ark.init() - num_runtimes = 3 - runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] - runtime_statuses = ark.Runtime.see_runtime_statuses() - assert len(runtime_statuses) == num_runtimes - for i in range(num_runtimes): - assert i in runtime_statuses - for i, rt in enumerate(runtimes): - assert runtime_statuses[i] == rt - ark.Runtime.delete_all_runtimes() - - -def test_multiple_runtimes_init(): - ark.init() - runtimes 
= [ark.Runtime.get_runtime(i) for i in range(3)] - for rt in runtimes: - assert rt.state == ark.Runtime.State.Init - ark.init() - runtimes = ark.Runtime.see_runtime_statuses() - assert len(runtimes) == 0 - ark.Runtime.delete_all_runtimes() +# def test_multiple_runtime_launch(): +# ark.init() +# num_runtimes = 5 +# for i in range(num_runtimes): +# rt = ark.Runtime.get_runtime(i) +# assert rt.launched() == False +# rt.launch(plan=empty_plan, device_id=i) +# assert rt.launched() == True +# for i in range(num_runtimes): +# rt = ark.Runtime.get_runtime(i) +# assert rt.launched() == True +# ark.Runtime.delete_all_runtimes() + + +# def test_stop_runtime(): +# ark.init() +# rt1 = ark.Runtime.get_runtime(1) +# rt1.launch(plan=empty_plan, device_id=1) +# rt2 = ark.Runtime.get_runtime(2) +# rt2.launch(plan=empty_plan, device_id=2) +# rt1.stop() +# rt1.reset() +# assert rt1.state == ark.Runtime.State.Init +# assert rt2.state == ark.Runtime.State.LaunchedNotRunning +# ark.Runtime.delete_all_runtimes() + + +# def test_reset_runtime(): +# ark.init() +# rt1 = ark.Runtime.get_runtime(0) +# rt1.launch(plan=empty_plan, device_id=1) +# rt2 = ark.Runtime.get_runtime(1) +# rt2.launch(plan=empty_plan, device_id=2) +# rt1.reset() +# assert rt1.launched() == False +# assert rt2.launched() == True +# rt1.launch(plan=empty_plan) +# assert rt1.launched() == True +# ark.Runtime.delete_all_runtimes() + + +# def test_multiple_runtimes_complex(): +# ark.init() +# num_runtimes = 3 +# runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] +# default_runtime = ark.Runtime.get_runtime() +# runtime_list.append(default_runtime) +# for i, rt in enumerate(runtime_list): +# rt.launch(plan=empty_plan, device_id=i) +# assert rt.launched() == True +# runtime_list[0].stop() +# assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning +# for rt in runtime_list[1:]: +# assert rt.launched() == True +# runtime_list[1].reset() +# assert runtime_list[1].state == ark.Runtime.State.Init +# assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning +# assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning +# runtime_list[1].launch(plan=empty_plan, device_id=1) +# for rt in runtime_list: +# assert rt.launched() == True +# ark.Runtime.delete_all_runtimes() + + +# def test_runtime_state_after_reset(): +# ark.init() +# rt = ark.Runtime.get_runtime() +# rt.launch(plan=empty_plan) +# rt.reset() +# assert rt.launched() == False +# assert rt.running() == False +# ark.Runtime.delete_all_runtimes() + + +# def test_see_runtime_statuses(): +# ark.init() +# num_runtimes = 3 +# runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] +# runtime_statuses = ark.Runtime.see_runtime_statuses() +# assert len(runtime_statuses) == num_runtimes +# for i in range(num_runtimes): +# assert i in runtime_statuses +# for i, rt in enumerate(runtimes): +# assert runtime_statuses[i] == rt +# ark.Runtime.delete_all_runtimes() + + +# def test_multiple_runtimes_init(): +# ark.init() +# runtimes = [ark.Runtime.get_runtime(i) for i in range(3)] +# for rt in runtimes: +# assert rt.state == ark.Runtime.State.Init +# ark.init() +# runtimes = ark.Runtime.see_runtime_statuses() +# assert len(runtimes) == 0 +# ark.Runtime.delete_all_runtimes() diff --git a/python/unittest/test_tensor.py b/python/unittest/test_tensor.py new file mode 100644 index 000000000..1acad43ee --- /dev/null +++ b/python/unittest/test_tensor.py @@ -0,0 +1,23 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +from unittest_common import pytest_ark +import ark + + +@pytest_ark(need_torch=True) +def test_tensor_torch(): + import torch + + ones = torch.ones(2, 1024, device=torch.device("cuda:0")) + + t = ark.Tensor.from_torch(ones) + t = ark.mul(t, 5) + + with ark.Runtime() as rt: + rt.launch() + rt.run() + + x = t.to_torch() + + assert torch.allclose(x, ones * 5) From 8c2562b3b7ddeb5736bd10be28768222b7ad9a56 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 11 Aug 2024 21:22:35 +0000 Subject: [PATCH 057/106] remove runtime ID --- ark/api/tensor.cpp | 4 +- python/ark/init.py | 5 +- python/ark/ops.py | 132 ++++++++++-------------------------------- python/ark/runtime.py | 106 ++++++--------------------------- python/ark/tensor.py | 25 +++----- 5 files changed, 61 insertions(+), 211 deletions(-) diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp index 4d33bd9f1..4b5bdfd55 100644 --- a/ark/api/tensor.cpp +++ b/ark/api/tensor.cpp @@ -57,14 +57,14 @@ Dims Tensor::padded_shape() const { return Dims(); } -const DataType& Tensor::data_type() const { +const DataType &Tensor::data_type() const { if (ref_) { return DataType::from_name(ref_->data_type()->type_name()); } return NONE; } -std::ostream& operator<<(std::ostream& os, const Tensor& tensor) { +std::ostream &operator<<(std::ostream &os, const Tensor &tensor) { if (tensor.is_null()) { os << "null"; } else { diff --git a/python/ark/init.py b/python/ark/init.py index dbf7c1569..a4a67e85d 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -9,6 +9,7 @@ def init(): """Initializes ARK.""" Model.reset() - if _RuntimeState.runtime: - _RuntimeState.delete_all() + if _RuntimeState.runtime is not None: + del _RuntimeState.runtime + _RuntimeState.runtime = None _ark_core.init() diff --git a/python/ark/ops.py b/python/ark/ops.py index 7d98f51c2..5fe144150 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
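The ops.py hunks that follow strip the `runtime_id` argument and the per-op cross-runtime guards, so every op is built against the single global model and runtime. A rough sketch of the call pattern this leaves, assuming the post-patch Python API (`ark.tensor`, `ark.matmul`, `ark.add`, `ark.ones` as in the diff below):

    import ark

    ark.init()
    a = ark.tensor([64, 64], ark.fp16)   # no `runtime_id` keyword anymore
    b = ark.tensor([64, 64], ark.fp16)
    c = ark.matmul(a, b)                 # no cross-runtime check to trip over
    d = ark.add(c, ark.ones([64, 64], ark.fp16))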
-from typing import Any, Dict, List, Iterable, Union +from typing import List, Iterable, Union from .tensor import Dims, Tensor, Parameter, NullTensor from .data_type import DataType, fp32 @@ -61,8 +61,6 @@ def add( tensor_add = ark.add(tensor1, tensor2) """ if isinstance(input, Tensor) and isinstance(other, Tensor): - if input.runtime_id != other.runtime_id: - raise ValueError("Tensors must be on the same runtime") a = input._tensor b = other._tensor elif isinstance(input, Tensor): @@ -79,10 +77,7 @@ ) if output is not NullTensor: output = output._tensor - return Tensor( - Model.get_model().add(a, b, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().add(a, b, output, name)) def cast( @@ -95,8 +90,7 @@ if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().cast(input._tensor, dtype.ctype(), output, name), - runtime_id=input.runtime_id, + Model.get_model().cast(input._tensor, dtype.ctype(), output, name) ) def constant( @@ -105,12 +99,10 @@ shape: Iterable[int], dtype: DataType = fp32, name: str = "constant", - runtime_id: int = -1, ) -> Tensor: """Constant.""" return Tensor( - Model.get_model().constant(value, Dims(shape), dtype.ctype(), name), - runtime_id=runtime_id, + Model.get_model().constant(value, Dims(shape), dtype.ctype(), name) ) def copy( @@ -124,10 +116,7 @@ output = output._tensor if isinstance(input, Tensor): input = input._tensor - return Tensor( - Model.get_model().copy(input, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().copy(input, output, name)) def div( @@ -145,13 +134,8 @@ if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): - if input.runtime_id != other.runtime_id: - raise ValueError("Tensors must be on the same runtime") other = other._tensor - return Tensor( - Model.get_model().div(input._tensor, other, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().div(input._tensor, other, output, name)) def embedding( input: Tensor, weight: Tensor, output: Tensor = NullTensor, name: str = "embedding", ) -> Tensor: """Embedding layer.""" - if input.runtime_id != weight.runtime_id: - raise ValueError("Tensors must be on the same runtime") if output is not NullTensor: output = output._tensor return Tensor( - Model.get_model().embedding( - input._tensor, weight._tensor, output, name - ), - runtime_id=input.runtime_id, + Model.get_model().embedding(input._tensor, weight._tensor, output, name) ) def exp( @@ -185,10 +164,7 @@ """ if output is not NullTensor: output = output._tensor - return Tensor( - Model.get_model().exp(input._tensor, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().exp(input._tensor, output, name)) def gelu( @@ -206,10 +182,7 @@ """ if output is not NullTensor: output = output._tensor - return Tensor( - Model.get_model().gelu(input._tensor, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().gelu(input._tensor, output, name)) def identity( @@ -224,13 +197,8 @@ for dep in deps: if not isinstance(dep, Tensor): raise TypeError("All dependencies should be a tensor") - if input.runtime_id != dep.runtime_id: - raise ValueError("All tensors must be on the same runtime") dep_tensors.append(dep._tensor) - return Tensor( - Model.get_model().identity(input._tensor, dep_tensors, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().identity(input._tensor, dep_tensors, name)) def matmul( @@
-250,8 +218,6 @@ def matmul( Usage: tensor_matmul = ark.matmul(tensor1, tensor2) """ - if input.runtime_id != other.runtime_id: - raise ValueError("Tensors must be on the same runtime") if output is not NullTensor: output = output._tensor return Tensor( @@ -262,8 +228,7 @@ def matmul( transpose_input, transpose_other, name, - ), - runtime_id=input.runtime_id, + ) ) @@ -282,13 +247,8 @@ def mul( if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): - if input.runtime_id != other.runtime_id: - raise ValueError("Tensors must be on the same runtime") other = other._tensor - return Tensor( - Model.get_model().mul(input._tensor, other, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().mul(input._tensor, other, output, name)) def noop(input: Tensor, name: str = "noop"): @@ -316,8 +276,7 @@ def reduce_max( return Tensor( Model.get_model().reduce_max( input._tensor, axis, keepdims, output, name - ), - runtime_id=input.runtime_id, + ) ) @@ -339,8 +298,7 @@ def reduce_mean( return Tensor( Model.get_model().reduce_mean( input._tensor, axis, keepdims, output, name - ), - runtime_id=input.runtime_id, + ) ) @@ -364,8 +322,7 @@ def reduce_sum( return Tensor( Model.get_model().reduce_sum( input._tensor, axis, keepdims, output, name - ), - runtime_id=input.runtime_id, + ) ) @@ -382,10 +339,7 @@ def relu( """ if output is not NullTensor: output = output._tensor - return Tensor( - Model.get_model().relu(input._tensor, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().relu(input._tensor, output, name)) def reshape( @@ -413,8 +367,7 @@ def reshape( if len(shape) > 4: raise ValueError("Only support tensors with up to 4 dimensions") return Tensor( - Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name), - runtime_id=input.runtime_id, + Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name) ) @@ -431,11 +384,8 @@ def rope( """ if output is not NullTensor: output = output._tensor - if input.runtime_id != other.runtime_id: - raise ValueError("Tensors must be on the same runtime") return Tensor( - Model.get_model().rope(input._tensor, other._tensor, output, name), - runtime_id=input.runtime_id, + Model.get_model().rope(input._tensor, other._tensor, output, name) ) @@ -451,10 +401,7 @@ def rsqrt( """ if output is not NullTensor: output = output._tensor - return Tensor( - Model.get_model().rsqrt(input._tensor, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().rsqrt(input._tensor, output, name)) def sharding( @@ -472,9 +419,7 @@ def sharding( _tensor_list = Model.get_model().sharding( input._tensor, axis, dim_per_shard, name ) - return [ - Tensor(_tensor, runtime_id=input.runtime_id) for _tensor in _tensor_list - ] + return [Tensor(_tensor) for _tensor in _tensor_list] def sigmoid( @@ -490,10 +435,7 @@ def sigmoid( """ if output is not NullTensor: output = output._tensor - return Tensor( - Model.get_model().sigmoid(input._tensor, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().sigmoid(input._tensor, output, name)) def sqrt( @@ -508,10 +450,7 @@ def sqrt( """ if output is not NullTensor: output = output._tensor - return Tensor( - Model.get_model().sqrt(input._tensor, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().sqrt(input._tensor, output, name)) def sub( @@ -529,13 +468,8 @@ def sub( if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): - if input.runtime_id 
!= other.runtime_id: - raise ValueError("Tensors must be on the same runtime") other = other._tensor - return Tensor( - Model.get_model().sub(input._tensor, other, output, name), - runtime_id=input.runtime_id, - ) + return Tensor(Model.get_model().sub(input._tensor, other, output, name)) def tensor( @@ -546,7 +480,6 @@ def tensor( padded_shape: Iterable[int] = [], rank: int = -1, name: str = "", - runtime_id: int = -1, ) -> Tensor: """ Construct a tensor with given shape and data type. @@ -582,8 +515,7 @@ def transpose( if len(perm) > 4: raise ValueError("Only support perm up to 4 dimensions") return Tensor( - Model.get_model().transpose(input._tensor, perm, output, name), - runtime_id=input.runtime_id, + Model.get_model().transpose(input._tensor, perm, output, name) ) @@ -598,19 +530,17 @@ def mean( name: str = "mean", ) -> Tensor: """Alias of reduce_mean.""" - return reduce_mean(input, axis, keepdims, output, config, name) + return reduce_mean(input, axis, keepdims, output, name) def ones( shape: Iterable[int], dtype: DataType = fp32, - name: str = "ones", - runtime_id: int = -1, + name: str = "ones" ) -> Tensor: """Ones.""" return Tensor( - Model.get_model().constant(1, Dims(shape), dtype.ctype(), name), - runtime_id=runtime_id, + Model.get_model().constant(1, Dims(shape), dtype.ctype(), name) ) @@ -621,14 +551,12 @@ def parameter( offsets: Iterable[int] = [], padded_shape: Iterable[int] = [], name: str = "", - runtime_id: int = -1, ) -> Parameter: """ Construct a parameter with given shape and data type. """ return Parameter( - _tensor(shape, dtype, strides, offsets, padded_shape, name), - runtime_id=runtime_id, + _tensor(shape, dtype, strides, offsets, padded_shape, name) ) @@ -665,12 +593,10 @@ def zeros( shape: Iterable[int], dtype: DataType = fp32, name: str = "zeros", - runtime_id: int = -1, ) -> Tensor: """Zeros.""" return Tensor( - Model.get_model().constant(0, Dims(shape), dtype.ctype(), name), - runtime_id=runtime_id, + Model.get_model().constant(0, Dims(shape), dtype.ctype(), name) ) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index c2e507bca..671953df1 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -14,36 +14,7 @@ class _RuntimeState: The _RuntimeState class is used to store the state of the model. """ - runtime: Dict[int, "Runtime"] = {} - - @staticmethod - def reset_all(): - """ - Resets all runtimes. - """ - runtime_ids = list(_RuntimeState.runtime.keys()) - for runtime_id in runtime_ids: - _RuntimeState.runtime[runtime_id].reset() - - @staticmethod - def delete_all(): - """ - Deletes all runtimes. - """ - runtime_ids = list(_RuntimeState.runtime.keys()) - for runtime_id in runtime_ids: - _RuntimeState.runtime[runtime_id].reset(delete=True) - - @staticmethod - def print_runtime_states(): - """ - Print runtimes and their corresponding states. 
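The runtime.py rewrite around here collapses the `Dict[int, Runtime]` registry into a single `_RuntimeState.runtime` slot, and the multi-runtime helpers (`reset_all`, `delete_all`, `print_runtime_states`) disappear with it. A minimal sketch of the lifecycle that remains, assuming the post-patch API:

    import ark

    ark.init()
    x = ark.tensor([1024], ark.fp32)
    y = ark.exp(x)

    rt = ark.Runtime.get_runtime()   # creates the singleton on first use
    rt.launch()                      # plan defaults to Planner(device_id).plan()
    rt.run(iter=10)                  # blocking unless non_blocking=True
    elapsed_ms = rt.stop()           # returns the elapsed time in milliseconds
    rt.reset()                       # back to Runtime.State.Init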
- """ - print(f"{'Runtime ID':<12} | {'Status':<20}") - print(f"{'-'*12} | {'-'*20}") - for runtime_id, runtime in _RuntimeState.runtime.items(): - runtime_id = "-1(Default)" if runtime_id == -1 else runtime_id - print(f"{runtime_id:<12} | {runtime.state:<20}") + runtime = None class Executor(_Executor): @@ -64,11 +35,10 @@ class State(Enum): LaunchedNotRunning = 1 Running = 2 - def __init__(self, runtime_id: int = -1): - self.runtime_id = runtime_id + def __init__(self): self.executor: Executor = None self.state: Runtime.State = Runtime.State.Init - _RuntimeState.runtime[runtime_id] = self + _RuntimeState.runtime = self def get_state(self) -> "Runtime.State": """ @@ -77,36 +47,16 @@ def get_state(self) -> "Runtime.State": return self.state @staticmethod - def exists(runtime_id: int) -> bool: - """ - Check if a runtime exists with the given ID. + def get_runtime() -> "Runtime": """ - return runtime_id in _RuntimeState.runtime - - @staticmethod - def get_all_ids() -> List[int]: - """ - Get a list of all existing runtime IDs. - """ - return list(_RuntimeState.runtime.keys()) - - @staticmethod - def get_runtime(runtime_id=-1) -> "Runtime": - """ - Get the runtime by ID. If runtime_id is not provided, use a default ID of -1. - If the runtime does not exist, create a new runtime with the given ID. - """ - if runtime_id not in _RuntimeState.runtime: - _RuntimeState.runtime[runtime_id] = Runtime(runtime_id) - return _RuntimeState.runtime[runtime_id] - - @staticmethod - def see_runtime_statuses() -> "Dict[int, Runtime]": - """ - Returns the runtime dictionary containing all of the runtimes. + Get the runtime. + If the runtime does not exist, create a new runtime. """ + if _RuntimeState.runtime is None: + _RuntimeState.runtime = Runtime() return _RuntimeState.runtime + def __enter__(self): return self @@ -142,7 +92,7 @@ def launch( """ if self.launched(): logging.warning( - f"Runtime {self.runtime_id} is already launched, skip launching" + f"Runtime is already launched, skip launching" ) return plan = Planner(device_id).plan() if plan is None else plan @@ -152,7 +102,7 @@ def launch( if self.executor is not None: if not self.executor.destroyed(): logging.warning( - f"Runtime {self.runtime_id}, has already been launched. Destroying the old executor" + f"Runtime has already been launched. Destroying the old executor" ) self.executor.destroy() self.executor = Executor( @@ -171,8 +121,8 @@ def run(self, iter=1, non_blocking=False): Run the ARK program for iter iterations and wait for the kernel to finish. """ if self.state != Runtime.State.LaunchedNotRunning: - logging.error(f"ARK runtime {self.runtime_id} is not launched") - raise RuntimeError(f"ARK runtime {self.runtime_id} is not launched") + logging.error(f"ARK runtime is not launched") + raise RuntimeError(f"ARK runtime is not launched") self.state = Runtime.State.Running self.executor.run(iter) if not non_blocking: @@ -193,7 +143,7 @@ def wait(self): """ if self.state != Runtime.State.Running: logging.warning( - f"ARK runtime {self.runtime_id} is not running, skip waiting" + f"ARK runtime is not running, skip waiting" ) return self.executor.wait() @@ -206,7 +156,7 @@ def stop(self) -> float: """ if not self.launched(): logging.warning( - f"ARK runtime {self.runtime_id} is never launched, skip stopping" + f"ARK runtime is never launched, skip stopping" ) return elapsed = self.executor.stop() @@ -215,7 +165,7 @@ def stop(self) -> float: def reset(self, delete=False): """ - Reset the runtime. 
If delete is True, delete the runtime associated with the runtime_id. + Reset the runtime. If delete is True, delete the runtime. """ if self.launched(): self.stop() @@ -225,25 +175,5 @@ def reset(self, delete=False): self.executor = None self.state = Runtime.State.Init if delete: - del _RuntimeState.runtime[self.runtime_id] - - @staticmethod - def reset_all_runtimes(): - """ - Reset all runtimes. - """ - _RuntimeState.reset_all() - - @staticmethod - def delete_all_runtimes(): - """ - Delete all runtimes. - """ - _RuntimeState.delete_all() - - @staticmethod - def print_runtime_states(): - """ - Print runtimes and their corresponding states. - """ - _RuntimeState.print_runtime_states() + del _RuntimeState.runtime + _RuntimeState.runtime = None diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 089d3eaed..a950c3d1d 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -33,18 +33,15 @@ def __init__( self, _tensor: _Tensor, initializer: Initializer = None, - runtime_id: int = -1, ): """ Initializes a new instance of the Tensor class. Args: _tensor (_ark_core._Tensor): The underlying _Tensor object. intializer (Initializer): The initializer for the Tensor. - runtime_id (int): The ID of the Runtime to use. Defaults to -1, which is the default Runtime. """ self._tensor = _tensor self.initializer: Initializer = initializer - self.runtime_id = runtime_id def shape(self) -> List[int]: """ @@ -83,7 +80,7 @@ def to_numpy( raise ValueError( f"Tensor data type {self.dtype().__name__} is not supported by numpy." ) - rt = Runtime.get_runtime(self.runtime_id) + rt = Runtime.get_runtime() if not rt.launched(): raise RuntimeError( "Tensor is not allocated yet. `Tensor.to_numpy()` is " @@ -106,7 +103,7 @@ def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor": """ Copies the tensor from a host numpy array to the device. """ - rt = Runtime.get_runtime(self.runtime_id) + rt = Runtime.get_runtime() if not rt.launched(): raise RuntimeError( "Tensor is not allocated yet. `Tensor.from_numpy()` is " @@ -124,7 +121,7 @@ def to_dlpack(self): """ Returns a DLPack tensor that shares the same memory with the device tensor. """ - rt = Runtime.get_runtime(self.runtime_id) + rt = Runtime.get_runtime() if not rt.launched(): raise RuntimeError( "Tensor is not allocated yet. `Tensor.to_dlpack()` is " @@ -133,11 +130,11 @@ def to_dlpack(self): return rt.executor.tensor_to_dlpack(self._tensor) @staticmethod - def from_dlpack(ext_tensor, runtime_id: int = -1) -> "Tensor": + def from_dlpack(ext_tensor) -> "Tensor": """ Copies the tensor from a DLPack tensor to the device. """ - return Tensor(_Tensor(ext_tensor), runtime_id=runtime_id) + return Tensor(_Tensor(ext_tensor)) def to_torch(self) -> torch.Tensor: """ @@ -152,7 +149,7 @@ def to_torch(self) -> torch.Tensor: return torch_view @staticmethod - def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor": + def from_torch(tensor: torch.Tensor) -> "Tensor": """ Returns an ARK tensor that shares the same memory with the torch tensor. 
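Since `from_torch` now wraps the torch tensor's memory through DLPack instead of copying it, the contiguity and device checks below are the only requirements. A sketch of both interop directions, mirroring the pattern that test_tensor.py exercises:

    import ark
    import torch

    src = torch.ones(2, 1024, device="cuda:0")  # contiguous, on the GPU
    t = ark.Tensor.from_torch(src)              # zero-copy wrap of `src`
    # ark.Tensor.from_torch(torch.ones(4))      # would raise: CPU tensor
    # ark.Tensor.from_torch(src.t())            # would raise: not contiguous

    out = ark.mul(t, 5)
    rt = ark.Runtime.get_runtime()
    rt.launch()
    rt.run()
    view = out.to_torch()                       # torch view over ARK memory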
""" @@ -162,10 +159,7 @@ def from_torch(tensor: torch.Tensor, runtime_id: int = -1) -> "Tensor": raise ValueError("Torch tensor must be contiguous.") elif tensor.device.type == "cpu": raise ValueError("Torch tensor must be on a device.") - return Tensor.from_dlpack( - torch.utils.dlpack.to_dlpack(tensor), - runtime_id=runtime_id, - ) + return Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor)) def copy( self, data: Union[np.ndarray, torch.Tensor], stream: int = 0 @@ -174,7 +168,7 @@ def copy( Copies data into this tensor. The data type may differ, but the size must match. """ - rt = Runtime.get_runtime(self.runtime_id) + rt = Runtime.get_runtime() if not rt.launched(): raise RuntimeError( "Tensor is not allocated yet. `Tensor.from_numpy()` is " @@ -218,9 +212,8 @@ class Parameter(Tensor): A tensor as a parameter. """ - def __init__(self, _tensor: _Tensor, runtime_id: int = -1): + def __init__(self, _tensor: _Tensor): """ Initializes a new instance of the Parameter class. """ super().__init__(_tensor) - self.runtime_id = runtime_id From 9b265c7cc1d70c46919c389952cf37467e917632 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 11 Aug 2024 21:38:14 +0000 Subject: [PATCH 058/106] Fix communication --- ark/api/executor.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 50686c434..86243f10d 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -405,20 +405,20 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { continue; } else { buffer_id_to_offset[buf_info->buffer->id()] = offset; + for (const auto &tag_info : buf_info->buffer->send_tags()) { + remote_rank_to_send_tags_and_offsets[tag_info.first] + .first.push_back(tag_info.second); + remote_rank_to_send_tags_and_offsets[tag_info.first] + .second.push_back(offset); + } + for (const auto &tag_info : buf_info->buffer->recv_tags()) { + remote_rank_to_recv_tags_and_offsets[tag_info.first] + .first.push_back(tag_info.second); + remote_rank_to_recv_tags_and_offsets[tag_info.first] + .second.push_back(offset); + } offset += buf_info->bytes; } - for (const auto &tag_info : buf_info->buffer->send_tags()) { - remote_rank_to_send_tags_and_offsets[tag_info.first] - .first.push_back(tag_info.second); - remote_rank_to_send_tags_and_offsets[tag_info.first] - .second.push_back(offset); - } - for (const auto &tag_info : buf_info->buffer->recv_tags()) { - remote_rank_to_recv_tags_and_offsets[tag_info.first] - .first.push_back(tag_info.second); - remote_rank_to_recv_tags_and_offsets[tag_info.first] - .second.push_back(offset); - } } total_bytes_ = offset; From 5d5342a27cd6a5d761757a92a074c86c3e0e3a62 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 11 Aug 2024 22:06:54 +0000 Subject: [PATCH 059/106] Add `Tensor::torch_strides` method --- ark/api/tensor.cpp | 17 +++++++++++++++++ ark/include/ark/tensor.hpp | 2 ++ python/ark/ops.py | 4 +--- python/ark/runtime.py | 13 +++---------- python/executor_py.cpp | 13 ++----------- python/tensor_py.cpp | 15 ++++++--------- 6 files changed, 31 insertions(+), 33 deletions(-) diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp index 4b5bdfd55..4fb60aef6 100644 --- a/ark/api/tensor.cpp +++ b/ark/api/tensor.cpp @@ -64,6 +64,23 @@ const DataType &Tensor::data_type() const { return NONE; } +Dims Tensor::torch_strides() const { + if (ref_) { + Dims st = ref_->strides(); + int ndims = st.ndims(); + std::vector tmp; + for (int i = 1; i < ndims; ++i) { + tmp.push_back(st[i]); + } + tmp.push_back(1); 
+ for (int i = ndims - 2; i >= 0; --i) { + tmp[i] *= tmp[i + 1]; + } + return Dims(tmp); + } + return Dims(); +} + std::ostream &operator<<(std::ostream &os, const Tensor &tensor) { if (tensor.is_null()) { os << "null"; diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp index d13748175..5e463f99d 100644 --- a/ark/include/ark/tensor.hpp +++ b/ark/include/ark/tensor.hpp @@ -52,6 +52,8 @@ class Tensor { Dims padded_shape() const; const DataType &data_type() const; + + Dims torch_strides() const; }; const Tensor NullTensor; diff --git a/python/ark/ops.py b/python/ark/ops.py index 5fe144150..f8b75a70b 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -534,9 +534,7 @@ def mean( def ones( - shape: Iterable[int], - dtype: DataType = fp32, - name: str = "ones" + shape: Iterable[int], dtype: DataType = fp32, name: str = "ones" ) -> Tensor: """Ones.""" return Tensor( diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 671953df1..712addc29 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -56,7 +56,6 @@ def get_runtime() -> "Runtime": _RuntimeState.runtime = Runtime() return _RuntimeState.runtime - def __enter__(self): return self @@ -91,9 +90,7 @@ def launch( initialized. The executor will compile the cuda kernels and launch the ARK runtime. """ if self.launched(): - logging.warning( - f"Runtime is already launched, skip launching" - ) + logging.warning(f"Runtime is already launched, skip launching") return plan = Planner(device_id).plan() if plan is None else plan # If the RuntimeState is init, we need to create a new executor and @@ -142,9 +139,7 @@ def wait(self): Wait for the kernel to finish. """ if self.state != Runtime.State.Running: - logging.warning( - f"ARK runtime is not running, skip waiting" - ) + logging.warning(f"ARK runtime is not running, skip waiting") return self.executor.wait() self.state = Runtime.State.LaunchedNotRunning @@ -155,9 +150,7 @@ def stop(self) -> float: Once this is called, we need to call `launch()` again to run the model again. 
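For intuition, the `Tensor::torch_strides()` helper added above is the standard padded-shape-to-row-major-strides conversion that DLPack and torch expect. An equivalent Python rendition (a hypothetical helper for illustration only, not part of the API):

    def torch_strides(dims):
        # dims are ARK's per-axis strides (the padded shape), e.g. [2, 8, 128];
        # the result is row-major element strides, e.g. [8 * 128, 128, 1].
        out = list(dims[1:]) + [1]
        for i in range(len(out) - 2, -1, -1):
            out[i] *= out[i + 1]
        return out

    assert torch_strides([2, 8, 128]) == [1024, 128, 1]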
""" if not self.launched(): - logging.warning( - f"ARK runtime is never launched, skip stopping" - ) + logging.warning(f"ARK runtime is never launched, skip stopping") return elapsed = self.executor.stop() self.state = Runtime.State.LaunchedNotRunning diff --git a/python/executor_py.cpp b/python/executor_py.cpp index d90825e21..f42e59ee9 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -112,19 +112,10 @@ SharedTensor::SharedTensor(Executor &exe, const Tensor &tensor) { device_id_ = exe.device_id(); dtype_ = tensor.data_type(); shape_ = std::make_shared>(tensor.shape().vector()); + strides_ = + std::make_shared>(tensor.torch_strides().vector()); offsets_ = std::make_shared>(tensor.offsets().vector()); - - strides_ = std::make_shared>(); - if (!shape_->empty()) { - int ndims = static_cast(shape_->size()); - strides_->resize(shape_->size()); - strides_->back() = 1; - auto tmp = tensor.strides().vector(); - for (int i = ndims - 2; i >= 0; --i) { - (*strides_)[i] = (*strides_)[i + 1] * tmp[i + 1]; - } - } } DLTensor SharedTensor::dl_tensor() const { diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp index e7f06592d..5abb35c66 100644 --- a/python/tensor_py.cpp +++ b/python/tensor_py.cpp @@ -83,15 +83,12 @@ void register_tensor(py::module& m) { return ark::Tensor(data_ptr, device_id, shape, from_dl_dtype(metadata.dtype)); })) .def("id", &ark::Tensor::id) - .def("shape", &ark::Tensor::shape, py::return_value_policy::reference) - .def("strides", &ark::Tensor::strides, - py::return_value_policy::reference) - .def("offsets", &ark::Tensor::offsets, - py::return_value_policy::reference) - .def("padded_shape", &ark::Tensor::padded_shape, - py::return_value_policy::reference) - .def("data_type", &ark::Tensor::data_type, - py::return_value_policy::reference); + .def("shape", &ark::Tensor::shape) + .def("strides", &ark::Tensor::strides) + .def("offsets", &ark::Tensor::offsets) + .def("padded_shape", &ark::Tensor::padded_shape) + .def("data_type", &ark::Tensor::data_type) + .def("torch_strides", &ark::Tensor::torch_strides); m.attr("_NullTensor") = &ark::NullTensor; } From 598cb78b351de2471e3d2386374504f4820adcd4 Mon Sep 17 00:00:00 2001 From: Noli Gerawork Date: Sun, 11 Aug 2024 18:18:51 -0400 Subject: [PATCH 060/106] Torch Support (#237) - Adds `RuntimeModule`. - Adds support for running multiple consecutive plans. 
- Pass buffers (externally allocated or from a previous plan), as kernel arguments - Adds gradient computation logic for ARK tensors/parameters --------- Co-authored-by: Changho Hwang --- ark/api/executor.cpp | 114 +++++++++++++--- ark/api/tensor.cpp | 7 +- ark/codegen.cpp | 101 +++++++++++--- ark/codegen.hpp | 4 +- ark/include/ark/error.hpp | 2 +- ark/include/ark/executor.hpp | 5 +- ark/include/kernels/kernel_template.in | 18 +-- docs/env.md | 4 + examples/tutorial/model_test_tutorial.py | 163 +++++++++++++++++++++++ examples/tutorial/torch_tutorial.py | 27 ---- python/ark/init.py | 4 +- python/ark/module.py | 144 +++++++++++--------- python/ark/runtime.py | 21 ++- python/ark/tensor.py | 55 +++++++- python/ark/torch_mock.py | 20 +++ python/executor_py.cpp | 3 +- python/unittest/test_runtime.py | 154 ++++++++------------- 17 files changed, 592 insertions(+), 254 deletions(-) create mode 100644 examples/tutorial/model_test_tutorial.py delete mode 100644 examples/tutorial/torch_tutorial.py diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 86243f10d..4634ed6fd 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -25,9 +25,9 @@ #include "gpu/gpu_manager.hpp" #include "logging.hpp" #include "model/model_buffer.hpp" -#include "model_buffer_manager.hpp" #include "model/model_data_type.hpp" #include "model/model_tensor.hpp" +#include "model_buffer_manager.hpp" #include "utils/utils_net.hpp" #if defined(ARK_CUDA) @@ -148,16 +148,17 @@ class Executor::Impl { Impl(int device_id, Stream stream, const std::string &name, bool loop_mode); ~Impl(); - void init(const PlanJson& plan); + void init(const PlanJson &plan); int device_id() const { return device_id_; } Stream stream() const { return reinterpret_cast(stream_raw_); } - std::shared_ptr buffer() const { return buffer_; } + std::shared_ptr buffer() const { return buffers_.back(); } std::string plan() const { return plan_json_.dump_pretty(); } + void add_plan(const std::string &plan); void compile(); void launch(); void run(int iter); @@ -165,7 +166,7 @@ class Executor::Impl { float stop(int64_t max_spin_count); void barrier(); - uintptr_t tensor_address(const Tensor &tensor) const; + void *tensor_address(const Tensor &tensor) const; void tensor_read(const Tensor &tensor, void *data, size_t bytes, Stream stream, bool is_d2d) const; @@ -175,6 +176,8 @@ class Executor::Impl { private: void init_communicator(); std::map init_buffers(const Json &plan_json); + std::map init_buffer_addrs( + void *buffer_base, const std::map &buffer_id_to_offset); std::set init_remote_ranks(const Json &plan_json) const; void init_channels(const std::set &remote_ranks); @@ -183,6 +186,8 @@ class Executor::Impl { std::string name_; bool loop_mode_; + bool is_buffer_allocated_; + gpuStream stream_raw_; int rank_; @@ -193,12 +198,16 @@ class Executor::Impl { float elapsed_msec_ = -1; PlanJson plan_json_; + std::vector external_buffers_; + std::vector external_args_; + std::map buffer_id_to_name_; std::map buffer_id_to_offset_; + std::map buffer_id_to_addr_; size_t total_bytes_; std::shared_ptr codegen_; std::shared_ptr timer_begin_; std::shared_ptr timer_end_; - std::shared_ptr buffer_; + std::vector> buffers_; std::shared_ptr flag_; std::shared_ptr stream_; std::shared_ptr kernel_; @@ -239,11 +248,12 @@ void Executor::Impl::init(const PlanJson &plan_json) { ERR(InvalidUsageError, "Invalid rank ", rank_, " with world size ", world_size_); } - if (world_size_ > 1) { + if (world_size_ > 1 && !comm_) { init_communicator(); } auto gpu_manager = 
GpuManager::get_instance(device_id_); + if (!gpu_manager->info().arch->belongs_to( Arch::from_name(plan_json.at("Architecture")))) { LOG(WARN, "Architecture name of the plan `", @@ -260,12 +270,20 @@ void Executor::Impl::init(const PlanJson &plan_json) { std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", "; } - codegen_ = std::make_shared(plan_json_, buffer_id_to_offset_, - name_); - timer_begin_ = gpu_manager->create_event(); timer_end_ = gpu_manager->create_event(); - buffer_ = gpu_manager->malloc(total_bytes_, 65536); + if (total_bytes_ > 0) { + buffers_.push_back(gpu_manager->malloc(total_bytes_, 65536)); + is_buffer_allocated_ = true; + } + + buffer_id_to_addr_ = + init_buffer_addrs(buffers_.back()->ref(), buffer_id_to_offset_); + + codegen_ = std::make_shared(plan_json_, buffer_id_to_offset_, + external_args_, + buffer_id_to_name_, name_); + flag_ = gpu_manager->malloc_host( sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); @@ -282,6 +300,8 @@ void Executor::Impl::init(const PlanJson &plan_json) { std::string kernel_name; if (loop_mode_) { + // should we add an identifier to specify which plan the kernel executes + // i.e. ark_loop_kernel_2 for the second plan kernel_name = "ark_loop_kernel"; } else { kernel_name = "ark_kernel"; @@ -304,6 +324,21 @@ void Executor::Impl::init_communicator() { comm_ = std::make_shared(bootstrap); } +std::map Executor::Impl::init_buffer_addrs( + void *buffer_base, const std::map &buffer_id_to_offset) { + std::map buffer_id_to_addr; + // Reuse existing buffer addresses for new plans that use previous tensors + // from earlier plans + if (!buffer_id_to_addr_.empty()) { + buffer_id_to_addr = buffer_id_to_addr_; + } + for (const auto &kv : buffer_id_to_offset) { + buffer_id_to_addr[kv.first] = + static_cast(buffer_base) + kv.second; + } + return buffer_id_to_addr; +} + std::map Executor::Impl::init_buffers(const Json &plan_json) { class BufferInfo { public: @@ -402,6 +437,23 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { ERR(InvalidUsageError, "PyTorch tensor and model execution are on different GPUs"); } + external_buffers_.push_back(buf_info->buffer->external_data()); + const auto [it, inserted] = buffer_id_to_name_.try_emplace( + buf_info->buffer->id(), + "extern_buf_" + std::to_string(buf_info->buffer->id())); + external_args_.push_back(it->second); + continue; + } + // if we are adding a plan and come across a buffer from a previous + // plan, we utilize the buffer offset from the previous plan + if (buffer_id_to_offset_.find(buf_info->buffer->id()) != + buffer_id_to_offset_.end()) { + external_buffers_.push_back( + buffer_id_to_addr_[buf_info->buffer->id()]); + const std::string name = + "extern_buf_" + std::to_string(buf_info->buffer->id()); + external_args_.push_back(name); + buffer_id_to_name_[buf_info->buffer->id()] = name; continue; } else { buffer_id_to_offset[buf_info->buffer->id()] = offset; @@ -552,7 +604,9 @@ std::set Executor::Impl::init_remote_ranks(const Json &plan_json) const { } void Executor::Impl::init_channels(const std::set &remote_ranks) { - proxy_service_ = std::make_shared(); + if (!proxy_service_) { + proxy_service_ = std::make_shared(); + } int num_ranks_per_node = get_env().num_ranks_per_host; auto rank_to_node = [&](int rank) { return rank / num_ranks_per_node; }; @@ -569,8 +623,8 @@ void Executor::Impl::init_channels(const std::set &remote_ranks) { if (!get_env().disable_ib) { all_transports |= IBs[device_id_]; } - mscclpp::RegisteredMemory regmem = - 
comm_->registerMemory(buffer_->ref(), buffer_->bytes(), all_transports); + mscclpp::RegisteredMemory regmem = comm_->registerMemory( + buffers_.back()->ref(), buffers_.back()->bytes(), all_transports); std::map>>> @@ -644,6 +698,15 @@ void Executor::Impl::init_channels(const std::set &remote_ranks) { } } +void Executor::Impl::add_plan(const std::string &plan) { + external_buffers_.clear(); + external_args_.clear(); + buffer_id_to_name_.clear(); + total_bytes_ = 0; + is_buffer_allocated_ = false; + init(Json::parse(plan)); +} + void Executor::Impl::compile() { kernel_->compile(); } void Executor::Impl::launch() { @@ -708,9 +771,12 @@ void Executor::Impl::launch() { if (loop_mode_) { // Initialize loop flags. atomicStoreRelaxed(flag_->ref(), 0); - void *buf_ptr = buffer_->ref(); void *flag_ptr = flag_->ref(); + void *buf_ptr = buffers_.back()->ref(); std::vector args = {&buf_ptr, &flag_ptr}; + for (auto &buffer : external_buffers_) { + args.push_back(&buffer); + } kernel_->launch(stream_raw_, args); } is_recording_ = true; @@ -724,9 +790,12 @@ void Executor::Impl::run(int iter) { } atomicStoreRelaxed(flag_->ref(), iter); } else { - void *buf_ptr = buffer_->ref(); + void *buf_ptr = buffers_.back()->ref(); int i = 0; std::vector args = {&buf_ptr, reinterpret_cast(&i)}; + for (auto &buffer : external_buffers_) { + args.push_back(&buffer); + } for (; i < iter; i++) { kernel_->launch(stream_raw_, args); } @@ -793,13 +862,12 @@ void Executor::Impl::barrier() { } } -uintptr_t Executor::Impl::tensor_address(const Tensor &tensor) const { +void *Executor::Impl::tensor_address(const Tensor &tensor) const { size_t buffer_id = tensor.ref()->buffer()->id(); - if (buffer_id_to_offset_.find(buffer_id) == buffer_id_to_offset_.end()) { + if (buffer_id_to_addr_.find(buffer_id) == buffer_id_to_addr_.end()) { ERR(InternalError, "Invalid buffer ID: ", buffer_id); } - size_t offset = buffer_id_to_offset_.at(buffer_id); - return reinterpret_cast(buffer_->ref(offset)); + return buffer_id_to_addr_.at(buffer_id); } void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes, @@ -830,7 +898,7 @@ void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes, ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost; - void *src = reinterpret_cast(tensor_address(tensor)); + void *src = tensor_address(tensor); if (tensor.strides() == tensor.shape()) { GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_raw)); } else { @@ -888,7 +956,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data, size_t tensor_bytes = tensor.strides().nelems() * tensor.data_type().bytes(); auto kind = (is_d2d) ? 
gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; - void *dst = reinterpret_cast(tensor_address(tensor)); + void *dst = tensor_address(tensor); if (tensor.strides() == tensor.shape()) { GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_raw)); } else { @@ -940,6 +1008,8 @@ std::shared_ptr Executor::buffer() const { return impl_->buffer(); } std::string Executor::plan() const { return impl_->plan(); } +void Executor::add_plan(const std::string &plan) { impl_->add_plan(plan); } + void Executor::compile() { impl_->compile(); } void Executor::launch() { impl_->launch(); } @@ -961,7 +1031,7 @@ void Executor::destroy() { bool Executor::destroyed() const { return impl_.get() == nullptr; } -uintptr_t Executor::tensor_address(const Tensor &tensor) const { +void *Executor::tensor_address(const Tensor &tensor) const { return impl_->tensor_address(tensor); } diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp index 4fb60aef6..084ce6383 100644 --- a/ark/api/tensor.cpp +++ b/ark/api/tensor.cpp @@ -10,15 +10,14 @@ namespace ark { Tensor::Tensor(void* data_ptr, int32_t device_id, - const std::vector& shape, - const DataType& dtype) { + const std::vector& shape, const DataType& dtype) { size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * dtype.bytes(); auto buffer = std::make_shared(data_ptr, external_data_size, device_id); - auto tensor = std::make_shared(dtype.ref(), buffer, Dims(shape), - Dims(shape), Dims(), Dims()); + auto tensor = std::make_shared( + dtype.ref(), buffer, Dims(shape), Dims(shape), Dims(), Dims()); ref_ = tensor; } diff --git a/ark/codegen.cpp b/ark/codegen.cpp index 1619b863f..2bd36d679 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -10,10 +10,10 @@ #include "file_io.h" #include "logging.hpp" #include "model/model_buffer.hpp" -#include "model_buffer_manager.hpp" #include "model/model_data_type.hpp" #include "model/model_op.hpp" #include "model/model_tensor.hpp" +#include "model_buffer_manager.hpp" #include "range.hpp" #include "utils/utils_math.hpp" @@ -25,7 +25,18 @@ static std::string replace( size_t pos = 0; while ((pos = result.find(kv.first, pos)) != std::string::npos) { result.replace(pos, kv.first.length(), kv.second); - pos += kv.second.length(); + if ((kv.first == "@GLOBAL_ARGS@" || kv.first == "@FUNCTION_ARGS@" || + kv.first == "@ARG_TYPES@") && + kv.second.empty()) { + size_t comma_pos = pos; + if (comma_pos >= 2 && result.substr(comma_pos - 2, 2) == ", ") { + result.erase(comma_pos - 2, 2); + pos -= 2; + } + + } else { + pos += kv.second.length(); + } } } return result; @@ -44,6 +55,8 @@ class CodeGenerator::Impl { public: Impl(const PlanJson &plan, const std::map &buffer_id_to_offset, + const std::vector &external_args, + const std::map &buffer_id_to_name, const std::string &name); ~Impl() = default; @@ -69,6 +82,8 @@ class CodeGenerator::Impl { friend class CodeGenerator; std::map buffer_id_to_offset_; + std::vector external_args_; + std::map buffer_id_to_name_; std::string name_; int rank_; int world_size_; @@ -77,10 +92,15 @@ class CodeGenerator::Impl { std::string code_; }; -CodeGenerator::Impl::Impl(const PlanJson &plan, - const std::map &buffer_id_to_offset, - const std::string &name) - : buffer_id_to_offset_(buffer_id_to_offset), name_(name) { +CodeGenerator::Impl::Impl( + const PlanJson &plan, const std::map &buffer_id_to_offset, + const std::vector &external_args, + const std::map &buffer_id_to_name, + const std::string &name) + : buffer_id_to_offset_(buffer_id_to_offset), + 
external_args_(external_args), + buffer_id_to_name_(buffer_id_to_name), + name_(name) { rank_ = plan.at("Rank"); world_size_ = plan.at("WorldSize"); num_procs_ = plan.at("NumProcessors"); @@ -169,6 +189,30 @@ CodeGenerator::Impl::Impl(const PlanJson &plan, if (!is_file(template_path)) { ERR(InternalError, "kernel template file not found: ", template_path); } + + // Generate the global arguments + std::stringstream global_args_ss, function_args_ss, arg_types_ss; + for (const auto &arg : external_args_) { + global_args_ss << "void *" << arg << ", "; + function_args_ss << arg << ", "; + arg_types_ss << "void *, "; + } + std::string global_args = global_args_ss.str(); + std::string function_args = function_args_ss.str(); + std::string arg_types = arg_types_ss.str(); + if (!global_args.empty()) { + global_args.pop_back(); + global_args.pop_back(); + } + if (!function_args.empty()) { + function_args.pop_back(); + function_args.pop_back(); + } + if (!arg_types.empty()) { + arg_types.pop_back(); + arg_types.pop_back(); + } + std::string template_code = read_file(template_path); std::map replacements = { {"@NUM_BLOCKS@", std::to_string(num_procs_)}, @@ -176,6 +220,9 @@ CodeGenerator::Impl::Impl(const PlanJson &plan, {"@DEFINITIONS@", definitions_ss.str()}, {"@BODY@", body_ss.str()}, {"@NAME@", (name_.empty() ? "" : "_" + name_)}, + {"@GLOBAL_ARGS@", global_args}, + {"@FUNCTION_ARGS@", function_args}, + {"@ARG_TYPES@", arg_types}, }; code_ = replace(template_code, replacements); } @@ -215,7 +262,7 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) { ss << this->def_op(op_json, task_json["Id"], op_idx++); } ss << "__device__ void t" << task_json["Id"] - << "(char* _buf, int _idx, int _spw) {\n"; + << "(char *_buf, int _idx, int _spw, @GLOBAL_ARGS@) {\n"; op_idx = 0; for (auto &op_json : task_json["Ops"]) { auto op = ModelOp::deserialize(op_json); @@ -225,25 +272,32 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) { auto &arg = impl_args[i]; if (arg.type_name() == "TENSOR") { auto tns = arg.value(); - if (tns->buffer()->is_external()) { - void *buf_addr = - ModelBufferManager::get_instance().get_buffer( - tns->buffer()->id()); - ss << "(" << tns->data_type()->type_str() << "*)" - << buf_addr; - } else { - size_t buffer_offset = - buffer_id_to_offset_.at(tns->buffer()->id()); + size_t buffer_id = tns->buffer()->id(); + if (buffer_id_to_name_.find(buffer_id) == + buffer_id_to_name_.end()) { + size_t buffer_offset = buffer_id_to_offset_.at(buffer_id); size_t offset = buffer_offset + ModelOffset(tns).value(); ss << "(" << tns->data_type()->type_str() << "*)&_buf[" << offset << "]"; + } else { + ss << "(" << tns->data_type()->type_str() << "*)" + << buffer_id_to_name_.at(buffer_id); } } else if (arg.type_name() == "OFFSET") { auto moff = arg.value(); - size_t buffer_offset = - buffer_id_to_offset_.at(moff.buffer_id()); - size_t offset = buffer_offset + moff.value(); - ss << offset; + size_t buffer_id = moff.buffer_id(); + if (buffer_id_to_name_.find(buffer_id) == + buffer_id_to_name_.end()) { + size_t buffer_offset = buffer_id_to_offset_.at(buffer_id); + size_t offset = buffer_offset + moff.value(); + ss << offset; + } else { + const std::string &buffer_name = + buffer_id_to_name_.at(buffer_id); + size_t offset = moff.value(); + ss << "(uint64_t)((char*)" << buffer_name << " + " << offset + << ")"; + } } else { ss << arg.serialize().begin().value(); } @@ -274,7 +328,7 @@ std::string CodeGenerator::Impl::task_seq( ss << "task_seq<" << proc_b << ", " << proc_e << ", " << 
proc_s << ", " << proc_cur << ", " << task_b << ", " << task_e << ", " << task_s << ", " << task_gran << ", " << num_slots << ", " << slot_num_warps << ", " - << slot_sram_bytes << ", t" << task_id << ">(_buf);\n"; + << slot_sram_bytes << ", t" << task_id << ">(_buf, @FUNCTION_ARGS@);\n"; return ss.str(); } @@ -444,8 +498,11 @@ std::string CodeGenerator::Impl::sync_process_range(const Range &range, CodeGenerator::CodeGenerator( const PlanJson &plan, const std::map &buffer_id_to_offset, + const std::vector &external_args, + const std::map &buffer_id_to_name, const std::string &name) - : impl_(std::make_shared(plan, buffer_id_to_offset, name)) {} + : impl_(std::make_shared(plan, buffer_id_to_offset, external_args, + buffer_id_to_name, name)) {} std::string CodeGenerator::code() const { return impl_->code_; } diff --git a/ark/codegen.hpp b/ark/codegen.hpp index 1ed8ec9f2..8a4eed270 100644 --- a/ark/codegen.hpp +++ b/ark/codegen.hpp @@ -8,8 +8,8 @@ #include #include -#include "model_buffer_manager.hpp" #include "model/model_json.hpp" +#include "model_buffer_manager.hpp" namespace ark { @@ -17,6 +17,8 @@ class CodeGenerator { public: CodeGenerator(const PlanJson &plan, const std::map &buffer_id_to_offset, + const std::vector &external_args, + const std::map &buffer_id_to_name, const std::string &name = "ark_kernel"); ~CodeGenerator() = default; diff --git a/ark/include/ark/error.hpp b/ark/include/ark/error.hpp index 965b1c0bc..1fbec0c01 100644 --- a/ark/include/ark/error.hpp +++ b/ark/include/ark/error.hpp @@ -44,4 +44,4 @@ REGISTER_ERROR_TYPE(UnitTestError) } // namespace ark -#endif // ARK_ERROR_HPP +#endif // ARK_ERROR_HPP \ No newline at end of file diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 02a67cd26..d44ac2302 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -39,6 +39,9 @@ class Executor { /// Return the plan string. std::string plan() const; + /// Add a plan to the executor. + void add_plan(const std::string &plan); + /// Compile the model. This must be called before `launch()`. void compile(); @@ -67,7 +70,7 @@ class Executor { bool destroyed() const; /// Return the raw virtual address of the tensor. 
- uintptr_t tensor_address(const Tensor &tensor) const; + void *tensor_address(const Tensor &tensor) const; template void tensor_read(const Tensor &tensor, std::vector &data, diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in index a8a56f141..a05e143d3 100644 --- a/ark/include/kernels/kernel_template.in +++ b/ark/include/kernels/kernel_template.in @@ -6,8 +6,8 @@ using namespace ark; template -__forceinline__ __device__ void task_seq(char *_buf) { + void (*task)(char*, int, int, @ARG_TYPES@)> +__forceinline__ __device__ void task_seq(char *_buf, @GLOBAL_ARGS@) { if (math::geq(blockIdx.x) && math::le(blockIdx.x) && ((blockIdx.x - ProcBegin) % ProcStep == 0)) { constexpr size_t SlotNumThreads = SlotNumWarps * Arch::ThreadsPerWarp; @@ -23,7 +23,7 @@ __forceinline__ __device__ void task_seq(char *_buf) { size_t task_id = task_id_base + TaskStep * (t % TaskGranularity + t / TaskGranularity * TaskGranularity * NumProcs); if (task_id >= TaskEnd) break; - task(_buf, task_id, SramBytesPerWarp); + task(_buf, task_id, SramBytesPerWarp, @FUNCTION_ARGS@); } } } @@ -33,12 +33,12 @@ __device__ sync::State ARK_LOOP_SYNC_STATE; @DEFINITIONS@ -__device__ void ark_body(char *_buf, int _iter) { +__device__ void ark_body(char *_buf, int _iter, @GLOBAL_ARGS@) { @BODY@ } extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1) -void ark_loop_kernel@NAME@(char *_buf, int *_iter) { +void ark_loop_kernel@NAME@(char *_buf, int *_iter, @GLOBAL_ARGS@) { int *shared_mem = (int *)_ARK_SMEM; for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) { shared_mem[i] = 0; @@ -52,10 +52,10 @@ void ark_loop_kernel@NAME@(char *_buf, int *_iter) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); if (ARK_ITER < 0) return; - ark_body(_buf, 0); + ark_body(_buf, 0, @FUNCTION_ARGS@); for (int _i = 1; _i < ARK_ITER; ++_i) { sync_gpu<@NUM_BLOCKS@>(ARK_LOOP_SYNC_STATE); - ark_body(_buf, _i); + ark_body(_buf, _i, @FUNCTION_ARGS@); } if (threadIdx.x == 0) { __threadfence_system(); @@ -69,10 +69,10 @@ void ark_loop_kernel@NAME@(char *_buf, int *_iter) { } extern "C" __global__ __launch_bounds__(ARK_WARPS_PER_BLOCK * Arch::ThreadsPerWarp, 1) -void ark_kernel@NAME@(char *_buf, int _iter) { +void ark_kernel@NAME@(char *_buf, int _iter, @GLOBAL_ARGS@) { int *shared_mem = (int *)_ARK_SMEM; for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) { shared_mem[i] = 0; } - ark_body(_buf, _iter); + ark_body(_buf, _iter, @FUNCTION_ARGS@); } diff --git a/docs/env.md b/docs/env.md index 2d5839c3b..95330a032 100644 --- a/docs/env.md +++ b/docs/env.md @@ -27,3 +27,7 @@ - `ARK_DISABLE_IB` (Default: `0`; Options: `0`, `1`) If set to `1`, disable ibverbs networking (i.e., disable multi-node execution). + +- `ARK_IGNORE_BINARY_CACHE` (Default: `1`; Options: `0`, `1`) + + If set to `1`, ignore the binary cache and force ARK to recompile binaries on each run. diff --git a/examples/tutorial/model_test_tutorial.py b/examples/tutorial/model_test_tutorial.py new file mode 100644 index 000000000..ac5a7b2a9 --- /dev/null +++ b/examples/tutorial/model_test_tutorial.py @@ -0,0 +1,163 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import ark +import torch +import torch.optim as optim + + +# Set random seed for reproducibility. +torch.manual_seed(42) + +# Let's first define a linear layer using ARK. 
+class ARKLinear(ark.Module): + def __init__(self, weight): + super().__init__() + self.weight = weight + + def forward(self, input): + self.saved_input = input + output = ark.matmul(input, self.weight, transpose_other=True) + return output + + def backward(self, grad_output): + grad_weight = ark.matmul( + grad_output, self.saved_input, transpose_input=True + ) + grad_input = ark.matmul(grad_output, self.weight, transpose_other=False) + self.weight.update_gradient(grad_weight) + return grad_input, grad_weight + + +# Let's use our previous module to define a double linear layer. +class MyARKModule(ark.Module): + def __init__(self, weight0, weight1): + super().__init__() + self.linear1 = ARKLinear(weight0) + self.linear2 = ARKLinear(weight1) + + def forward(self, x): + x = self.linear1.forward(x) + x = self.linear2.forward(x) + return x + + def backward(self, grad_output): + grad_x, grad_weight2 = self.linear2.backward(grad_output) + grad_x, grad_weight1 = self.linear1.backward(grad_x) + return grad_x, grad_weight1, grad_weight2 + + +# Define a PyTorch model. +class SimpleModel(torch.nn.Module): + def __init__(self): + super().__init__() + self.layers = torch.nn.Sequential( + torch.nn.Linear(256, 256, bias=False), # Layer 0 + torch.nn.Linear(256, 256, bias=False), # Layer 1 + torch.nn.Linear(256, 256, bias=False), # Layer 2 + torch.nn.Linear(256, 256, bias=False), # Layer 3 + torch.nn.Linear(256, 256, bias=False), # Layer 4 + torch.nn.ReLU(), # Activation + ) + + def forward(self, x): + return self.layers(x) + + +# Function to compare the gradients of two models of the same architecture and parameter order. +def compare_grad(ark_model, torch_model, atol=1e-4, rtol=1e-2): + ark_params = list(ark_model.named_parameters()) + torch_params = list(torch_model.named_parameters()) + for (ark_name, ark_param), (torch_name, torch_param) in zip( + ark_params, torch_params + ): + if (ark_param.grad is None) ^ (torch_param.grad is None): + print("Exactly one of the gradients is None") + else: + grads_equal = torch.allclose( + ark_param.grad, torch_param.grad, atol=atol, rtol=rtol + ) + if not grads_equal: + print( + f"Gradient for {ark_name} when compared to {torch_name} is different:" + ) + print(f"ARK gradient: {ark_param.grad}") + print(f"Torch gradient: {torch_param.grad}") + + +# For our ARK model we will replace the first two layers with ARK layers. +def replace_layers_with_ark(model): + weight_0 = torch.nn.Parameter( + model.layers[0].weight.to("cuda:0").requires_grad_(True) + ) + weight_1 = torch.nn.Parameter( + model.layers[1].weight.to("cuda:0").requires_grad_(True) + ) + ark_module = ark.RuntimeModule(MyARKModule(weight_0, weight_1)) + model.layers[0] = ark_module + del model.layers[1] + + # Since we replaced the PyTorch layer with an ARK layer, we need to register the PyTorch parameters + # our ARK module utilizes with the original PyTorch model so ARK can leverage PyTorch's optimizers. + model.register_parameter("weight_0", weight_0) + model.register_parameter("weight_1", weight_1) + + return model + + +# Instantiate our models. +pytorch_model = SimpleModel() +ark_model = SimpleModel() + + +# Ensure both models have the same weights. +ark_model.load_state_dict(pytorch_model.state_dict()) +ark_model = replace_layers_with_ark(ark_model) + + +# Move both models to GPU. +pytorch_model.to("cuda:0") +ark_model.to("cuda:0") + +# Now let's run the models on some random input. 
+input_torch = torch.randn(128, 256).to("cuda:0").requires_grad_(True) +input_ark = input_torch.clone().detach().requires_grad_(True) + + +# Define an arbitrary target. +target = torch.randn(128, 256).to("cuda:0") + +loss_fn = torch.nn.MSELoss() +optim_torch = optim.SGD(pytorch_model.parameters(), lr=0.01) +optim_ark = optim.SGD(ark_model.parameters(), lr=0.01) + +num_iters = 5 +for iter in range(num_iters): + print(f"Iteration {iter+1}/{num_iters}") + + optim_torch.zero_grad() + optim_ark.zero_grad() + + pytorch_output = pytorch_model(input_torch) + ark_output = ark_model(input_ark) + + assert torch.allclose(pytorch_output, ark_output, atol=1e-4, rtol=1e-2) + + # Compute losses. + torch_loss = loss_fn(pytorch_output, target) + ark_loss = loss_fn(ark_output, target) + + # See how ARK's loss compares to PyTorch's loss. + print(f"\nPyTorch loss: {torch_loss.item()}") + print(f"\nARK loss: {ark_loss.item()}\n") + assert torch.allclose(torch_loss, ark_loss, atol=1e-4, rtol=1e-2) + + # Perform a backward pass. + torch_loss.backward() + ark_loss.backward() + + optim_torch.step() + optim_ark.step() + + # Ensure gradients of both models are updated accordingly. + compare_grad(ark_model, pytorch_model) diff --git a/examples/tutorial/torch_tutorial.py b/examples/tutorial/torch_tutorial.py deleted file mode 100644 index e9482a7cc..000000000 --- a/examples/tutorial/torch_tutorial.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import ark -import torch - - -class ArkAddModule(ark.RuntimeModule): - def build_forward(self, x: ark.Tensor, y: ark.Tensor) -> ark.Tensor: - return ark.add(x, y) - - -# ARK module for addition -module = ArkAddModule() - -# Define two torch arrays -x = torch.ones(64) * 2 -y = torch.ones(64) * 3 - -# Run the ARK module -z = module(x, y) - -w = module(x, z) - -# Print the result -print(z) # 5 -print(w) # 7 diff --git a/python/ark/init.py b/python/ark/init.py index a4a67e85d..29627d645 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -6,10 +6,10 @@ from .runtime import _RuntimeState -def init(): +def init(keep_runtime: bool = False): """Initializes ARK.""" Model.reset() - if _RuntimeState.runtime is not None: + if not keep_runtime and _RuntimeState.runtime is not None: del _RuntimeState.runtime _RuntimeState.runtime = None _ark_core.init() diff --git a/python/ark/module.py b/python/ark/module.py index d797da72c..0fdea23b6 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -3,9 +3,10 @@ import logging import numpy as np -from typing import Any, Dict, List, Union +from typing import Any, Dict, Union from .tensor import Tensor, Parameter from .runtime import Runtime, Planner +from .init import init from .ops import tensor from .data_type import DataType @@ -25,6 +26,7 @@ class Module: """ def __init__(self): + super().__init__() # The submodules of the module. self.sub_modules: dict[str, "Module"] = dict() # The parameters of the module. @@ -34,12 +36,16 @@ def __setattr__(self, __name: str, __value: Any) -> None: """ When setting an attribute, if the attribute is a Module, add it to the sub_modules. If the attribute is a Tensor and this Tensor is a - parameter, add it to the parameters. + parameter, add it to the parameters. If the attribute is a + torch.nn.Parameter, convert it to an ARK Parameter before adding. 
""" if isinstance(__value, Module): self.register_module(__name, __value) elif isinstance(__value, Parameter): self.register_parameter(__name, __value) + elif not _no_torch and isinstance(__value, torch.nn.Parameter): + __value = Parameter(__value) + self.register_parameter(__name, __value) super().__setattr__(__name, __value) def __call__(self, *args: Any, **kwargs: Any): @@ -131,63 +137,81 @@ def _recursive_ark_to_torch(object): return object -class RuntimeModule(Module): - def __init__(self): - if _no_torch: - raise ImportError("torch is not available") - super().__init__() - self.built_forward = False - self.built_backward = False - self.forward_input_tensor_args: List[Tensor] = [] - self.forward_input_tensor_kwargs: Dict[str, Tensor] = {} - self.forward_input_args = [] - self.forward_input_kwargs = {} - self.forward_output = None - self.backward_tensor_args = [] - self.backward_tensor_kwargs = {} - - def build_forward(self, *args: Any, **kwargs: Any) -> Any: ... - - def build_backward(self, *args: Any, **kwargs: Any) -> Any: ... - - def forward(self, *args: Any, **kwargs: Any) -> Any: - if not self.built_forward: - for arg in args: - if isinstance(arg, torch.Tensor): - self.forward_input_tensor_args.append( - tensor( - list(arg.shape), - DataType.from_torch(arg.dtype), - ) - ) - self.forward_input_args.append( - self.forward_input_tensor_args[-1] - ) - else: - self.forward_input_args.append(arg) - for key, value in kwargs.items(): - if isinstance(value, torch.Tensor): - self.forward_input_tensor_kwargs[key] = tensor( - list(value.shape), - DataType.from_torch(value.dtype), - ) - self.forward_input_kwargs[key] = ( - self.forward_input_tensor_kwargs[key] - ) - else: - self.forward_input_kwargs[key] = value - self.forward_output = self.build_forward( - *self.forward_input_args, - **self.forward_input_kwargs, - ) - self.built_forward = True +class _ARKFunction(torch.autograd.Function): + """ + Facilitates the integration of ARK modules with PyTorch's + autograd system by defining custom forward and backward passes that + utilize the user's defined ARK module. + """ - with Runtime.get_runtime() as rt: - rt.launch(plan=Planner().plan()) - for tns, arg in zip(self.forward_input_tensor_args, args): - tns.copy(arg) - for key, value in self.forward_input_tensor_kwargs.items(): - value.copy(kwargs[key]) + @staticmethod + def forward(ctx, ark_module, *args, **kwargs): + """ + Returns a PyTorch tensor that is the result + of the forward pass of the ARK module. + """ + init(keep_runtime=True) + ctx.ark_module = ark_module + input_args, input_kwargs = [], {} + input_requires_grad = 0 + for arg in args: + if isinstance(arg, torch.Tensor): + input_args.append(Tensor.from_torch(arg)) + if arg.requires_grad: + input_requires_grad += 1 + else: + input_args.append(arg) + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + input_kwargs[k] = Tensor.from_torch(v) + if v.requires_grad: + input_requires_grad += 1 + else: + input_kwargs[k] = v + ctx.num_inp_grad = input_requires_grad + output = ark_module.forward(*input_args, **input_kwargs) + rt = Runtime.get_runtime() + rt.launch() + rt.run() + output = output.get_torch_view() + rt.reset(persist=True) + return output + + @staticmethod + def backward(ctx, *grad_outputs): + """ + Converts the gradient outputs to ARK format, computes the gradients for the input + and parameters using the ARK module backwards pass, and updates the gradients of the corresponding + PyTorch parameters. 
+ """ + init(keep_runtime=True) + ark_grad_outputs = [Tensor.from_torch(grad) for grad in grad_outputs] + grads = ctx.ark_module.backward(*ark_grad_outputs) + grad_inputs, grad_weights = ( + grads[:ctx.num_inp_grad], + grads[ctx.num_inp_grad:], + ) + params_dict = ctx.ark_module.params_dict() + rt = Runtime.get_runtime() + rt.launch() + rt.run() + grad_inputs = [grad.get_torch_view() for grad in grad_inputs] + for _, param in params_dict.items(): + if param.staged_tensor is not None: + pytorch_grad = param.staged_tensor.get_torch_view() + param.torch_param.grad = pytorch_grad + rt.reset(persist=True) + return (None, *grad_inputs) + + +class RuntimeModule(torch.nn.Module): + """ + Wraps an ARK module to be used as a PyTorch autograd function. + """ + + def __init__(self, ark_module): + super().__init__() + self.ark_module = ark_module - rt.run() - return _recursive_ark_to_torch(self.forward_output) + def forward(self, *args, **kwargs): + return _ARKFunction.apply(self.ark_module, *args, **kwargs) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 712addc29..071eedd04 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -77,6 +77,14 @@ def running(self) -> bool: """ return self.state == Runtime.State.Running + def add_plan(self, plan: Plan): + """ + Add a plan to the executor. + """ + if self.executor is None: + raise RuntimeError("Executor is not initialized") + self.executor.add_plan(str(plan)) + def launch( self, plan: Plan = None, @@ -89,10 +97,15 @@ def launch( the CUDA kernels. The GPU context and the connection between GPUs will be initialized. The executor will compile the cuda kernels and launch the ARK runtime. """ + plan = Planner(device_id).plan() if plan is None else plan if self.launched(): - logging.warning(f"Runtime is already launched, skip launching") + # If the Runtime state is already launched and we are adding another plan + # to the executor, we compile the new kernel and launch the executor again. + self.executor.add_plan(str(plan)) + self.executor.compile() + self.executor.launch() return - plan = Planner(device_id).plan() if plan is None else plan + # If the RuntimeState is init, we need to create a new executor and # compile the kernels if self.state == Runtime.State.Init: @@ -156,12 +169,14 @@ def stop(self) -> float: self.state = Runtime.State.LaunchedNotRunning return elapsed - def reset(self, delete=False): + def reset(self, delete=False, persist=False): """ Reset the runtime. If delete is True, delete the runtime. """ if self.launched(): self.stop() + if persist: + return if self.executor is not None: if not self.executor.destroyed(): self.executor.destroy() diff --git a/python/ark/tensor.py b/python/ark/tensor.py index a950c3d1d..ba1af52db 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -33,15 +33,18 @@ def __init__( self, _tensor: _Tensor, initializer: Initializer = None, + requires_grad: bool = False, ): """ Initializes a new instance of the Tensor class. Args: _tensor (_ark_core._Tensor): The underlying _Tensor object. - intializer (Initializer): The initializer for the Tensor. + initializer (Initializer): The initializer for the Tensor. + requires_grad (bool): Whether the tensor requires gradient. Defaults to True. """ self._tensor = _tensor self.initializer: Initializer = initializer + self.requires_grad = requires_grad def shape(self) -> List[int]: """ @@ -171,7 +174,7 @@ def copy( rt = Runtime.get_runtime() if not rt.launched(): raise RuntimeError( - "Tensor is not allocated yet. 
`Tensor.from_numpy()` is " + "Tensor is not allocated yet. `Tensor.copy()` is " "usable only after you call `Runtime.launch()`." ) tensor_bytes = self.nelems() * self.dtype().element_size() @@ -187,6 +190,9 @@ def copy( stream, data.device.type == "cuda", ) + data.requires_grad = self.requires_grad + if isinstance(self, Parameter): + self.torch_param = data elif isinstance(data, np.ndarray): if not data.flags["C_CONTIGUOUS"]: data = np.ascontiguousarray(data) @@ -207,13 +213,50 @@ def initialize(self) -> "Tensor": return self -class Parameter(Tensor): +class Parameter(Tensor, torch.nn.Parameter): """ A tensor as a parameter. """ - - def __init__(self, _tensor: _Tensor): + def __init__( + self, tensor: Union[_Tensor, "torch.nn.Parameter"], + ): """ Initializes a new instance of the Parameter class. """ - super().__init__(_tensor) + if not _no_torch and isinstance(tensor, torch.nn.Parameter): + ark_tensor = Tensor.from_torch(tensor) + core_tensor = ark_tensor._tensor + self.torch_param = tensor + self.staged_tensor = None + Tensor.__init__( + self, + core_tensor, + requires_grad=tensor.requires_grad, + ) + elif isinstance(tensor, _Tensor): + core_tensor = tensor + self.torch_param = None + self.staged_tensor = None + Tensor.__init__( + self, core_tensor, requires_grad=False + ) + else: + raise TypeError( + "tensor must be an ARK tensor or a torch.nn.Parameter" + ) + + def update_gradient(self, ark_tensor: Tensor): + """ + Stages an ARK tensor to be used for updating the gradient of its associated parameter. + """ + if _no_torch: + raise ImportError("torch is not available") + if self.torch_param is None: + raise ValueError( + "there is no PyTorch parameter associated with this ARK parameter" + ) + if not self.torch_param.requires_grad: + raise ValueError("parameter does not require gradient updates") + if ark_tensor is None or not isinstance(ark_tensor, Tensor): + raise ValueError("cannot use non-ARK tensor to update ARK gradient") + self.staged_tensor = ark_tensor diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py index 68333e431..d0894a621 100644 --- a/python/ark/torch_mock.py +++ b/python/ark/torch_mock.py @@ -27,3 +27,23 @@ class ubyte: ... class Tensor: ... + + + +class nn: + + + class Module: ... + + + class Parameter: ... + + +class autograd: + + + class Function: + + + def apply(self, *args, **kwargs): ... + diff --git a/python/executor_py.cpp b/python/executor_py.cpp index f42e59ee9..4b67b48a0 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -202,5 +202,6 @@ void register_executor(py::module &m) { size_t, uintptr_t, bool>(&tensor_write), py::arg("tensor"), py::arg("address"), py::arg("bytes"), py::arg("stream"), py::arg("is_d2d")) - .def("tensor_to_dlpack", &tensor_to_dlpack); + .def("tensor_to_dlpack", &tensor_to_dlpack) + .def("add_plan", &ark::Executor::add_plan, py::arg("plan")); } diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index c3d15d1b9..b4fa838a1 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. 
import ark +import numpy as np empty_plan = ark.Plan(None) @@ -20,99 +21,62 @@ def test_runtime_relaunch(): assert rt.launched() == True -# def test_multiple_runtime_launch(): -# ark.init() -# num_runtimes = 5 -# for i in range(num_runtimes): -# rt = ark.Runtime.get_runtime(i) -# assert rt.launched() == False -# rt.launch(plan=empty_plan, device_id=i) -# assert rt.launched() == True -# for i in range(num_runtimes): -# rt = ark.Runtime.get_runtime(i) -# assert rt.launched() == True -# ark.Runtime.delete_all_runtimes() - - -# def test_stop_runtime(): -# ark.init() -# rt1 = ark.Runtime.get_runtime(1) -# rt1.launch(plan=empty_plan, device_id=1) -# rt2 = ark.Runtime.get_runtime(2) -# rt2.launch(plan=empty_plan, device_id=2) -# rt1.stop() -# rt1.reset() -# assert rt1.state == ark.Runtime.State.Init -# assert rt2.state == ark.Runtime.State.LaunchedNotRunning -# ark.Runtime.delete_all_runtimes() - - -# def test_reset_runtime(): -# ark.init() -# rt1 = ark.Runtime.get_runtime(0) -# rt1.launch(plan=empty_plan, device_id=1) -# rt2 = ark.Runtime.get_runtime(1) -# rt2.launch(plan=empty_plan, device_id=2) -# rt1.reset() -# assert rt1.launched() == False -# assert rt2.launched() == True -# rt1.launch(plan=empty_plan) -# assert rt1.launched() == True -# ark.Runtime.delete_all_runtimes() - - -# def test_multiple_runtimes_complex(): -# ark.init() -# num_runtimes = 3 -# runtime_list = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] -# default_runtime = ark.Runtime.get_runtime() -# runtime_list.append(default_runtime) -# for i, rt in enumerate(runtime_list): -# rt.launch(plan=empty_plan, device_id=i) -# assert rt.launched() == True -# runtime_list[0].stop() -# assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning -# for rt in runtime_list[1:]: -# assert rt.launched() == True -# runtime_list[1].reset() -# assert runtime_list[1].state == ark.Runtime.State.Init -# assert runtime_list[0].state == ark.Runtime.State.LaunchedNotRunning -# assert runtime_list[2].state == ark.Runtime.State.LaunchedNotRunning -# runtime_list[1].launch(plan=empty_plan, device_id=1) -# for rt in runtime_list: -# assert rt.launched() == True -# ark.Runtime.delete_all_runtimes() - - -# def test_runtime_state_after_reset(): -# ark.init() -# rt = ark.Runtime.get_runtime() -# rt.launch(plan=empty_plan) -# rt.reset() -# assert rt.launched() == False -# assert rt.running() == False -# ark.Runtime.delete_all_runtimes() - - -# def test_see_runtime_statuses(): -# ark.init() -# num_runtimes = 3 -# runtimes = [ark.Runtime.get_runtime(i) for i in range(num_runtimes)] -# runtime_statuses = ark.Runtime.see_runtime_statuses() -# assert len(runtime_statuses) == num_runtimes -# for i in range(num_runtimes): -# assert i in runtime_statuses -# for i, rt in enumerate(runtimes): -# assert runtime_statuses[i] == rt -# ark.Runtime.delete_all_runtimes() - +def test_add_plans(): + ark.init() + M, N = 64, 64 + input_tensor = ark.tensor([M, N], ark.fp16) + other_tensor = ark.tensor([M, N], ark.fp16) + output_tensor = ark.add(input_tensor, other_tensor) + runtime = ark.Runtime() + runtime.launch() + input_tensor_host = np.random.rand(M, N).astype(np.float16) + input_tensor.from_numpy(input_tensor_host) + other_tensor_host = np.random.rand(M, N).astype(np.float16) + other_tensor.from_numpy(other_tensor_host) + runtime.run() + output_tensor_host = output_tensor.to_numpy() + np.testing.assert_allclose( + output_tensor_host, input_tensor_host + other_tensor_host + ) + runtime.reset(persist=True) + ark.init(keep_runtime=True) + prev_output = 
output_tensor + new_tensor = ark.tensor([M, N], ark.fp16) + final_output = ark.add(prev_output, new_tensor) + runtime.launch() + new_tensor_host = np.random.rand(M, N).astype(np.float16) + new_tensor.from_numpy(new_tensor_host) + runtime.run() + final_output_host = final_output.to_numpy() + np.testing.assert_allclose( + final_output_host, output_tensor_host + new_tensor_host + ) + runtime.reset() + +def test_reuse_plans(): + ark.init() + M, N = 64, 64 + input_tensor = ark.tensor([M, N], ark.fp16) + other_tensor = ark.tensor([M, N], ark.fp16) + output_tensor = ark.add(input_tensor, other_tensor) + runtime = ark.Runtime() + runtime.launch() + input_tensor_host = np.random.rand(M, N).astype(np.float16) + input_tensor.from_numpy(input_tensor_host) + other_tensor_host = np.random.rand(M, N).astype(np.float16) + other_tensor.from_numpy(other_tensor_host) + runtime.run() + output_tensor_host = output_tensor.to_numpy() + np.testing.assert_allclose( + output_tensor_host, input_tensor_host + other_tensor_host + ) + runtime.reset(persist=True) + ark.init(keep_runtime=True) + runtime.launch() + runtime.run() + output_tensor_host = output_tensor.to_numpy() + np.testing.assert_allclose( + output_tensor_host, input_tensor_host + other_tensor_host + ) + runtime.reset() -# def test_multiple_runtimes_init(): -# ark.init() -# runtimes = [ark.Runtime.get_runtime(i) for i in range(3)] -# for rt in runtimes: -# assert rt.state == ark.Runtime.State.Init -# ark.init() -# runtimes = ark.Runtime.see_runtime_statuses() -# assert len(runtimes) == 0 -# ark.Runtime.delete_all_runtimes() From 28ce0275d79264bf6e11cd855a49d87c1ee782cf Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 11 Aug 2024 22:42:46 +0000 Subject: [PATCH 061/106] Minor changes --- ark/api/executor_test.cpp | 2 +- examples/tutorial/model_test_tutorial.py | 1 + python/ark/init.py | 4 +-- python/ark/module.py | 41 +++++++++++------------- python/ark/runtime.py | 8 ++--- python/ark/tensor.py | 8 ++--- python/ark/torch_mock.py | 10 ++---- python/executor_py.cpp | 9 ++++-- python/unittest/test_runtime.py | 9 ++---- 9 files changed, 40 insertions(+), 52 deletions(-) diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp index dad0e9d83..75d506ecb 100644 --- a/ark/api/executor_test.cpp +++ b/ark/api/executor_test.cpp @@ -88,7 +88,7 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape, ark::DefaultExecutor executor(m, 0); executor.compile(); executor.launch(); - UNITTEST_GT(executor.tensor_address(tensor), 0); + UNITTEST_NE(executor.tensor_address(tensor), nullptr); // Copy data from CPU array to ARK tensor executor.tensor_write(tensor, host_data.data(), diff --git a/examples/tutorial/model_test_tutorial.py b/examples/tutorial/model_test_tutorial.py index ac5a7b2a9..c83d0d15e 100644 --- a/examples/tutorial/model_test_tutorial.py +++ b/examples/tutorial/model_test_tutorial.py @@ -9,6 +9,7 @@ # Set random seed for reproducibility. torch.manual_seed(42) + # Let's first define a linear layer using ARK. 
class ARKLinear(ark.Module): def __init__(self, weight): diff --git a/python/ark/init.py b/python/ark/init.py index 29627d645..a4a67e85d 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -6,10 +6,10 @@ from .runtime import _RuntimeState -def init(keep_runtime: bool = False): +def init(): """Initializes ARK.""" Model.reset() - if not keep_runtime and _RuntimeState.runtime is not None: + if _RuntimeState.runtime is not None: del _RuntimeState.runtime _RuntimeState.runtime = None _ark_core.init() diff --git a/python/ark/module.py b/python/ark/module.py index 0fdea23b6..0917ea1ed 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -5,10 +5,9 @@ import numpy as np from typing import Any, Dict, Union from .tensor import Tensor, Parameter -from .runtime import Runtime, Planner +from .runtime import Runtime from .init import init -from .ops import tensor -from .data_type import DataType +from .model import Model try: import torch @@ -78,6 +77,7 @@ def load_state_dict( self, state_dict: Dict[str, Union[np.ndarray, torch.Tensor]], prefix: str = "", + stream: int = 0, ): """ Loads a model from a state_dict and copy the parameters to the device GPU. @@ -91,7 +91,7 @@ def load_state_dict( data = state_dict.get(name, None) if data is None: continue - param.copy(data) + param.copy(data, stream=stream) all_keys.remove(name) if all_keys: logging.warning( @@ -99,7 +99,10 @@ def load_state_dict( ) def state_dict( - self, prefix: str = "", mode: str = "numpy" + self, + prefix: str = "", + mode: str = "numpy", + stream: int = 0, ) -> Dict[str, Union[np.ndarray, torch.Tensor]]: """ Copies the parameters from the device GPU to the host and saves the @@ -108,11 +111,13 @@ def state_dict( """ if mode == "numpy": return { - k: v.to_numpy() for k, v in self.params_dict(prefix).items() + k: v.to_numpy(stream=stream) + for k, v in self.params_dict(prefix).items() } elif mode == "torch": return { - k: v.to_torch() for k, v in self.params_dict(prefix).items() + k: v.to_torch(stream=stream) + for k, v in self.params_dict(prefix).items() } raise ValueError(f"Unsupported mode: {mode}") @@ -127,17 +132,7 @@ def initialize(self): module.initialize() -def _recursive_ark_to_torch(object): - if isinstance(object, Tensor): - return object.to_torch() - if isinstance(object, dict): - return {k: _recursive_ark_to_torch(v) for k, v in object.items()} - if isinstance(object, list): - return [_recursive_ark_to_torch(v) for v in object] - return object - - -class _ARKFunction(torch.autograd.Function): +class _Function(torch.autograd.Function): """ Facilitates the integration of ARK modules with PyTorch's autograd system by defining custom forward and backward passes that @@ -150,7 +145,7 @@ def forward(ctx, ark_module, *args, **kwargs): Returns a PyTorch tensor that is the result of the forward pass of the ARK module. """ - init(keep_runtime=True) + Model.reset() ctx.ark_module = ark_module input_args, input_kwargs = [], {} input_requires_grad = 0 @@ -184,12 +179,12 @@ def backward(ctx, *grad_outputs): and parameters using the ARK module backwards pass, and updates the gradients of the corresponding PyTorch parameters. 
""" - init(keep_runtime=True) + Model.reset() ark_grad_outputs = [Tensor.from_torch(grad) for grad in grad_outputs] grads = ctx.ark_module.backward(*ark_grad_outputs) grad_inputs, grad_weights = ( - grads[:ctx.num_inp_grad], - grads[ctx.num_inp_grad:], + grads[: ctx.num_inp_grad], + grads[ctx.num_inp_grad :], ) params_dict = ctx.ark_module.params_dict() rt = Runtime.get_runtime() @@ -214,4 +209,4 @@ def __init__(self, ark_module): self.ark_module = ark_module def forward(self, *args, **kwargs): - return _ARKFunction.apply(self.ark_module, *args, **kwargs) + return _Function.apply(self.ark_module, *args, **kwargs) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 071eedd04..1523905d7 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -3,7 +3,6 @@ import logging from enum import Enum -from typing import Dict, List from _ark_core import _Executor from .planner import Planner, Plan @@ -169,9 +168,9 @@ def stop(self) -> float: self.state = Runtime.State.LaunchedNotRunning return elapsed - def reset(self, delete=False, persist=False): + def reset(self, persist=False): """ - Reset the runtime. If delete is True, delete the runtime. + Reset the runtime. """ if self.launched(): self.stop() @@ -182,6 +181,3 @@ def reset(self, delete=False, persist=False): self.executor.destroy() self.executor = None self.state = Runtime.State.Init - if delete: - del _RuntimeState.runtime - _RuntimeState.runtime = None diff --git a/python/ark/tensor.py b/python/ark/tensor.py index ba1af52db..3fda8b3b3 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -217,8 +217,10 @@ class Parameter(Tensor, torch.nn.Parameter): """ A tensor as a parameter. """ + def __init__( - self, tensor: Union[_Tensor, "torch.nn.Parameter"], + self, + tensor: Union[_Tensor, "torch.nn.Parameter"], ): """ Initializes a new instance of the Parameter class. @@ -237,9 +239,7 @@ def __init__( core_tensor = tensor self.torch_param = None self.staged_tensor = None - Tensor.__init__( - self, core_tensor, requires_grad=False - ) + Tensor.__init__(self, core_tensor, requires_grad=False) else: raise TypeError( "tensor must be an ARK tensor or a torch.nn.Parameter" diff --git a/python/ark/torch_mock.py b/python/ark/torch_mock.py index d0894a621..7a7de0ae6 100644 --- a/python/ark/torch_mock.py +++ b/python/ark/torch_mock.py @@ -29,21 +29,15 @@ class ubyte: ... class Tensor: ... - class nn: - class Module: ... - - class Parameter: ... + class Parameter: ... class autograd: - - class Function: - + class Function: def apply(self, *args, **kwargs): ... 
-
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 4b67b48a0..e10277646 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -182,8 +182,13 @@ void register_executor(py::module &m) {
         .def("barrier", &ark::Executor::barrier)
         .def("destroy", &ark::Executor::destroy)
         .def("destroyed", &ark::Executor::destroyed)
-        .def("tensor_address", &ark::Executor::tensor_address,
-             py::arg("tensor"))
+        .def(
+            "tensor_address",
+            [](ark::Executor *self, const ark::Tensor &tensor) {
+                return reinterpret_cast<uintptr_t>(
+                    self->tensor_address(tensor));
+            },
+            py::arg("tensor"))
         .def("tensor_read",
              py::overload_cast(&tensor_read),
diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py
index b4fa838a1..b368bb93a 100644
--- a/python/unittest/test_runtime.py
+++ b/python/unittest/test_runtime.py
@@ -5,9 +5,6 @@
 import numpy as np
 
-empty_plan = ark.Plan(None)
-
-
 def test_runtime_relaunch():
     ark.init()
     with ark.Runtime.get_runtime() as rt:
@@ -39,7 +36,7 @@ def test_add_plans():
         output_tensor_host, input_tensor_host + other_tensor_host
     )
     runtime.reset(persist=True)
-    ark.init(keep_runtime=True)
+    ark.Model.reset()
     prev_output = output_tensor
     new_tensor = ark.tensor([M, N], ark.fp16)
     final_output = ark.add(prev_output, new_tensor)
@@ -53,6 +50,7 @@ def test_add_plans():
     )
     runtime.reset()
 
+
 def test_reuse_plans():
     ark.init()
     M, N = 64, 64
@@ -71,7 +69,7 @@ def test_reuse_plans():
         output_tensor_host, input_tensor_host + other_tensor_host
     )
     runtime.reset(persist=True)
-    ark.init(keep_runtime=True)
+    ark.Model.reset()
     runtime.launch()
     runtime.run()
     output_tensor_host = output_tensor.to_numpy()
@@ -79,4 +77,3 @@ def test_reuse_plans():
         output_tensor_host, input_tensor_host + other_tensor_host
     )
     runtime.reset()
-

From b8e13b43a28dd018a97ca1205d32f1ef7fe510e7 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Sun, 11 Aug 2024 23:33:46 +0000
Subject: [PATCH 062/106] a few fixes & more verification

---
 ark/api/executor.cpp     | 29 ++++++++++++---------
 ark/model/model_json.cpp | 54 ++++++++++++++++++++++++++++++++++------
 2 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 4634ed6fd..c424271cc 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -154,7 +154,9 @@ class Executor::Impl {
 
     Stream stream() const { return reinterpret_cast<Stream>(stream_raw_); }
 
-    std::shared_ptr<GpuMemory> buffer() const { return buffers_.back(); }
+    std::shared_ptr<GpuMemory> buffer() const {
+        return buffers_.empty() ?
nullptr : buffers_.back(); + } std::string plan() const { return plan_json_.dump_pretty(); } @@ -177,7 +179,8 @@ class Executor::Impl { void init_communicator(); std::map init_buffers(const Json &plan_json); std::map init_buffer_addrs( - void *buffer_base, const std::map &buffer_id_to_offset); + std::shared_ptr buffer, + const std::map &buffer_id_to_offset); std::set init_remote_ranks(const Json &plan_json) const; void init_channels(const std::set &remote_ranks); @@ -275,11 +278,10 @@ void Executor::Impl::init(const PlanJson &plan_json) { if (total_bytes_ > 0) { buffers_.push_back(gpu_manager->malloc(total_bytes_, 65536)); is_buffer_allocated_ = true; + buffer_id_to_addr_ = + init_buffer_addrs(buffers_.back(), buffer_id_to_offset_); } - buffer_id_to_addr_ = - init_buffer_addrs(buffers_.back()->ref(), buffer_id_to_offset_); - codegen_ = std::make_shared(plan_json_, buffer_id_to_offset_, external_args_, buffer_id_to_name_, name_); @@ -293,7 +295,7 @@ void Executor::Impl::init(const PlanJson &plan_json) { size_t smem_block_total = static_cast(gpu_manager->info().smem_block_total); - if (world_size_ > 1) { + if (world_size_ > 1 && total_bytes_ > 0) { auto remote_ranks = init_remote_ranks(plan_json_); init_channels(remote_ranks); } @@ -325,7 +327,8 @@ void Executor::Impl::init_communicator() { } std::map Executor::Impl::init_buffer_addrs( - void *buffer_base, const std::map &buffer_id_to_offset) { + std::shared_ptr buffer, + const std::map &buffer_id_to_offset) { std::map buffer_id_to_addr; // Reuse existing buffer addresses for new plans that use previous tensors // from earlier plans @@ -333,8 +336,7 @@ std::map Executor::Impl::init_buffer_addrs( buffer_id_to_addr = buffer_id_to_addr_; } for (const auto &kv : buffer_id_to_offset) { - buffer_id_to_addr[kv.first] = - static_cast(buffer_base) + kv.second; + buffer_id_to_addr[kv.first] = buffer->ref(kv.second); } return buffer_id_to_addr; } @@ -772,7 +774,7 @@ void Executor::Impl::launch() { // Initialize loop flags. atomicStoreRelaxed(flag_->ref(), 0); void *flag_ptr = flag_->ref(); - void *buf_ptr = buffers_.back()->ref(); + void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref(); std::vector args = {&buf_ptr, &flag_ptr}; for (auto &buffer : external_buffers_) { args.push_back(&buffer); @@ -790,7 +792,7 @@ void Executor::Impl::run(int iter) { } atomicStoreRelaxed(flag_->ref(), iter); } else { - void *buf_ptr = buffers_.back()->ref(); + void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref(); int i = 0; std::vector args = {&buf_ptr, reinterpret_cast(&i)}; for (auto &buffer : external_buffers_) { @@ -865,7 +867,10 @@ void Executor::Impl::barrier() { void *Executor::Impl::tensor_address(const Tensor &tensor) const { size_t buffer_id = tensor.ref()->buffer()->id(); if (buffer_id_to_addr_.find(buffer_id) == buffer_id_to_addr_.end()) { - ERR(InternalError, "Invalid buffer ID: ", buffer_id); + ERR(InvalidUsageError, "Tensor has an unknown buffer ID ", buffer_id, + ". 
This is likely caused by accessing a tensor that is optimized " + "out by the compiler or not used in any plan passed to the " + "executor."); } return buffer_id_to_addr_.at(buffer_id); } diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp index c2099e2c9..dad62cb4e 100644 --- a/ark/model/model_json.cpp +++ b/ark/model/model_json.cpp @@ -5,6 +5,7 @@ #include +#include "ark/dims.hpp" #include "logging.hpp" static std::stringstream &idnt(std::stringstream &ss, int indent) { @@ -26,14 +27,46 @@ static void verify_format_json(const std::string &name, const Json &json, const std::vector &array_fields) { for (const auto &field : required_fields) { if (!json.contains(field)) { - ERR(ErrorType, - name + ": " + field + " not found. Given: " + json.dump()); + ERR(ErrorType, name, ": ", field, + " not found. Given: ", json.dump()); } } for (const auto &field : array_fields) { if (!json.at(field).is_array()) { - ERR(ErrorType, name + ": " + field + - " is not an array. Given: " + json.dump()); + ERR(ErrorType, name, ": ", field, + " is not an array. Given: ", json.dump()); + } + } +} + +template +static void verify_format_dims(const std::string &name, const Json &json, + const std::vector &dims_fields) { + for (const auto &field : dims_fields) { + if (!json.at(field).is_array()) { + ERR(ErrorType, name, ": ", field, + " is not an array. Given: ", json.dump()); + } + std::vector dims; + try { + dims = json.at(field).get>(); + } catch (const std::exception &e) { + ERR(ErrorType, name, ": ", field, + " is not an array of integers. Given: ", json.dump()); + } + for (const auto &dim : dims) { + if (dim < 0) { + ERR(ErrorType, name, ": ", field, + " contains negative value. Given: ", json.dump()); + } + } + if (ZeroNotAllowed) { + for (const auto &dim : dims) { + if (dim == 0) { + ERR(ErrorType, name, ": ", field, + " contains zero value. 
Given: ", json.dump()); + } + } } } } @@ -52,10 +85,15 @@ static void verify_format_tensor(const Json &json) { const std::vector required_fields = { "Id", "DataType", "Shape", "Strides", "Offsets", "PaddedShape", "Buffer"}; - const std::vector array_fields = {"Shape", "Strides", - "Offsets", "PaddedShape"}; - verify_format_json("TensorJson", json, required_fields, - array_fields); + const std::vector dims_fields = {"Shape", "Strides", "Offsets", + "PaddedShape"}; + verify_format_json("TensorJson", json, required_fields, {}); + verify_format_dims("TensorJson", json, + { + "Offsets", + }); + verify_format_dims("TensorJson", json, + {"Shape", "Strides", "PaddedShape"}); verify_format_buffer(json.at("Buffer")); } From 5ba79f9a72679f727e532927903f91a901c02048 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 12 Aug 2024 02:23:06 +0000 Subject: [PATCH 063/106] Align C++ Executor interface with Python interface --- ark/api/executor.cpp | 131 +++++++++++++++-------------- ark/api/executor_test.cpp | 12 +-- ark/include/ark/executor.hpp | 21 ++--- ark/ops/ops_communication_test.cpp | 12 +-- ark/ops/ops_identity_test.cpp | 1 - ark/ops/ops_reshape_test.cpp | 1 - ark/ops/ops_scalar_test.cpp | 3 - ark/ops/ops_tensor_test.cpp | 3 - ark/ops/ops_test_common.cpp | 1 - python/ark/__init__.py | 2 +- python/ark/module.py | 10 +-- python/ark/runtime.py | 70 +++++---------- python/executor_py.cpp | 20 ++--- python/unittest/test_runtime.py | 8 +- 14 files changed, 125 insertions(+), 170 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index c424271cc..9d9d79a43 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -145,11 +145,9 @@ static size_t tensor_stride_bytes(const Json &tensor) { class Executor::Impl { public: - Impl(int device_id, Stream stream, const std::string &name, bool loop_mode); + Impl() : plan_json_(), device_id_(-1) {}; ~Impl(); - void init(const PlanJson &plan); - int device_id() const { return device_id_; } Stream stream() const { return reinterpret_cast(stream_raw_); } @@ -160,9 +158,11 @@ class Executor::Impl { std::string plan() const { return plan_json_.dump_pretty(); } - void add_plan(const std::string &plan); - void compile(); - void launch(); + const std::string &name() const { return name_; } + + void compile(const std::string &plan, int device_id, + const std::string &name); + void launch(Stream stream, bool loop_mode); void run(int iter); void wait(int64_t max_spin_count); float stop(int64_t max_spin_count); @@ -175,7 +175,15 @@ class Executor::Impl { void tensor_write(const Tensor &tensor, const void *data, size_t bytes, Stream stream, bool is_d2d) const; + protected: + friend class DefaultExecutor; + + gpuStream stream_raw_; + bool loop_mode_; + private: + void init(const PlanJson &plan_json, int device_id, + const std::string &name); void init_communicator(); std::map init_buffers(const Json &plan_json); std::map init_buffer_addrs( @@ -184,14 +192,9 @@ class Executor::Impl { std::set init_remote_ranks(const Json &plan_json) const; void init_channels(const std::set &remote_ranks); - protected: + PlanJson plan_json_; int device_id_; std::string name_; - bool loop_mode_; - - bool is_buffer_allocated_; - - gpuStream stream_raw_; int rank_; int world_size_; @@ -200,7 +203,6 @@ class Executor::Impl { bool is_recording_ = false; float elapsed_msec_ = -1; - PlanJson plan_json_; std::vector external_buffers_; std::vector external_args_; std::map buffer_id_to_name_; @@ -224,26 +226,25 @@ class Executor::Impl { rank_to_sm_channels_; }; 
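
// A minimal usage sketch of the realigned interface declared above,
// mirroring the updated unit tests later in this patch; the model and
// planner setup is an illustrative assumption, not part of the change:
//
//     ark::Model model;
//     ark::Planner planner(model, /*device_id=*/0);
//     ark::Executor exe;
//     exe.compile(0, planner.plan());  // parse the plan and build the kernel
//     exe.launch();                    // default stream, loop mode
//     exe.run(1);
//     exe.wait();
//     exe.stop();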
-Executor::Impl::Impl(int device_id, Stream stream, const std::string &name, - bool loop_mode) - : device_id_(device_id), name_(name), loop_mode_(loop_mode) { - if (device_id < 0) { - ERR(InvalidUsageError, "Invalid device ID ", device_id); - } - if (stream) { - stream_raw_ = reinterpret_cast(stream); - } else { - stream_ = GpuManager::get_instance(device_id_)->create_stream(); - stream_raw_ = stream_->get(); - } -} - Executor::Impl::~Impl() { if (is_launched_) stop(-1); } -void Executor::Impl::init(const PlanJson &plan_json) { +void Executor::Impl::init(const PlanJson &plan_json, int device_id, + const std::string &name) { + if (device_id < 0) { + ERR(InvalidUsageError, "Invalid device ID ", device_id); + } + plan_json_ = plan_json; + device_id_ = device_id; + name_ = name; + + external_buffers_.clear(); + external_args_.clear(); + buffer_id_to_name_.clear(); + total_bytes_ = 0; + rank_ = plan_json_["Rank"].get(); world_size_ = plan_json_["WorldSize"].get(); @@ -277,7 +278,6 @@ void Executor::Impl::init(const PlanJson &plan_json) { timer_end_ = gpu_manager->create_event(); if (total_bytes_ > 0) { buffers_.push_back(gpu_manager->malloc(total_bytes_, 65536)); - is_buffer_allocated_ = true; buffer_id_to_addr_ = init_buffer_addrs(buffers_.back(), buffer_id_to_offset_); } @@ -700,25 +700,32 @@ void Executor::Impl::init_channels(const std::set &remote_ranks) { } } -void Executor::Impl::add_plan(const std::string &plan) { - external_buffers_.clear(); - external_args_.clear(); - buffer_id_to_name_.clear(); - total_bytes_ = 0; - is_buffer_allocated_ = false; - init(Json::parse(plan)); +void Executor::Impl::compile(const std::string &plan, int device_id, + const std::string &name) { + if (is_launched_) { + ERR(InvalidUsageError, "Need to stop before re-compiling."); + return; + } + init(PlanJson::parse(plan), device_id, name); + kernel_->compile(); } -void Executor::Impl::compile() { kernel_->compile(); } - -void Executor::Impl::launch() { - if (!kernel_->is_compiled()) { - ERR(InvalidUsageError, "Need to compile first before initialization."); +void Executor::Impl::launch(Stream stream, bool loop_mode) { + if ((kernel_ == nullptr) || !kernel_->is_compiled()) { + ERR(InvalidUsageError, "Need to compile first before launch."); } if (is_launched_) { LOG(WARN, "Ignore launching twice."); return; } + if (stream) { + stream_raw_ = reinterpret_cast(stream); + } else { + stream_ = GpuManager::get_instance(device_id_)->create_stream(); + stream_raw_ = stream_->get(); + } + loop_mode_ = loop_mode; + auto get_global_rt = [&](const std::string &symbol) { return reinterpret_cast(kernel_->get_global(symbol)); }; @@ -773,8 +780,8 @@ void Executor::Impl::launch() { if (loop_mode_) { // Initialize loop flags. atomicStoreRelaxed(flag_->ref(), 0); - void *flag_ptr = flag_->ref(); void *buf_ptr = (buffers_.empty()) ? 
nullptr : buffers_.back()->ref(); + void *flag_ptr = flag_->ref(); std::vector args = {&buf_ptr, &flag_ptr}; for (auto &buffer : external_buffers_) { args.push_back(&buffer); @@ -990,18 +997,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data, GLOG(gpuStreamSynchronize(copy_stream_raw)); } -Executor::Executor(int device_id, Stream stream, const std::string &name, - const std::string &plan, bool loop_mode) - : impl_(std::make_unique(device_id, stream, name, - loop_mode)) { - auto &plan_path = get_env().enforce_plan_path; - if (!plan_path.empty()) { - LOG(INFO, "Enforce executor plan path: ", plan_path); - impl_->init(Json::parse(read_file(plan_path))); - } else if (!plan.empty()) { - impl_->init(Json::parse(plan)); - } -} +Executor::Executor() : impl_(std::make_unique()) {} Executor::~Executor() = default; @@ -1013,11 +1009,16 @@ std::shared_ptr Executor::buffer() const { return impl_->buffer(); } std::string Executor::plan() const { return impl_->plan(); } -void Executor::add_plan(const std::string &plan) { impl_->add_plan(plan); } +const std::string &Executor::name() const { return impl_->name(); } -void Executor::compile() { impl_->compile(); } +void Executor::compile(int device_id, const std::string &plan, + const std::string &name) { + impl_->compile(device_id, plan, name); +} -void Executor::launch() { impl_->launch(); } +void Executor::launch(Stream stream, bool loop_mode) { + impl_->launch(stream, loop_mode); +} void Executor::run(int iter) { impl_->run(iter); } @@ -1054,14 +1055,20 @@ DefaultExecutor::DefaultExecutor( const Model &model, int device_id, Stream stream, const std::vector &config_rules, const std::string &name, bool loop_mode) - : Executor((device_id < 0) ? (model.rank() % get_env().num_ranks_per_host) - : device_id, - stream, name, "", loop_mode) { - Planner planner(model, impl_->device_id()); + : Executor() { + device_id = (device_id < 0) ? (model.rank() % get_env().num_ranks_per_host) + : device_id; + Planner planner(model, device_id); for (const auto &rule : config_rules) { planner.install_config_rule(rule); } - impl_->init(Json::parse(planner.plan())); + compile(device_id, planner.plan(), name); + impl_->stream_raw_ = reinterpret_cast(stream); + impl_->loop_mode_ = loop_mode; +} + +void DefaultExecutor::launch() { + Executor::launch(reinterpret_cast(impl_->stream_raw_), impl_->loop_mode_); } } // namespace ark diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp index 75d506ecb..e54578dfc 100644 --- a/ark/api/executor_test.cpp +++ b/ark/api/executor_test.cpp @@ -20,7 +20,6 @@ ark::unittest::State test_executor() { UNITTEST_EQ(executor.device_id(), 0); UNITTEST_EQ(executor.stream(), stream); - executor.compile(); executor.launch(); executor.run(1); executor.wait(); @@ -31,7 +30,6 @@ ark::unittest::State test_executor() { } { ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode); - executor.compile(); executor.launch(); executor.run(1); executor.wait(); @@ -48,7 +46,6 @@ ark::unittest::State test_executor() { ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode); UNITTEST_THROW(executor.launch(), ark::InvalidUsageError); - executor.compile(); executor.launch(); executor.launch(); // Will be ignored with a warning. 
executor.run(1); @@ -86,7 +83,6 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape, m.noop(tensor); ark::DefaultExecutor executor(m, 0); - executor.compile(); executor.launch(); UNITTEST_NE(executor.tensor_address(tensor), nullptr); @@ -169,15 +165,15 @@ ark::unittest::State test_executor_tensor_read_write_stride_offset() { } ark::unittest::State test_executor_invalid() { + ark::Executor exe; + // Invalid device ID. - UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""), - ark::InvalidUsageError); + UNITTEST_THROW(exe.compile(-1, ""), ark::InvalidUsageError); // Invalid rank. ark::PlanJson plan; plan["Rank"] = 1; - UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan.dump(), true), - ark::InvalidUsageError); + UNITTEST_THROW(exe.compile(0, plan.dump()), ark::InvalidUsageError); return ark::unittest::SUCCESS; } diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index d44ac2302..8e6577cd2 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -21,8 +21,7 @@ class GpuMemory; class Executor { public: /// Constructor. - Executor(int device_id, Stream stream, const std::string &name, - const std::string &plan, bool loop_mode = true); + Executor(); /// Destructor. ~Executor(); @@ -39,23 +38,22 @@ class Executor { /// Return the plan string. std::string plan() const; - /// Add a plan to the executor. - void add_plan(const std::string &plan); + const std::string &name() const; /// Compile the model. This must be called before `launch()`. - void compile(); + void compile(const std::string &plan, int device_id, + const std::string &name = "executor"); - /// Launch the model (not running yet). This must be called after - /// `compile()`. - void launch(); + /// Launch the executor. This must be called after `compile()`. + void launch(Stream stream = nullptr, bool loop_mode = true); - /// Run the model for `iter` iterations. + /// Run the executor for `iter` iterations. void run(int iter); /// Wait for the previous run to finish. void wait(int64_t max_spin_count = -1); - /// Stop the model and return the elapsed time in milliseconds. + /// Stop the executor and return the elapsed time in milliseconds. /// Once this is called, we need to call `launch()` again to run the model /// again. float stop(int64_t max_spin_count = -1); @@ -105,6 +103,9 @@ class DefaultExecutor : public Executor { const Model &model, int device_id = -1, Stream stream = nullptr, const std::vector &config_rules = {}, const std::string &name = "DefaultExecutor", bool loop_mode = true); + + /// Launch the default executor. 
+ void launch(); }; } // namespace ark diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index 8cdad41b2..7a7fec523 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -25,7 +25,6 @@ ark::unittest::State test_communication_send_recv_unidir() { } ark::DefaultExecutor exe(model, gpu_id); - exe.compile(); if (gpu_id == 0) { std::vector data(1024); @@ -68,7 +67,6 @@ ark::unittest::State test_communication_send_recv_unidir() { } ark::DefaultExecutor exe(model, gpu_id); - exe.compile(); if (gpu_id == 1) { std::vector data(1024); @@ -117,7 +115,6 @@ ark::unittest::State test_communication_send_recv_bidir() { tns2 = model.recv(tns2_data, remote_gpu_id, tag); ark::DefaultExecutor exe(model, gpu_id); - exe.compile(); std::vector data(1024); std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1)); @@ -161,7 +158,6 @@ ark::unittest::State test_communication_send_recv_bidir() { ark::Tensor sum = model.add(tns2, tns_data); ark::DefaultExecutor exe(model, gpu_id); - exe.compile(); std::vector data(1024); std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1)); @@ -232,7 +228,6 @@ ark::unittest::State test_communication_send_recv_bidir_sm() { tns2 = model.recv(tns2_data, remote_gpu_id, tag); ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule}); - exe.compile(); std::vector data(1024); std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1)); @@ -276,7 +271,6 @@ ark::unittest::State test_communication_send_recv_bidir_sm() { ark::Tensor sum = model.add(tns2, tns_data); ark::DefaultExecutor exe(model, gpu_id, nullptr, {config_rule}); - exe.compile(); std::vector data(1024); std::iota(data.begin(), data.end(), ark::half_t(gpu_id + 1)); @@ -319,7 +313,6 @@ ark::unittest::State test_communication_send_packet() { } ark::DefaultExecutor exe(model, gpu_id); - exe.compile(); if (gpu_id == 0) { std::vector data(1024); @@ -362,7 +355,6 @@ ark::unittest::State test_communication_send_recv_reduce_packet() { model.recv_packet(shard_tensors[peer_gpu_id], peer_gpu_id, 1, 1); ark::DefaultExecutor exe(model, gpu_id); - exe.compile(); std::vector data(1024); std::iota(data.begin(), data.end(), 1.0f); @@ -433,8 +425,8 @@ ark::unittest::State test_communication_send_recv_reduce() { ark::Planner planner(model, gpu_id); planner.install_config_rule(config_rule); - ark::Executor exe(gpu_id, nullptr, "Executor", planner.plan()); - exe.compile(); + ark::Executor exe; + exe.compile(gpu_id, planner.plan()); std::vector data(1024); std::iota(data.begin(), data.end(), 1.0f); diff --git a/ark/ops/ops_identity_test.cpp b/ark/ops/ops_identity_test.cpp index a6e49c9c0..eb8d3f4d4 100644 --- a/ark/ops/ops_identity_test.cpp +++ b/ark/ops/ops_identity_test.cpp @@ -58,7 +58,6 @@ ark::unittest::State test_ops_identity() { // Create an executor ark::DefaultExecutor exe(model); - exe.compile(); int num_elem = 2 * 3 * 4 * 5; diff --git a/ark/ops/ops_reshape_test.cpp b/ark/ops/ops_reshape_test.cpp index 1128c955a..7bb8aa4be 100644 --- a/ark/ops/ops_reshape_test.cpp +++ b/ark/ops/ops_reshape_test.cpp @@ -9,7 +9,6 @@ void test_reshape_checker(ark::Model &m, ark::Tensor t0, ark::Tensor t1, const std::string &) { ark::DefaultExecutor exe(m); - exe.compile(); std::vector data_vec(t0.shape().nelems()); std::iota(data_vec.begin(), data_vec.end(), 1.0f); diff --git a/ark/ops/ops_scalar_test.cpp b/ark/ops/ops_scalar_test.cpp index 6afc9e1ad..47a5b40bd 100644 --- a/ark/ops/ops_scalar_test.cpp +++ b/ark/ops/ops_scalar_test.cpp @@ -66,7 +66,6 @@ 
ark::unittest::State test_scalar_assign_fp16() { ark::Tensor t = m.constant(7, ark::Dims(4, 2, 50), ark::FP16); ark::DefaultExecutor exe(m); - exe.compile(); exe.launch(); exe.run(1); @@ -84,7 +83,6 @@ ark::unittest::State test_scalar_assign_fp16() { ark::Tensor out = m.copy(7, t); ark::DefaultExecutor exe(m); - exe.compile(); std::vector data(4 * 2 * 50, 3); exe.tensor_write(t, data); @@ -109,7 +107,6 @@ ark::unittest::State test_scalar_assign_fp32() { ark::Tensor out = m.copy(7); ark::DefaultExecutor exe(m); - exe.compile(); exe.launch(); exe.run(1); diff --git a/ark/ops/ops_tensor_test.cpp b/ark/ops/ops_tensor_test.cpp index be6488ef1..a2c36fd8c 100644 --- a/ark/ops/ops_tensor_test.cpp +++ b/ark/ops/ops_tensor_test.cpp @@ -20,7 +20,6 @@ ark::unittest::State test_tensor_strides() { // Create an executor ark::DefaultExecutor exe(model); - exe.compile(); // Fill buffer data: {1.0, 2.0, 3.0, 4.0} std::vector data(shape.nelems()); @@ -53,7 +52,6 @@ ark::unittest::State test_tensor_memcpy() { // Create an executor ark::DefaultExecutor exe(model); - exe.compile(); // Fill buffer data: {1.0, 2.0, 3.0, ..., 3024.0} std::vector data(strides.nelems()); @@ -138,7 +136,6 @@ ark::unittest::State test_tensor_layout() { // Create an executor ark::DefaultExecutor exe(model); - exe.compile(); // Fill tensor data: {1.0, 2.0, 3.0, ..., 120.0} std::vector data(2 * 3 * 4 * 5); diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index 4e94d06a7..42f7e670e 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -38,7 +38,6 @@ OpsTestResult op_test( const std::vector &config_rules, bool print_on_error) { DefaultExecutor exe(model, -1, nullptr, config_rules); - exe.compile(); std::vector>> inputs_data_storages; std::vector inputs_data_refs; diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 68b03ab29..939c4837f 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -39,7 +39,7 @@ def set_world_size(world_size): from .init import init from .tensor import Dims, Tensor, Parameter from .module import Module, RuntimeModule -from .runtime import Runtime +from .runtime import * from .serialize import save, load from .data_type import ( DataType, diff --git a/python/ark/module.py b/python/ark/module.py index 0917ea1ed..49d2ddf00 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -168,8 +168,8 @@ def forward(ctx, ark_module, *args, **kwargs): rt = Runtime.get_runtime() rt.launch() rt.run() - output = output.get_torch_view() - rt.reset(persist=True) + rt.stop() + output = output.to_torch() return output @staticmethod @@ -190,12 +190,12 @@ def backward(ctx, *grad_outputs): rt = Runtime.get_runtime() rt.launch() rt.run() - grad_inputs = [grad.get_torch_view() for grad in grad_inputs] + rt.stop() + grad_inputs = [grad.to_torch() for grad in grad_inputs] for _, param in params_dict.items(): if param.staged_tensor is not None: - pytorch_grad = param.staged_tensor.get_torch_view() + pytorch_grad = param.staged_tensor.to_torch() param.torch_param.grad = pytorch_grad - rt.reset(persist=True) return (None, *grad_inputs) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 1523905d7..f3baf3994 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -16,10 +16,6 @@ class _RuntimeState: runtime = None -class Executor(_Executor): - pass - - class Runtime: """ Convenience class for running a model. 
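The runtime.py hunks below rebuild `Runtime.launch()` around this single long-lived `_Executor`: launching now stops any running kernel, recompiles only when the plan string or device id differs from what the executor already holds, and then relaunches. A rough usage sketch of the intended behavior, mirroring the updated unit tests later in this series:

    import ark

    ark.init()
    a = ark.tensor([64, 64], ark.fp16)
    b = ark.add(a, a)

    rt = ark.Runtime.get_runtime()
    rt.launch()  # plans, compiles, and launches on the first call
    rt.run()
    rt.stop()    # the kernel stops; the compiled executor stays alive
    rt.launch()  # same plan and device: relaunch without recompiling
    rt.reset()   # destroy the executor and return to the Init state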
@@ -35,16 +31,11 @@ class State(Enum): Running = 2 def __init__(self): - self.executor: Executor = None + self.executor: _Executor = _Executor() self.state: Runtime.State = Runtime.State.Init + self.loop_mode = True _RuntimeState.runtime = self - def get_state(self) -> "Runtime.State": - """ - Get the runtime state. - """ - return self.state - @staticmethod def get_runtime() -> "Runtime": """ @@ -76,14 +67,6 @@ def running(self) -> bool: """ return self.state == Runtime.State.Running - def add_plan(self, plan: Plan): - """ - Add a plan to the executor. - """ - if self.executor is None: - raise RuntimeError("Executor is not initialized") - self.executor.add_plan(str(plan)) - def launch( self, plan: Plan = None, @@ -96,33 +79,21 @@ def launch( the CUDA kernels. The GPU context and the connection between GPUs will be initialized. The executor will compile the cuda kernels and launch the ARK runtime. """ + if device_id < 0: + logging.error(f"Invalid device_id: {device_id}") + raise ValueError(f"Invalid device_id: {device_id}") plan = Planner(device_id).plan() if plan is None else plan + plan_str = str(plan) if self.launched(): - # If the Runtime state is already launched and we are adding another plan - # to the executor, we compile the new kernel and launch the executor again. - self.executor.add_plan(str(plan)) - self.executor.compile() - self.executor.launch() - return + # Stop the current running model + self.stop() + + # Recompile if the previous launch was not compiled with the same info + # or if this is the first launch + if plan_str != self.executor.plan() or device_id != self.executor.device_id(): + self.executor.compile(plan_str, device_id) - # If the RuntimeState is init, we need to create a new executor and - # compile the kernels - if self.state == Runtime.State.Init: - if self.executor is not None: - if not self.executor.destroyed(): - logging.warning( - f"Runtime has already been launched. Destroying the old executor" - ) - self.executor.destroy() - self.executor = Executor( - device_id, - stream, - "ArkRuntime", - str(plan), - loop_mode, - ) - self.executor.compile() - self.executor.launch() + self.executor.launch(stream, loop_mode) self.state = Runtime.State.LaunchedNotRunning def run(self, iter=1, non_blocking=False): @@ -168,16 +139,15 @@ def stop(self) -> float: self.state = Runtime.State.LaunchedNotRunning return elapsed - def reset(self, persist=False): + def reset(self): """ Reset the runtime. 
""" if self.launched(): self.stop() - if persist: - return - if self.executor is not None: - if not self.executor.destroyed(): - self.executor.destroy() - self.executor = None + self.executor.destroy() + self.executor = _Executor() self.state = Runtime.State.Init + + +__all__ = ["Runtime"] diff --git a/python/executor_py.cpp b/python/executor_py.cpp index e10277646..5b4e7959f 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -161,21 +161,20 @@ static py::capsule tensor_to_dlpack(ark::Executor &self, const ark::Tensor &tens void register_executor(py::module &m) { py::class_(m, "_Executor") - .def(py::init([](int device_id, uintptr_t stream, - const std::string &name, const std::string &plan, - bool loop_mode) { - return new ark::Executor(device_id, - reinterpret_cast(stream), - name, plan, loop_mode); - })) + .def(py::init<>()) .def("device_id", &ark::Executor::device_id) .def("stream", [](ark::Executor *self) { return reinterpret_cast(self->stream()); }) .def("plan", &ark::Executor::plan) - .def("compile", &ark::Executor::compile) - .def("launch", &ark::Executor::launch) + .def("name", &ark::Executor::name) + .def("compile", &ark::Executor::compile, py::arg("device_id"), + py::arg("plan"), py::arg("name") = "executor") + .def("launch", [](ark::Executor *self, uintptr_t stream, bool loop_mode) { + self->launch(reinterpret_cast(stream), loop_mode); + }, + py::arg("stream") = 0, py::arg("loop_mode") = true) .def("run", &ark::Executor::run, py::arg("iter")) .def("wait", &ark::Executor::wait, py::arg("max_spin_count") = -1) .def("stop", &ark::Executor::stop, py::arg("max_spin_count") = -1) @@ -207,6 +206,5 @@ void register_executor(py::module &m) { size_t, uintptr_t, bool>(&tensor_write), py::arg("tensor"), py::arg("address"), py::arg("bytes"), py::arg("stream"), py::arg("is_d2d")) - .def("tensor_to_dlpack", &tensor_to_dlpack) - .def("add_plan", &ark::Executor::add_plan, py::arg("plan")); + .def("tensor_to_dlpack", &tensor_to_dlpack); } diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index b368bb93a..356430d9a 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -18,7 +18,7 @@ def test_runtime_relaunch(): assert rt.launched() == True -def test_add_plans(): +def test_runtime_init(): ark.init() M, N = 64, 64 input_tensor = ark.tensor([M, N], ark.fp16) @@ -35,7 +35,7 @@ def test_add_plans(): np.testing.assert_allclose( output_tensor_host, input_tensor_host + other_tensor_host ) - runtime.reset(persist=True) + runtime.stop() ark.Model.reset() prev_output = output_tensor new_tensor = ark.tensor([M, N], ark.fp16) @@ -51,7 +51,7 @@ def test_add_plans(): runtime.reset() -def test_reuse_plans(): +def test_runtime_reuse_plans(): ark.init() M, N = 64, 64 input_tensor = ark.tensor([M, N], ark.fp16) @@ -68,7 +68,7 @@ def test_reuse_plans(): np.testing.assert_allclose( output_tensor_host, input_tensor_host + other_tensor_host ) - runtime.reset(persist=True) + runtime.stop() ark.Model.reset() runtime.launch() runtime.run() From b0176ad47191e7cfda6db5f36b4f2c6dddc8c0d6 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 12 Aug 2024 02:24:12 +0000 Subject: [PATCH 064/106] lint --- python/ark/runtime.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index f3baf3994..1490cdeb8 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -90,7 +90,10 @@ def launch( # Recompile if the previous launch was not compiled with the same info # or if this is the 
first launch - if plan_str != self.executor.plan() or device_id != self.executor.device_id(): + if ( + plan_str != self.executor.plan() + or device_id != self.executor.device_id() + ): self.executor.compile(plan_str, device_id) self.executor.launch(stream, loop_mode) From 4db38e131056a476bc6bb3baeaf220ee96abcb10 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 12 Aug 2024 02:26:42 +0000 Subject: [PATCH 065/106] minor fix --- ark/api/executor.cpp | 6 +++--- ark/api/executor_test.cpp | 4 ++-- ark/ops/ops_communication_test.cpp | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 9d9d79a43..4505b2a35 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -1011,9 +1011,9 @@ std::string Executor::plan() const { return impl_->plan(); } const std::string &Executor::name() const { return impl_->name(); } -void Executor::compile(int device_id, const std::string &plan, +void Executor::compile(const std::string &plan, int device_id, const std::string &name) { - impl_->compile(device_id, plan, name); + impl_->compile(plan, device_id, name); } void Executor::launch(Stream stream, bool loop_mode) { @@ -1062,7 +1062,7 @@ DefaultExecutor::DefaultExecutor( for (const auto &rule : config_rules) { planner.install_config_rule(rule); } - compile(device_id, planner.plan(), name); + compile(planner.plan(), device_id, name); impl_->stream_raw_ = reinterpret_cast(stream); impl_->loop_mode_ = loop_mode; } diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp index e54578dfc..2cc3ee1c2 100644 --- a/ark/api/executor_test.cpp +++ b/ark/api/executor_test.cpp @@ -168,12 +168,12 @@ ark::unittest::State test_executor_invalid() { ark::Executor exe; // Invalid device ID. - UNITTEST_THROW(exe.compile(-1, ""), ark::InvalidUsageError); + UNITTEST_THROW(exe.compile("", -1), ark::InvalidUsageError); // Invalid rank. 
ark::PlanJson plan;
     plan["Rank"] = 1;
-    UNITTEST_THROW(exe.compile(0, plan.dump()), ark::InvalidUsageError);
+    UNITTEST_THROW(exe.compile(plan.dump(), 0), ark::InvalidUsageError);
 
     return ark::unittest::SUCCESS;
 }
diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp
index 7a7fec523..39c466909 100644
--- a/ark/ops/ops_communication_test.cpp
+++ b/ark/ops/ops_communication_test.cpp
@@ -426,7 +426,7 @@ ark::unittest::State test_communication_send_recv_reduce() {
     ark::Planner planner(model, gpu_id);
     planner.install_config_rule(config_rule);
     ark::Executor exe;
-    exe.compile(gpu_id, planner.plan());
+    exe.compile(planner.plan(), gpu_id);
 
     std::vector<float> data(1024);
     std::iota(data.begin(), data.end(), 1.0f);

From 15d423bed4fa399b02aefef5d0a9ed06d78fde7a Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 12 Aug 2024 03:39:05 +0000
Subject: [PATCH 066/106] more fixes

---
 ark/api/executor.cpp        | 38 +++++++++++++++++-----------------
 ark/api/executor_test.cpp   |  1 -
 ark/gpu/gpu_kernel.cpp      | 41 ++++++++++++++++++-----------------
 ark/gpu/gpu_kernel.hpp      | 11 +++++-----
 ark/gpu/gpu_kernel_test.cpp |  4 ++--
 5 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 4505b2a35..626fed808 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -199,6 +199,8 @@ class Executor::Impl {
     int rank_;
     int world_size_;
 
+    std::string kernel_name_;
+
     bool is_launched_ = false;
     bool is_recording_ = false;
     float elapsed_msec_ = -1;
@@ -300,21 +302,9 @@ void Executor::Impl::init(const PlanJson &plan_json, int device_id,
         init_channels(remote_ranks);
     }
 
-    std::string kernel_name;
-    if (loop_mode_) {
-        // should we add an identifier to specify which plan the kernel executes
-        // i.e.
ark_loop_kernel_2 for the second plan + kernel_name_ = "ark_loop_kernel"; + } else { + kernel_name_ = "ark_kernel"; + } + if (!name_.empty()) { + kernel_name_ += "_" + name_; + } + auto get_global_rt = [&](const std::string &symbol) { return reinterpret_cast(kernel_->get_global(symbol)); }; @@ -786,7 +787,7 @@ void Executor::Impl::launch(Stream stream, bool loop_mode) { for (auto &buffer : external_buffers_) { args.push_back(&buffer); } - kernel_->launch(stream_raw_, args); + kernel_->launch(kernel_name_, stream_raw_, args); } is_recording_ = true; is_launched_ = true; @@ -806,7 +807,7 @@ void Executor::Impl::run(int iter) { args.push_back(&buffer); } for (; i < iter; i++) { - kernel_->launch(stream_raw_, args); + kernel_->launch(kernel_name_, stream_raw_, args); } } } @@ -822,9 +823,8 @@ void Executor::Impl::wait(int64_t max_spin_count) { gpuError res = gpuStreamQuery(stream_raw_); if (res == gpuSuccess) { if (atomicLoadRelaxed(flag_->ref()) > 0) { - LOG(WARN, + ERR(InternalError, "Stream is finished but the loop flag is still set."); - break; } else { LOG(WARN, "wait() is delayed by a stream query. Regarding " diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp index 2cc3ee1c2..c8c96fa6d 100644 --- a/ark/api/executor_test.cpp +++ b/ark/api/executor_test.cpp @@ -44,7 +44,6 @@ ark::unittest::State test_executor() { } { ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode); - UNITTEST_THROW(executor.launch(), ark::InvalidUsageError); executor.launch(); executor.launch(); // Will be ignored with a warning. diff --git a/ark/gpu/gpu_kernel.cpp b/ark/gpu/gpu_kernel.cpp index d4412f80e..a474b32a7 100644 --- a/ark/gpu/gpu_kernel.cpp +++ b/ark/gpu/gpu_kernel.cpp @@ -15,24 +15,18 @@ namespace ark { GpuKernel::GpuKernel(int gpu_id, const std::string& code, const std::array& block_dim, - const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name) { - this->init(gpu_id, code, block_dim, grid_dim, smem_bytes, kernel_name); + const std::array& grid_dim, size_t smem_bytes) { + this->init(gpu_id, code, block_dim, grid_dim, smem_bytes); } void GpuKernel::init(int gpu_id, const std::string& code, const std::array& block_dim, - const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name) { + const std::array& grid_dim, size_t smem_bytes) { gpu_manager_ = GpuManager::get_instance(gpu_id); code_ = code; block_dim_ = block_dim; grid_dim_ = grid_dim; smem_bytes_ = smem_bytes; - kernel_name_ = kernel_name; - if (kernel_name_.size() == 0) { - ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name_); - } } void GpuKernel::compile() { @@ -45,21 +39,30 @@ void GpuKernel::compile() { } bin_ = gpu_compile({code_}, gpu_manager_->info().arch, max_reg_cnt); GLOG_DRV(gpuModuleLoadData(&module_, bin_.c_str())); - GLOG_DRV(gpuModuleGetFunction(&function_, module_, kernel_name_.c_str())); - - int static_smem_size_bytes; - GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes, - gpuFuncAttributeSharedSizeBytes, function_)); - int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes; - GLOG_DRV(gpuFuncSetAttribute(function_, - gpuFuncAttributeMaxDynamicSharedSizeBytes, - dynamic_smem_size_bytes)); } -void GpuKernel::launch(gpuStream stream, std::vector& args) { +void GpuKernel::launch(const std::string& kernel_name, gpuStream stream, + std::vector& args) { if (!this->is_compiled()) { ERR(InvalidUsageError, "Kernel is not compiled yet."); } + if (kernel_name.size() == 0) { + ERR(InvalidUsageError, "Invalid kernel name: ", kernel_name); + } + if 
(kernel_name_ != kernel_name) { + GLOG_DRV( + gpuModuleGetFunction(&function_, module_, kernel_name.c_str())); + + int static_smem_size_bytes; + GLOG_DRV(gpuFuncGetAttribute(&static_smem_size_bytes, + gpuFuncAttributeSharedSizeBytes, + function_)); + int dynamic_smem_size_bytes = smem_bytes_ - static_smem_size_bytes; + GLOG_DRV(gpuFuncSetAttribute(function_, + gpuFuncAttributeMaxDynamicSharedSizeBytes, + dynamic_smem_size_bytes)); + kernel_name_ = kernel_name; + } gpu_manager_->launch(function_, grid_dim_, block_dim_, smem_bytes_, stream, args.data(), nullptr); GLOG(gpuGetLastError()); diff --git a/ark/gpu/gpu_kernel.hpp b/ark/gpu/gpu_kernel.hpp index 5308cfead..1e02cc7a1 100644 --- a/ark/gpu/gpu_kernel.hpp +++ b/ark/gpu/gpu_kernel.hpp @@ -18,19 +18,18 @@ class GpuKernel { public: GpuKernel(int gpu_id, const std::string& codes, const std::array& block_dim, - const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name); + const std::array& grid_dim, size_t smem_bytes); void init(int gpu_id, const std::string& codes, const std::array& block_dim, - const std::array& grid_dim, size_t smem_bytes, - const std::string& kernel_name); + const std::array& grid_dim, size_t smem_bytes); void compile(); - void launch(gpuStream stream, std::vector& args); + void launch(const std::string& kernel_name, gpuStream stream, + std::vector& args); gpuDeviceptr get_global(const std::string& name, bool ignore_not_found = false) const; - bool is_compiled() const { return function_ != nullptr; } + bool is_compiled() const { return !bin_.empty(); } protected: std::shared_ptr gpu_manager_; diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp index 342ef9656..10e2410a9 100644 --- a/ark/gpu/gpu_kernel_test.cpp +++ b/ark/gpu/gpu_kernel_test.cpp @@ -8,13 +8,13 @@ const std::string void_kernel = "extern \"C\" __global__ void kernel() {}"; ark::unittest::State test_gpu_kernel() { - ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0, "kernel"); + ark::GpuKernel kernel(0, void_kernel, {1, 1, 1}, {1, 1, 1}, 0); UNITTEST_TRUE(!kernel.is_compiled()); kernel.compile(); UNITTEST_TRUE(kernel.is_compiled()); std::vector args; for (int i = 0; i < 10; i++) { - kernel.launch(nullptr, args); + kernel.launch("kernel", nullptr, args); } return ark::unittest::SUCCESS; } From 802d84faf2bdff101262d61ebd4cc6992f10d87f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 12 Aug 2024 08:10:22 +0000 Subject: [PATCH 067/106] error handling --- ark/api/executor.cpp | 7 ++++++- ark/api/executor_test.cpp | 6 +++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 626fed808..3fcecc12f 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -696,7 +696,12 @@ void Executor::Impl::compile(const std::string &plan, int device_id, ERR(InvalidUsageError, "Need to stop before re-compiling."); return; } - init(PlanJson::parse(plan), device_id, name); + try { + auto plan_json = Json::parse(plan); + init(plan_json, device_id, name); + } catch (const ::nlohmann::json::parse_error &e) { + ERR(InvalidUsageError, "Failed to parse the plan JSON: ", e.what()); + } kernel_->compile(); } diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp index c8c96fa6d..fd036628f 100644 --- a/ark/api/executor_test.cpp +++ b/ark/api/executor_test.cpp @@ -166,8 +166,12 @@ ark::unittest::State test_executor_tensor_read_write_stride_offset() { ark::unittest::State test_executor_invalid() { ark::Executor exe; + // Invalid plan. 
+ UNITTEST_THROW(exe.compile("not a json", 0), ark::InvalidUsageError); + // Invalid device ID. - UNITTEST_THROW(exe.compile("", -1), ark::InvalidUsageError); + UNITTEST_THROW(exe.compile(ark::PlanJson().dump(), -1), + ark::InvalidUsageError); // Invalid rank. ark::PlanJson plan; From 18a391fec8f1b3ac3fa0ddbdd1409f737b89105c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 12 Aug 2024 09:09:21 +0000 Subject: [PATCH 068/106] fix unit test --- ark/api/executor_test.cpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp index fd036628f..cf3495780 100644 --- a/ark/api/executor_test.cpp +++ b/ark/api/executor_test.cpp @@ -82,7 +82,7 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape, m.noop(tensor); ark::DefaultExecutor executor(m, 0); - executor.launch(); + UNITTEST_NE(executor.tensor_address(tensor), nullptr); // Copy data from CPU array to ARK tensor @@ -102,20 +102,28 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape, dev_data[i] = -1; } + ark::gpuStream stream; UNITTEST_EQ( - ark::gpuMemcpy(dev_data.data(), dev_ptr, shape.nelems() * sizeof(float), - ark::gpuMemcpyDeviceToHost), + ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking), ark::gpuSuccess); + + UNITTEST_EQ(ark::gpuMemcpyAsync(dev_data.data(), dev_ptr, + shape.nelems() * sizeof(float), + ark::gpuMemcpyDeviceToHost, stream), + ark::gpuSuccess); + UNITTEST_EQ(ark::gpuStreamSynchronize(stream), ark::gpuSuccess); + for (size_t i = 0; i < dev_data.size(); ++i) { UNITTEST_EQ(dev_data[i], static_cast(i)); dev_data[i] = -1; } // Copy -1s back to GPU array - UNITTEST_EQ( - ark::gpuMemcpy(dev_ptr, dev_data.data(), shape.nelems() * sizeof(float), - ark::gpuMemcpyHostToDevice), - ark::gpuSuccess); + UNITTEST_EQ(ark::gpuMemcpyAsync(dev_ptr, dev_data.data(), + shape.nelems() * sizeof(float), + ark::gpuMemcpyHostToDevice, stream), + ark::gpuSuccess); + UNITTEST_EQ(ark::gpuStreamSynchronize(stream), ark::gpuSuccess); // Copy data from GPU array to ARK tensor executor.tensor_write(tensor, dev_ptr, shape.nelems() * sizeof(float), @@ -131,10 +139,6 @@ ark::unittest::State test_executor_tensor_read_write(ark::Dims shape, } // Provide a stream - ark::gpuStream stream; - UNITTEST_EQ( - ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking), - ark::gpuSuccess); executor.tensor_read(tensor, host_data.data(), shape.nelems() * sizeof(float), stream); executor.tensor_write(tensor, host_data.data(), From 7ae0a65f0ce2aba8e28b825e224d4864e8eb012c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 12 Aug 2024 20:25:43 +0000 Subject: [PATCH 069/106] updates --- ark/api/executor.cpp | 6 ++---- ark/env.cpp | 4 ++-- ark/gpu/gpu_kernel_test.cpp | 1 + ark/include/ark/error.hpp | 2 +- python/ark/tensor.py | 5 ++++- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 3fcecc12f..162aaa1f0 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -3,14 +3,12 @@ #include "ark/executor.hpp" -#include - #include +#include #include #include #include #include -#include #include "ark/data_type.hpp" #include "ark/model.hpp" @@ -214,7 +212,7 @@ class Executor::Impl { std::shared_ptr codegen_; std::shared_ptr timer_begin_; std::shared_ptr timer_end_; - std::vector> buffers_; + std::list> buffers_; std::shared_ptr flag_; std::shared_ptr stream_; std::shared_ptr kernel_; diff --git a/ark/env.cpp b/ark/env.cpp index d8322378f..f9e7355ff 100644 
--- a/ark/env.cpp +++ b/ark/env.cpp @@ -10,11 +10,11 @@ #define DEFAULT_ARK_LOG_LEVEL "INFO" #define DEFAULT_ARK_ROOT "/usr/local/ark" #define DEFAULT_ARK_TMP "/tmp/ark" -#define DEFAULT_ARK_KEEP_TMP true +#define DEFAULT_ARK_KEEP_TMP false #define DEFAULT_ARK_HOSTFILE_NAME "hostfile" #define DEFAULT_ARK_NUM_RANKS_PER_HOST 8 #define DEFAULT_ARK_DISABLE_IB false -#define DEFAULT_ARK_IGNORE_BINARY_CACHE true +#define DEFAULT_ARK_IGNORE_BINARY_CACHE false #define DEFAULT_ARK_ENFORCE_PLAN_PATH "" #define DEFAULT_ARK_MSCCLPP_PORT 50051 diff --git a/ark/gpu/gpu_kernel_test.cpp b/ark/gpu/gpu_kernel_test.cpp index 10e2410a9..7b9f7f176 100644 --- a/ark/gpu/gpu_kernel_test.cpp +++ b/ark/gpu/gpu_kernel_test.cpp @@ -13,6 +13,7 @@ ark::unittest::State test_gpu_kernel() { kernel.compile(); UNITTEST_TRUE(kernel.is_compiled()); std::vector args; + UNITTEST_THROW(kernel.launch("", nullptr, args), ark::InvalidUsageError); for (int i = 0; i < 10; i++) { kernel.launch("kernel", nullptr, args); } diff --git a/ark/include/ark/error.hpp b/ark/include/ark/error.hpp index 1fbec0c01..965b1c0bc 100644 --- a/ark/include/ark/error.hpp +++ b/ark/include/ark/error.hpp @@ -44,4 +44,4 @@ REGISTER_ERROR_TYPE(UnitTestError) } // namespace ark -#endif // ARK_ERROR_HPP \ No newline at end of file +#endif // ARK_ERROR_HPP diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 3fda8b3b3..9211f7d9d 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -162,7 +162,10 @@ def from_torch(tensor: torch.Tensor) -> "Tensor": raise ValueError("Torch tensor must be contiguous.") elif tensor.device.type == "cpu": raise ValueError("Torch tensor must be on a device.") - return Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor)) + ark_tensor = Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor)) + # Share ownership of the memory with the torch tensor + ark_tensor.__torch_buffer__ = tensor + return ark_tensor def copy( self, data: Union[np.ndarray, torch.Tensor], stream: int = 0 From 4cca6099ed496bdb2850ae5a9a2304143472d570 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 12 Aug 2024 21:01:30 +0000 Subject: [PATCH 070/106] fix unit test --- ark/utils/utils_net_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ark/utils/utils_net_test.cpp b/ark/utils/utils_net_test.cpp index 4c3b6f162..95dda890c 100644 --- a/ark/utils/utils_net_test.cpp +++ b/ark/utils/utils_net_test.cpp @@ -12,6 +12,7 @@ ark::unittest::State test_ipc_hosts() { auto tmp_hostfile = tmp_dir + "/.test_ipc_hostfile"; ark::write_file(tmp_hostfile, "127.0.0.1\n127.0.0.1\n127.0.0.1\n"); ::setenv("ARK_HOSTFILE", tmp_hostfile.c_str(), 1); + ::setenv("ARK_KEEP_TMP", "1", 1); ark::init(); UNITTEST_EQ(ark::get_host(0, true), "127.0.0.1"); @@ -31,6 +32,7 @@ ark::unittest::State test_ipc_hosts_unknown_host() { auto tmp_hostfile = tmp_dir + "/.test_ipc_hostfile"; ark::write_file(tmp_hostfile, "unknown\nunknown\nunknown\n"); ::setenv("ARK_HOSTFILE", tmp_hostfile.c_str(), 1); + ::setenv("ARK_KEEP_TMP", "1", 1); ark::init(); UNITTEST_THROW(ark::get_host(0, true), ark::InvalidUsageError); From 45b14b886571aa35f41fa5fc51ad97c3dd0b4ac1 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Wed, 14 Aug 2024 10:07:28 +0000 Subject: [PATCH 071/106] fix imports & pytest --- .dockerignore | 1 + .github/workflows/codeql.yml | 4 ++-- .github/workflows/ut-cuda.yml | 8 ++++++-- python/CMakeLists.txt | 5 ++++- python/ark/__init__.py | 10 +--------- python/ark/data_type.py | 2 +- python/ark/error.py | 16 ++++++++-------- python/ark/init.py | 2 +- python/ark/model.py | 
2 +- python/ark/planner.py | 2 +- python/ark/runtime.py | 2 +- python/ark/tensor.py | 2 +- python/unittest/test.py | 6 ------ python/unittest/test_error.py | 4 ++-- python/unittest/test_model.py | 5 ++--- python/unittest/test_runtime.py | 8 ++++---- python/unittest/test_tensor.py | 3 +-- 17 files changed, 37 insertions(+), 45 deletions(-) diff --git a/.dockerignore b/.dockerignore index e47f48873..60583dbf9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,6 +6,7 @@ build/ *.pyc *.pyo *.pyd +.pytest_cache/ # Git **/.git diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 509ac6d48..7ac2f1649 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -49,7 +49,7 @@ jobs: - name: Build run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. + cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. make build ark_py - name: Perform CodeQL Analysis @@ -95,7 +95,7 @@ jobs: - name: Build run: | mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF .. + CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF .. make -j build ark_py - name: Perform CodeQL Analysis diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index 363f1b771..3fa92605e 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -44,7 +44,7 @@ jobs: - name: Build run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON .. + cmake -DCMAKE_BUILD_TYPE=Debug .. make -j ut ark_py - name: Run C++ UT @@ -71,7 +71,11 @@ jobs: - name: Run Python UT run: | cd build - ARK_ROOT=$PWD pytest --cov=../python/ark --cov-report lcov:py_coverage.info --verbose ../python/unittest/test.py + PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \ + --cov=../python/ark \ + --cov-report lcov:py_coverage.info \ + --verbose \ + ../python/unittest/test.py - name: Report Coverage env: diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 2e160f8d1..597388e2d 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -18,7 +18,10 @@ FetchContent_MakeAvailable(pybind11) file(GLOB_RECURSE BIND_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) pybind11_add_module(ark_py ${BIND_SOURCES}) -set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core) +set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ark) +add_custom_command(TARGET ark_py POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/ark ${CMAKE_CURRENT_BINARY_DIR}/ark +) target_link_libraries(ark_py PRIVATE ark_static) target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS}) target_include_directories(ark_py PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../ark) diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 939c4837f..1aebfa43f 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -1,15 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import sys import os if os.environ.get("ARK_ROOT", None) is None: os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__)) -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -import _ark_core +from . 
import _ark_core from .model import Model @@ -21,11 +18,6 @@ def version(): return __version__ -def srand(seed): - """Sets the seed for random number generation.""" - _ark_core.srand(seed) - - def set_rank(rank): """Sets the rank of the current process.""" Model.set_rank(rank) diff --git a/python/ark/data_type.py b/python/ark/data_type.py index 41c4201c3..8ab982106 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. import numpy -import _ark_core +from . import _ark_core try: import torch diff --git a/python/ark/error.py b/python/ark/error.py index 4ffe6a3f8..cec8ab137 100644 --- a/python/ark/error.py +++ b/python/ark/error.py @@ -1,14 +1,14 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from _ark_core import _BaseError as BaseError -from _ark_core import _InternalError as InternalError -from _ark_core import _InvalidUsageError as InvalidUsageError -from _ark_core import _ModelError as ModelError -from _ark_core import _PlanError as PlanError -from _ark_core import _UnsupportedError as UnsupportedError -from _ark_core import _SystemError as SystemError -from _ark_core import _GpuError as GpuError +from ._ark_core import _BaseError as BaseError +from ._ark_core import _InternalError as InternalError +from ._ark_core import _InvalidUsageError as InvalidUsageError +from ._ark_core import _ModelError as ModelError +from ._ark_core import _PlanError as PlanError +from ._ark_core import _UnsupportedError as UnsupportedError +from ._ark_core import _SystemError as SystemError +from ._ark_core import _GpuError as GpuError __all__ = [ "BaseError", diff --git a/python/ark/init.py b/python/ark/init.py index a4a67e85d..7daa0771b 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import _ark_core +from . import _ark_core from .model import Model from .runtime import _RuntimeState diff --git a/python/ark/model.py b/python/ark/model.py index e6208fc16..87af88f49 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. 
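The relative-import fixes below and in the remaining modules make `_ark_core` resolve as a submodule of the `ark` package rather than through `sys.path` edits; combined with the CMake change above, which places the built extension and a copy of the Python sources under `build/python/ark/`, the package is usable straight from the build tree. A small sketch of the resulting import pattern, assuming that build layout:

    # e.g. invoked as:  PYTHONPATH=build/python python3 example.py
    import ark                      # found via PYTHONPATH
    from ark import _ark_core      # extension resolved relative to the package
    from ark._ark_core import _Model

    print(ark.version())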
from typing import NewType -from _ark_core import _Model +from ._ark_core import _Model _ModelState = NewType("_ModelState", None) diff --git a/python/ark/planner.py b/python/ark/planner.py index e7eb2e7ed..e5291bbce 100644 --- a/python/ark/planner.py +++ b/python/ark/planner.py @@ -5,7 +5,7 @@ import json from typing import Callable, Dict, List, Any -from _ark_core import _Planner, _PlannerContext +from ._ark_core import _Planner, _PlannerContext from .model import Model diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 1490cdeb8..fa953a873 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -4,7 +4,7 @@ import logging from enum import Enum -from _ark_core import _Executor +from ._ark_core import _Executor from .planner import Planner, Plan diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 9211f7d9d..45a54d169 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -4,7 +4,7 @@ import numpy as np from typing import Callable, List, Union, Type -from _ark_core import _Dims, _Tensor, _NullTensor +from ._ark_core import _Dims, _Tensor, _NullTensor from .data_type import DataType from .runtime import Runtime from .model import Model diff --git a/python/unittest/test.py b/python/unittest/test.py index d56932b83..e8f22fdae 100644 --- a/python/unittest/test.py +++ b/python/unittest/test.py @@ -1,12 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import sys -import os - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + "/..") -sys.path.insert(0, os.environ.get("ARK_ROOT", ".") + "/python") - from test_error import * from test_model import * from test_runtime import * diff --git a/python/unittest/test_error.py b/python/unittest/test_error.py index 299e2675e..115dd1a15 100644 --- a/python/unittest/test_error.py +++ b/python/unittest/test_error.py @@ -1,11 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import ark +from unittest_common import ark, pytest_ark +@pytest_ark() def test_error(): - ark.init() try: ark.tensor([0]) except ark.BaseError as e: diff --git a/python/unittest/test_model.py b/python/unittest/test_model.py index da8ae399a..d65191e54 100644 --- a/python/unittest/test_model.py +++ b/python/unittest/test_model.py @@ -1,13 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import ark +from unittest_common import ark, pytest_ark import json +@pytest_ark() def test_model(): - ark.init() - input_tensor = ark.tensor([64, 64], ark.fp16) other_tensor = ark.tensor([64, 64], ark.fp16) ark.add(input_tensor, other_tensor) diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index 356430d9a..dd8064d85 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -1,12 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
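The test modules below drop their per-test `ark.init()` calls in favor of a shared `pytest_ark()` decorator imported from `unittest_common`, whose definition is outside this series. A hypothetical reconstruction of what it likely does, offered purely as an assumption (the `need_torch` flag matches the `@pytest_ark(need_torch=True)` usage in test_tensor.py below):

    # Hypothetical sketch of unittest_common.pytest_ark; not taken from this patch.
    import functools

    import pytest

    import ark


    def pytest_ark(need_torch: bool = False):
        def decorator(test_fn):
            @functools.wraps(test_fn)
            def wrapped(*args, **kwargs):
                if need_torch:
                    pytest.importorskip("torch")  # skip when torch is unavailable
                ark.init()  # reset model/runtime state so tests stay isolated
                return test_fn(*args, **kwargs)

            return wrapped

        return decorator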
-import ark +from unittest_common import ark, pytest_ark import numpy as np +@pytest_ark() def test_runtime_relaunch(): - ark.init() with ark.Runtime.get_runtime() as rt: assert rt.launched() == False rt.launch() @@ -18,8 +18,8 @@ def test_runtime_relaunch(): assert rt.launched() == True +@pytest_ark() def test_runtime_init(): - ark.init() M, N = 64, 64 input_tensor = ark.tensor([M, N], ark.fp16) other_tensor = ark.tensor([M, N], ark.fp16) @@ -51,8 +51,8 @@ def test_runtime_init(): runtime.reset() +@pytest_ark() def test_runtime_reuse_plans(): - ark.init() M, N = 64, 64 input_tensor = ark.tensor([M, N], ark.fp16) other_tensor = ark.tensor([M, N], ark.fp16) diff --git a/python/unittest/test_tensor.py b/python/unittest/test_tensor.py index 1acad43ee..213264e3b 100644 --- a/python/unittest/test_tensor.py +++ b/python/unittest/test_tensor.py @@ -1,8 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from unittest_common import pytest_ark -import ark +from unittest_common import ark, pytest_ark @pytest_ark(need_torch=True) From f7c6867ca0ffe7d5e1b47495d9a0286252270e5a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 15 Aug 2024 09:02:54 +0000 Subject: [PATCH 072/106] fix codecov --- .github/workflows/ut-cuda.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index 3fa92605e..10b0679da 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -72,7 +72,7 @@ jobs: run: | cd build PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \ - --cov=../python/ark \ + --cov=python/ark \ --cov-report lcov:py_coverage.info \ --verbose \ ../python/unittest/test.py From 66b78a729d3c71e8a0dff1dfcd0ced1f1afa35a4 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 18 Aug 2024 21:03:10 -0700 Subject: [PATCH 073/106] Add placeholder operator (#239) Co-authored-by: Noli Gerawork --- ark/api/executor.cpp | 199 ++++++++++++-------- ark/api/tensor.cpp | 12 -- ark/codegen.cpp | 60 +++--- ark/codegen.hpp | 6 +- ark/cpu_timer.cpp | 16 -- ark/cpu_timer.h | 4 - ark/external_buffer_registry.cpp | 29 +++ ark/external_buffer_registry.hpp | 31 +++ ark/gpu/gpu.hpp | 4 + ark/include/ark/executor.hpp | 21 ++- ark/include/ark/model.hpp | 32 +++- ark/include/ark/tensor.hpp | 11 +- ark/model/model_buffer.cpp | 62 ++---- ark/model/model_buffer.hpp | 22 +-- ark/model/model_op.cpp | 2 + ark/model_buffer_manager.hpp | 57 ------ ark/ops/ops_placeholder.cpp | 49 +++++ ark/ops/ops_placeholder.hpp | 23 +++ ark/ops/ops_placeholder_test.cpp | 103 ++++++++++ python/ark/data_type.py | 5 +- python/ark/model.py | 17 ++ python/ark/module.py | 33 ++-- python/ark/ops.py | 77 ++++---- python/ark/runtime.py | 41 +++- python/ark/tensor.py | 124 ++++++++++-- python/ark/torch/__init__.py | 11 ++ python/ark/{torch_mock.py => torch/mock.py} | 0 python/executor_py.cpp | 45 ++++- python/model_py.cpp | 82 +++++++- python/tensor_py.cpp | 71 ------- python/unittest/test_conversion.py | 98 ++++++++-- 31 files changed, 887 insertions(+), 460 deletions(-) create mode 100644 ark/external_buffer_registry.cpp create mode 100644 ark/external_buffer_registry.hpp delete mode 100644 ark/model_buffer_manager.hpp create mode 100644 ark/ops/ops_placeholder.cpp create mode 100644 ark/ops/ops_placeholder.hpp create mode 100644 ark/ops/ops_placeholder_test.cpp create mode 100644 python/ark/torch/__init__.py rename python/ark/{torch_mock.py => torch/mock.py} (100%) diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 
162aaa1f0..50ec4c629 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -9,12 +9,14 @@ #include #include #include +#include #include "ark/data_type.hpp" #include "ark/model.hpp" #include "ark/planner.hpp" #include "codegen.hpp" #include "env.h" +#include "external_buffer_registry.hpp" #include "file_io.h" #include "gpu/gpu.hpp" #include "gpu/gpu_event.hpp" @@ -25,7 +27,6 @@ #include "model/model_buffer.hpp" #include "model/model_data_type.hpp" #include "model/model_tensor.hpp" -#include "model_buffer_manager.hpp" #include "utils/utils_net.hpp" #if defined(ARK_CUDA) @@ -160,8 +161,10 @@ class Executor::Impl { void compile(const std::string &plan, int device_id, const std::string &name); - void launch(Stream stream, bool loop_mode); - void run(int iter); + void launch(Stream stream, bool loop_mode, + const std::unordered_map &placeholder_data); + void run(int iter, + const std::unordered_map &placeholder_data); void wait(int64_t max_spin_count); float stop(int64_t max_spin_count); void barrier(); @@ -183,6 +186,10 @@ class Executor::Impl { void init(const PlanJson &plan_json, int device_id, const std::string &name); void init_communicator(); + bool add_kernel_arg(size_t buf_id, bool is_external); + std::vector add_kernel_addr( + const std::unordered_map &placeholder_data); + std::map init_buffers(const Json &plan_json); std::map init_buffer_addrs( std::shared_ptr buffer, @@ -203,9 +210,7 @@ class Executor::Impl { bool is_recording_ = false; float elapsed_msec_ = -1; - std::vector external_buffers_; - std::vector external_args_; - std::map buffer_id_to_name_; + std::map> buffer_id_to_kernel_arg_; std::map buffer_id_to_offset_; std::map buffer_id_to_addr_; size_t total_bytes_; @@ -239,10 +244,8 @@ void Executor::Impl::init(const PlanJson &plan_json, int device_id, plan_json_ = plan_json; device_id_ = device_id; name_ = name; - - external_buffers_.clear(); - external_args_.clear(); - buffer_id_to_name_.clear(); + buffer_id_to_offset_.clear(); + buffer_id_to_kernel_arg_.clear(); total_bytes_ = 0; rank_ = plan_json_["Rank"].get(); @@ -283,8 +286,7 @@ void Executor::Impl::init(const PlanJson &plan_json, int device_id, } codegen_ = std::make_shared(plan_json_, buffer_id_to_offset_, - external_args_, - buffer_id_to_name_, name_); + buffer_id_to_kernel_arg_); flag_ = gpu_manager->malloc_host( sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); @@ -323,12 +325,76 @@ std::map Executor::Impl::init_buffer_addrs( if (!buffer_id_to_addr_.empty()) { buffer_id_to_addr = buffer_id_to_addr_; } - for (const auto &kv : buffer_id_to_offset) { - buffer_id_to_addr[kv.first] = buffer->ref(kv.second); + for (const auto &[id, offset] : buffer_id_to_offset) { + buffer_id_to_addr[id] = buffer->ref(offset); } return buffer_id_to_addr; } +bool Executor::Impl::add_kernel_arg(size_t buf_id, bool is_external) { + bool reused_buffer = + buffer_id_to_addr_.find(buf_id) != buffer_id_to_addr_.end(); + if (!is_external && !reused_buffer) { + return false; + } + auto &ext_buf_reg = ExternalBufferRegistry::get_instance(); + const std::string name = "extern_buf_" + std::to_string(buf_id); + if (reused_buffer) { + // The buffer is being reused from a previous plan + void *buf_addr = buffer_id_to_addr_[buf_id]; + buffer_id_to_kernel_arg_[buf_id] = std::make_pair(name, buf_addr); + } else { + // The buffer is external (can either be staged/non-staged) + buffer_id_to_kernel_arg_[buf_id] = + std::make_pair(name, ext_buf_reg.get(buf_id)); + } + + return true; +} + +std::vector Executor::Impl::add_kernel_addr( + 
const std::unordered_map &placeholder_data) { + std::unordered_map buffer_id_to_placeholder; + for (const auto &[tensor, ptr] : placeholder_data) { + buffer_id_to_placeholder[tensor.ref()->buffer()->id()] = ptr; + } + + auto &ext_buf_reg = ExternalBufferRegistry::get_instance(); + std::vector kernel_arg_addrs; + kernel_arg_addrs.reserve(buffer_id_to_kernel_arg_.size()); + + for (const auto &[buf_id, _] : buffer_id_to_kernel_arg_) { + void *buf_addr = nullptr; + // Check for reused tensor + if (auto it = buffer_id_to_addr_.find(buf_id); + it != buffer_id_to_addr_.end()) { + buf_addr = it->second; + } + // Check for external tensor (non-staged) + else if (void *ext_buf_addr = ext_buf_reg.get(buf_id); + ext_buf_addr != nullptr) { + buf_addr = ext_buf_addr; + } + // Check for external tensor (staged) + else if (auto it = buffer_id_to_placeholder.find(buf_id); + it != buffer_id_to_placeholder.end()) { + buf_addr = it->second; + } + if (buf_addr == nullptr) { + ERR(InvalidUsageError, "Buffer with id ", buf_id, + " did not receive initializing data."); + } + gpuPointerAttributes attr; + GLOG(gpuPointerGetAttributes(&attr, buf_addr)); + if (attr.device != device_id_) { + ERR(InvalidUsageError, "Data for buffer id ", buf_id, + " is on a different GPU: ", attr.device, " vs ", device_id_); + } + kernel_arg_addrs.push_back(buf_addr); + } + return kernel_arg_addrs; +} + std::map Executor::Impl::init_buffers(const Json &plan_json) { class BufferInfo { public: @@ -403,50 +469,30 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { std::map> remote_rank_to_send_tag_to_buffer_id; std::map> remote_rank_to_recv_tag_to_buffer_id; + auto &ext_buf_reg = ExternalBufferRegistry::get_instance(); + // TODO: improve memory planning size_t offset = 0; for (auto &kv : buffer_id_to_info) { auto &buf_info = kv.second; int r = buf_info->buffer->rank(); + const size_t buf_id = buf_info->buffer->id(); if (r != rank_ && r != -1) { // this is a remote buffer for (const auto &tag_info : buf_info->buffer->send_tags()) { remote_rank_to_send_tag_to_buffer_id[buf_info->buffer->rank()] - [tag_info.second] = - buf_info->buffer->id(); + [tag_info.second] = buf_id; } for (const auto &tag_info : buf_info->buffer->recv_tags()) { remote_rank_to_recv_tag_to_buffer_id[buf_info->buffer->rank()] - [tag_info.second] = - buf_info->buffer->id(); - } - continue; - } - if (buf_info->buffer->is_external()) { - if (buf_info->buffer->device_id() != device_id_) { - ERR(InvalidUsageError, - "PyTorch tensor and model execution are on different GPUs"); + [tag_info.second] = buf_id; } - external_buffers_.push_back(buf_info->buffer->external_data()); - const auto [it, inserted] = buffer_id_to_name_.try_emplace( - buf_info->buffer->id(), - "extern_buf_" + std::to_string(buf_info->buffer->id())); - external_args_.push_back(it->second); continue; } - // if we are adding a plan and come across a buffer from a previous - // plan, we utilize the buffer offset from the previous plan - if (buffer_id_to_offset_.find(buf_info->buffer->id()) != - buffer_id_to_offset_.end()) { - external_buffers_.push_back( - buffer_id_to_addr_[buf_info->buffer->id()]); - const std::string name = - "extern_buf_" + std::to_string(buf_info->buffer->id()); - external_args_.push_back(name); - buffer_id_to_name_[buf_info->buffer->id()] = name; + if (add_kernel_arg(buf_id, buf_info->buffer->is_external())) { continue; } else { - buffer_id_to_offset[buf_info->buffer->id()] = offset; + buffer_id_to_offset[buf_id] = offset; for (const auto &tag_info : 
buf_info->buffer->send_tags()) { remote_rank_to_send_tags_and_offsets[tag_info.first] .first.push_back(tag_info.second); @@ -536,8 +582,10 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 1); bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 2); for (int i = 0; i < len; ++i) { - if (!buffer_id_to_info[send_tag_to_buffer_id[tags[i]]] - ->buffer->is_external()) { + const size_t buf_id = + buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]->buffer->id(); + void *buf_data = ext_buf_reg.get(buf_id); + if (buf_data == nullptr) { buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] = offsets[i]; } @@ -556,8 +604,10 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 4); bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 5); for (int i = 0; i < len; ++i) { - if (!buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]] - ->buffer->is_external()) { + const size_t buf_id = + buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]->buffer->id(); + void *buf_data = ext_buf_reg.get(buf_id); + if (buf_data == nullptr) { buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] = offsets[i]; } @@ -703,7 +753,9 @@ void Executor::Impl::compile(const std::string &plan, int device_id, kernel_->compile(); } -void Executor::Impl::launch(Stream stream, bool loop_mode) { +void Executor::Impl::launch( + Stream stream, bool loop_mode, + const std::unordered_map &placeholder_data) { if ((kernel_ == nullptr) || !kernel_->is_compiled()) { ERR(InvalidUsageError, "Need to compile first before launch."); } @@ -720,15 +772,10 @@ void Executor::Impl::launch(Stream stream, bool loop_mode) { loop_mode_ = loop_mode; if (loop_mode_) { - // should we add an identifier to specify which plan the kernel executes - // i.e. ark_loop_kernel_2 for the second plan kernel_name_ = "ark_loop_kernel"; } else { kernel_name_ = "ark_kernel"; } - if (!name_.empty()) { - kernel_name_ += "_" + name_; - } auto get_global_rt = [&](const std::string &symbol) { return reinterpret_cast(kernel_->get_global(symbol)); @@ -787,8 +834,9 @@ void Executor::Impl::launch(Stream stream, bool loop_mode) { void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref(); void *flag_ptr = flag_->ref(); std::vector args = {&buf_ptr, &flag_ptr}; - for (auto &buffer : external_buffers_) { - args.push_back(&buffer); + auto addr_args = add_kernel_addr(placeholder_data); + for (auto &ptr : addr_args) { + args.push_back(&ptr); } kernel_->launch(kernel_name_, stream_raw_, args); } @@ -796,7 +844,8 @@ void Executor::Impl::launch(Stream stream, bool loop_mode) { is_launched_ = true; } -void Executor::Impl::run(int iter) { +void Executor::Impl::run( + int iter, const std::unordered_map &placeholder_data) { if (iter <= 0) return; if (loop_mode_) { while (atomicLoadRelaxed(flag_->ref()) > 0) { @@ -806,8 +855,9 @@ void Executor::Impl::run(int iter) { void *buf_ptr = (buffers_.empty()) ? 
nullptr : buffers_.back()->ref(); int i = 0; std::vector args = {&buf_ptr, reinterpret_cast(&i)}; - for (auto &buffer : external_buffers_) { - args.push_back(&buffer); + auto addr_arg = add_kernel_addr(placeholder_data); + for (auto &ptr : addr_arg) { + args.push_back(&ptr); } for (; i < iter; i++) { kernel_->launch(kernel_name_, stream_raw_, args); @@ -876,6 +926,11 @@ void Executor::Impl::barrier() { void *Executor::Impl::tensor_address(const Tensor &tensor) const { size_t buffer_id = tensor.ref()->buffer()->id(); + auto &ext_buf_reg = ExternalBufferRegistry::get_instance(); + void *ext_data = ext_buf_reg.get(buffer_id); + if (ext_data) { + return ext_data; + } if (buffer_id_to_addr_.find(buffer_id) == buffer_id_to_addr_.end()) { ERR(InvalidUsageError, "Tensor has an unknown buffer ID ", buffer_id, ". This is likely caused by accessing a tensor that is optimized " @@ -888,11 +943,6 @@ void *Executor::Impl::tensor_address(const Tensor &tensor) const { void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes, Stream stream, bool is_d2d) const { GLOG(gpuSetDevice(device_id_)); - if (tensor.ref()->buffer()->is_external()) { - ERR(InvalidUsageError, - "Reading data from a tensor preallocated by PyTorch is not " - "supported. Use PyTorch's native methods."); - } std::shared_ptr copy_stream; gpuStream copy_stream_raw; if (stream) { @@ -944,11 +994,6 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data, size_t bytes, Stream stream, bool is_d2d) const { GLOG(gpuSetDevice(device_id_)); - if (tensor.ref()->buffer()->is_external()) { - ERR(InvalidUsageError, - "Writing data to a tensor preallocated by PyTorch is not " - "supported. Use PyTorch's native methods."); - } std::shared_ptr copy_stream; gpuStream copy_stream_raw; if (stream) { @@ -1019,11 +1064,16 @@ void Executor::compile(const std::string &plan, int device_id, impl_->compile(plan, device_id, name); } -void Executor::launch(Stream stream, bool loop_mode) { - impl_->launch(stream, loop_mode); +void Executor::launch( + Stream stream, bool loop_mode, + const std::unordered_map &placeholder_data) { + impl_->launch(stream, loop_mode, placeholder_data); } -void Executor::run(int iter) { impl_->run(iter); } +void Executor::run(int iter, + const std::unordered_map &placeholder_data) { + impl_->run(iter, placeholder_data); +} void Executor::wait(int64_t max_spin_count) { impl_->wait(max_spin_count); } @@ -1033,10 +1083,7 @@ float Executor::stop(int64_t max_spin_count) { void Executor::barrier() { impl_->barrier(); } -void Executor::destroy() { - ModelBufferManager::get_instance().clear_buffers(); - impl_.reset(nullptr); -} +void Executor::destroy() { impl_.reset(nullptr); } bool Executor::destroyed() const { return impl_.get() == nullptr; } @@ -1070,8 +1117,10 @@ DefaultExecutor::DefaultExecutor( impl_->loop_mode_ = loop_mode; } -void DefaultExecutor::launch() { - Executor::launch(reinterpret_cast(impl_->stream_raw_), impl_->loop_mode_); +void DefaultExecutor::launch( + const std::unordered_map &placeholder_data) { + Executor::launch(reinterpret_cast(impl_->stream_raw_), + impl_->loop_mode_, placeholder_data); } } // namespace ark diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp index 084ce6383..fc44b4a58 100644 --- a/ark/api/tensor.cpp +++ b/ark/api/tensor.cpp @@ -9,18 +9,6 @@ namespace ark { -Tensor::Tensor(void* data_ptr, int32_t device_id, - const std::vector& shape, const DataType& dtype) { - size_t external_data_size = std::accumulate(shape.begin(), shape.end(), 1, - std::multiplies()) * - 
dtype.bytes(); - auto buffer = - std::make_shared(data_ptr, external_data_size, device_id); - auto tensor = std::make_shared( - dtype.ref(), buffer, Dims(shape), Dims(shape), Dims(), Dims()); - ref_ = tensor; -} - size_t Tensor::id() const { if (ref_) { return ref_->id(); diff --git a/ark/codegen.cpp b/ark/codegen.cpp index 2bd36d679..04c5887fc 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -4,16 +4,17 @@ #include "codegen.hpp" #include +#include #include "ark/data_type.hpp" #include "env.h" +#include "external_buffer_registry.hpp" #include "file_io.h" #include "logging.hpp" #include "model/model_buffer.hpp" #include "model/model_data_type.hpp" #include "model/model_op.hpp" #include "model/model_tensor.hpp" -#include "model_buffer_manager.hpp" #include "range.hpp" #include "utils/utils_math.hpp" @@ -55,8 +56,8 @@ class CodeGenerator::Impl { public: Impl(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::vector &external_args, - const std::map &buffer_id_to_name, + const std::map> + &buffer_id_to_kernel_arg, const std::string &name); ~Impl() = default; @@ -82,8 +83,7 @@ class CodeGenerator::Impl { friend class CodeGenerator; std::map buffer_id_to_offset_; - std::vector external_args_; - std::map buffer_id_to_name_; + std::map> buffer_id_to_kernel_arg_; std::string name_; int rank_; int world_size_; @@ -92,14 +92,13 @@ class CodeGenerator::Impl { std::string code_; }; -CodeGenerator::Impl::Impl( - const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::vector &external_args, - const std::map &buffer_id_to_name, - const std::string &name) +CodeGenerator::Impl::Impl(const PlanJson &plan, + const std::map &buffer_id_to_offset, + const std::map> + &buffer_id_to_kernel_arg, + const std::string &name) : buffer_id_to_offset_(buffer_id_to_offset), - external_args_(external_args), - buffer_id_to_name_(buffer_id_to_name), + buffer_id_to_kernel_arg_(buffer_id_to_kernel_arg), name_(name) { rank_ = plan.at("Rank"); world_size_ = plan.at("WorldSize"); @@ -192,9 +191,10 @@ CodeGenerator::Impl::Impl( // Generate the global arguments std::stringstream global_args_ss, function_args_ss, arg_types_ss; - for (const auto &arg : external_args_) { - global_args_ss << "void *" << arg << ", "; - function_args_ss << arg << ", "; + for (const auto &[buf_id, kernel_arg] : buffer_id_to_kernel_arg_) { + const auto &arg_name = kernel_arg.first; + global_args_ss << "void *" << arg_name << ", "; + function_args_ss << arg_name << ", "; arg_types_ss << "void *, "; } std::string global_args = global_args_ss.str(); @@ -219,7 +219,7 @@ CodeGenerator::Impl::Impl( {"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)}, {"@DEFINITIONS@", definitions_ss.str()}, {"@BODY@", body_ss.str()}, - {"@NAME@", (name_.empty() ? "" : "_" + name_)}, + {"@NAME@", (!name_.empty() ? 
"" : name_)}, {"@GLOBAL_ARGS@", global_args}, {"@FUNCTION_ARGS@", function_args}, {"@ARG_TYPES@", arg_types}, @@ -273,29 +273,28 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) { if (arg.type_name() == "TENSOR") { auto tns = arg.value(); size_t buffer_id = tns->buffer()->id(); - if (buffer_id_to_name_.find(buffer_id) == - buffer_id_to_name_.end()) { + auto it = buffer_id_to_kernel_arg_.find(buffer_id); + if (it == buffer_id_to_kernel_arg_.end()) { size_t buffer_offset = buffer_id_to_offset_.at(buffer_id); size_t offset = buffer_offset + ModelOffset(tns).value(); ss << "(" << tns->data_type()->type_str() << "*)&_buf[" << offset << "]"; } else { - ss << "(" << tns->data_type()->type_str() << "*)" - << buffer_id_to_name_.at(buffer_id); + const auto &name = it->second.first; + ss << "(" << tns->data_type()->type_str() << "*)" << name; } } else if (arg.type_name() == "OFFSET") { auto moff = arg.value(); size_t buffer_id = moff.buffer_id(); - if (buffer_id_to_name_.find(buffer_id) == - buffer_id_to_name_.end()) { + auto it = buffer_id_to_kernel_arg_.find(buffer_id); + if (it == buffer_id_to_kernel_arg_.end()) { size_t buffer_offset = buffer_id_to_offset_.at(buffer_id); size_t offset = buffer_offset + moff.value(); ss << offset; } else { - const std::string &buffer_name = - buffer_id_to_name_.at(buffer_id); + const auto &name = it->second.first; size_t offset = moff.value(); - ss << "(uint64_t)((char*)" << buffer_name << " + " << offset + ss << "(uint64_t)((char*)" << name << " + " << offset << ")"; } } else { @@ -372,8 +371,7 @@ std::string CodeGenerator::Impl::resource_group( n_slots = total_warps / num_warps_per_task; } if (n_slots == 0) { - ERR(PlanError, "not enough resources for task group: ", - tg.dump()); + ERR(PlanError, "not enough resources for task group: ", tg.dump()); } size_t task_b = *task_range.begin(); @@ -498,11 +496,11 @@ std::string CodeGenerator::Impl::sync_process_range(const Range &range, CodeGenerator::CodeGenerator( const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::vector &external_args, - const std::map &buffer_id_to_name, + const std::map> + &buffer_id_to_kernel_arg, const std::string &name) - : impl_(std::make_shared(plan, buffer_id_to_offset, external_args, - buffer_id_to_name, name)) {} + : impl_(std::make_shared(plan, buffer_id_to_offset, + buffer_id_to_kernel_arg, name)) {} std::string CodeGenerator::code() const { return impl_->code_; } diff --git a/ark/codegen.hpp b/ark/codegen.hpp index 8a4eed270..0fccc46e3 100644 --- a/ark/codegen.hpp +++ b/ark/codegen.hpp @@ -7,9 +7,9 @@ #include #include #include +#include #include "model/model_json.hpp" -#include "model_buffer_manager.hpp" namespace ark { @@ -17,8 +17,8 @@ class CodeGenerator { public: CodeGenerator(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::vector &external_args, - const std::map &buffer_id_to_name, + const std::map> + &buffer_id_to_kernel_arg, const std::string &name = "ark_kernel"); ~CodeGenerator() = default; diff --git a/ark/cpu_timer.cpp b/ark/cpu_timer.cpp index c740de5f3..129ba7bd2 100644 --- a/ark/cpu_timer.cpp +++ b/ark/cpu_timer.cpp @@ -16,20 +16,4 @@ double cpu_timer(void) { return (tspec.tv_nsec / 1.0e9) + tspec.tv_sec; } -// Sleep in second. -int cpu_timer_sleep(double sec) { - struct timespec tspec; - tspec.tv_sec = (time_t)sec; - tspec.tv_nsec = (long)((sec - tspec.tv_sec) * 1.0e9); - return nanosleep(&tspec, 0); -} - -// Sleep in nanosecond. 
-int cpu_ntimer_sleep(long nsec) { - struct timespec tspec; - tspec.tv_sec = 0; - tspec.tv_nsec = nsec; - return nanosleep(&tspec, 0); -} - } // namespace ark diff --git a/ark/cpu_timer.h b/ark/cpu_timer.h index 52bf63d92..eaac94061 100644 --- a/ark/cpu_timer.h +++ b/ark/cpu_timer.h @@ -8,10 +8,6 @@ namespace ark { // Measure current time in second. double cpu_timer(void); -// Sleep in second. -int cpu_timer_sleep(double sec); -// Sleep in nanosecond. -int cpu_ntimer_sleep(long nsec); } // namespace ark diff --git a/ark/external_buffer_registry.cpp b/ark/external_buffer_registry.cpp new file mode 100644 index 000000000..912050d0d --- /dev/null +++ b/ark/external_buffer_registry.cpp @@ -0,0 +1,29 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "external_buffer_registry.hpp" + +#include "logging.hpp" + +namespace ark { + +ExternalBufferRegistry &ExternalBufferRegistry::get_instance() { + static ExternalBufferRegistry instance; + return instance; +} + +void ExternalBufferRegistry::set(const size_t id, void *data) { + buffers_[id] = data; +} + +void *ExternalBufferRegistry::get(const size_t id) const { + auto it = buffers_.find(id); + if (it != buffers_.end()) { + return it->second; + } + return nullptr; +} + +void ExternalBufferRegistry::clear() { buffers_.clear(); } + +} // namespace ark diff --git a/ark/external_buffer_registry.hpp b/ark/external_buffer_registry.hpp new file mode 100644 index 000000000..ab199bafc --- /dev/null +++ b/ark/external_buffer_registry.hpp @@ -0,0 +1,31 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_EXTERNAL_BUFFER_REGISTRY_HPP_ +#define ARK_EXTERNAL_BUFFER_REGISTRY_HPP_ + +#include + +namespace ark { +// Manages externally allocated buffers (buffers corresponding to Tensors that +// are the output of a `placeholder` operation) outside of ARK's memory space. +class ExternalBufferRegistry { + public: + static ExternalBufferRegistry &get_instance(); + + void set(const size_t id, void *data); + + void *get(const size_t id) const; + + void clear(); + + private: + // Maps buffer IDs to pointers and sizes. 
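// Usage sketch for the registry (illustrative ID and pointer), and roughly
// how the previous ModelBufferManager API maps onto it:
//   register_buffer(id, data, size) -> set(id, data)   (size no longer kept)
//   get_buffer(id)                  -> get(id)
//   clear_buffers()                 -> clear()
//
//   #include "external_buffer_registry.hpp"
//
//   void sketch_registry(void *dev_ptr) {
//       auto &reg = ark::ExternalBufferRegistry::get_instance();
//       reg.set(42, dev_ptr);
//       void *found = reg.get(42);   // dev_ptr
//       void *missing = reg.get(7);  // nullptr for unknown IDs
//       (void)found;
//       (void)missing;
//       reg.clear();
//   }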
+ std::unordered_map buffers_; + ExternalBufferRegistry() {} + ExternalBufferRegistry(const ExternalBufferRegistry &) = delete; + ExternalBufferRegistry &operator=(const ExternalBufferRegistry &) = delete; +}; +} // namespace ark + +#endif // ARK_EXTERNAL_BUFFER_REGISTRY_HPP_ diff --git a/ark/gpu/gpu.hpp b/ark/gpu/gpu.hpp index 531d6c7ee..8ff3b2843 100644 --- a/ark/gpu/gpu.hpp +++ b/ark/gpu/gpu.hpp @@ -53,6 +53,8 @@ ARK_GPU_DEFINE_TYPE_ALIAS(gpuModule, CUmodule, hipModule_t); ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunction, CUfunction, hipFunction_t); ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunctionAttribute, CUfunction_attribute, hipFunction_attribute); +ARK_GPU_DEFINE_TYPE_ALIAS(gpuPointerAttributes, cudaPointerAttributes, + hipPointerAttributes); // runtime API ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuSuccess, cudaSuccess, hipSuccess); @@ -126,6 +128,8 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops, ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString, hipGetErrorString); ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError); +ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerGetAttributes, cudaPointerGetAttributes, + hipPointerGetAttributes); ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute, hipDeviceGetAttribute); ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize, diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index 8e6577cd2..fafc9066c 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -9,6 +9,7 @@ #include #include #include +#include #include namespace ark { @@ -45,10 +46,14 @@ class Executor { const std::string &name = "executor"); /// Launch the executor. This must be called after `compile()`. - void launch(Stream stream = nullptr, bool loop_mode = true); + void launch( + Stream stream = nullptr, bool loop_mode = true, + const std::unordered_map &placeholder_data = {}); /// Run the executor for `iter` iterations. - void run(int iter); + void run( + int iter, + const std::unordered_map &placeholder_data = {}); /// Wait for the previous run to finish. void wait(int64_t max_spin_count = -1); @@ -99,13 +104,15 @@ class Model; class DefaultExecutor : public Executor { public: - DefaultExecutor( - const Model &model, int device_id = -1, Stream stream = nullptr, - const std::vector &config_rules = {}, - const std::string &name = "DefaultExecutor", bool loop_mode = true); + DefaultExecutor(const Model &model, int device_id = -1, + Stream stream = nullptr, + const std::vector &config_rules = {}, + const std::string &name = "DefaultExecutor", + bool loop_mode = true); /// Launch the default executor. - void launch(); + void launch( + const std::unordered_map &placeholder_data = {}); }; } // namespace ark diff --git a/ark/include/ark/model.hpp b/ark/include/ark/model.hpp index 3c4f22e22..e1b1f462b 100644 --- a/ark/include/ark/model.hpp +++ b/ark/include/ark/model.hpp @@ -76,6 +76,37 @@ class Model : public ModelGraph { const Dims &padded_shape = {}, int rank = -1, const std::string &name = ""); + /// + /// Returns a tensor object associated with an external buffer. + /// + /// @param shape Shape of the tensor, where the data of interest is. + /// @param dtype Type of the tensor data. + /// @param strides Strides of each dimension of the tensor, which may be + /// different from the shape. @p strides can be considered as the actual + /// shape of the underlying data buffer. + /// @param offsets Offsets of the tensor. 
The data of interest starts at + /// @p offsets and ends at @p offsets + @p padded_shape. + /// @param padded_shape Padded shape of the tensor. Padding is used to + /// reserve extra space for the tensor when computation requires it. + /// Data on the padded region is allowed to be accessed by computation, + /// but it is not considered as the data of interest. The padded region is + /// initialized to zero only once when the Executor is launched. The padded + /// shape should be greater than or equal to the @p shape, and the + /// @p strides should be greater than or equal to the padded shape. If the + /// @p strides are not provided, they are set to the padded shape. If the + /// padded shape is not provided, it is set to the @p shape. + /// @param rank Rank of the tensor. -1 means the rank of this model. + /// @param name Name of the tensor. + /// @param data Address of data to pass through placeholder. If provided, + /// this buffer is registered with the ExternalBufferRegistry and associated + /// with the tensor. + /// @return Pointer to a tensor object that references the external buffer. + /// + Tensor placeholder(const Dims &shape, const DataType &data_type, + const Dims &strides = {}, const Dims &offsets = {}, + const Dims &padded_shape = {}, int rank = -1, + void *data = nullptr, const std::string &name = ""); + Tensor refer(Tensor input, const Dims &shape = {}, const Dims &strides = {}, const Dims &offsets = {}, const Dims &padded_shape = {}, const std::string &name = ""); @@ -254,7 +285,6 @@ class Model : public ModelGraph { Tensor local_all_reduce(Tensor input, int gpu_id, int gpu_num, const std::string &name = ""); - }; } // namespace ark diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp index 5e463f99d..c2d9dbe94 100644 --- a/ark/include/ark/tensor.hpp +++ b/ark/include/ark/tensor.hpp @@ -31,8 +31,6 @@ class Tensor { Tensor(ModelTensorRef ref) : ref_(ref) {} Tensor(const Tensor &other) = default; Tensor &operator=(const Tensor &other) = default; - Tensor(void *data_ptr, int32_t device_id, const std::vector &shape, - const DataType &dtype); bool operator==(const Tensor &other) const { return ref_ == other.ref_; } bool operator!=(const Tensor &other) const { return ref_ != other.ref_; } @@ -62,4 +60,13 @@ std::ostream &operator<<(std::ostream &os, const Tensor &tensor); } // namespace ark +namespace std { +template <> +struct hash { + size_t operator()(const ark::Tensor &t) const noexcept { + return t.id(); + } +}; +} // namespace std + #endif // ARK_TENSOR_HPP diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp index 5ce255ce5..5e2409537 100644 --- a/ark/model/model_buffer.cpp +++ b/ark/model/model_buffer.cpp @@ -3,19 +3,22 @@ #include "model_buffer.hpp" +#include "external_buffer_registry.hpp" #include "logging.hpp" -#include "model_buffer_manager.hpp" namespace ark { size_t ModelBuffer::curr_id = 0; -ModelBuffer::ModelBuffer(int rank) : rank_(rank) { id_ = curr_id++; } +ModelBuffer::ModelBuffer(int rank, bool is_external) + : rank_(rank), is_external_(is_external) { + id_ = curr_id++; +} -ModelBuffer::ModelBuffer(size_t id, int rank, +ModelBuffer::ModelBuffer(size_t id, int rank, bool is_external, const std::vector &send_tags, const std::vector &recv_tags) - : id_(id), rank_(rank) { + : id_(id), rank_(rank), is_external_(is_external) { for (const auto &info : send_tags) { send_tags_.insert(info); } @@ -24,23 +27,6 @@ ModelBuffer::ModelBuffer(size_t id, int rank, } } -ModelBuffer::ModelBuffer(void *data, size_t size, int32_t device_id) - 
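// Sketch of the immediate-binding form documented above, assuming `d_ext`
// already holds 10x1 floats on the current device:
//
//   ark::Tensor t =
//       model.placeholder({10, 1}, ark::FP32, {}, {}, {}, -1, d_ext);
//
// Passing `data = nullptr` instead defers binding to
// Executor::launch(..., placeholder_data).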
: rank_(-1), - external_data_(data), - external_data_size_(size), - device_id_(device_id), - is_external_(true) { - id_ = curr_id++; -} - -ModelBuffer::ModelBuffer(size_t id, void *data, size_t size, int32_t device_id) - : id_(id), - rank_(-1), - external_data_(data), - external_data_size_(size), - device_id_(device_id), - is_external_(true) {} - void ModelBuffer::tag_send(int remote_rank, int tag) { send_tags_.insert(TagInfo{remote_rank, tag}); } @@ -61,16 +47,9 @@ Json ModelBuffer::serialize() const { for (const auto &info : recv_tags_) { recv_tags.push_back({info.first, info.second}); } + j["IsExternal"] = is_external_; j["SendTags"] = send_tags; j["RecvTags"] = recv_tags; - j["IsExternal"] = is_external_; - if (is_external_) { - ModelBufferManager::get_instance().register_buffer(id_, external_data_, - external_data_size_); - j["ExternalDataSize"] = external_data_size_; - j["DeviceId"] = device_id_; - } - // external_data_ptr_ is not included in JSON return j; } @@ -88,28 +67,9 @@ std::shared_ptr ModelBuffer::deserialize(const Json &serialized) { ERR(ModelError, "ModelBuffer deserialization failed: missing IsExternal"); } - if (serialized["IsExternal"]) { - if (!serialized.contains("ExternalDataSize")) { - ERR(ModelError, - "ModelBuffer deserialization failed: missing ExternalDataSize"); - } else if (!serialized.contains("DeviceId")) { - ERR(ModelError, - "ModelBuffer deserialization failed: missing DeviceId"); - } - void *data_ptr = - ModelBufferManager::get_instance().get_buffer(serialized["Id"]); - if (!data_ptr) { - ERR(ModelError, - "ModelBuffer deserialization failed: external buffer not found " - "in BufferManager"); - } - return std::make_shared(serialized["Id"], data_ptr, - serialized["ExternalDataSize"], - serialized["DeviceId"]); - } - return std::make_shared(serialized["Id"], serialized["Rank"], - serialized["SendTags"], - serialized["RecvTags"]); + return std::make_shared( + serialized["Id"], serialized["Rank"], serialized["IsExternal"], + serialized["SendTags"], serialized["RecvTags"]); } } // namespace ark diff --git a/ark/model/model_buffer.hpp b/ark/model/model_buffer.hpp index e7f1045b2..8b66356b1 100644 --- a/ark/model/model_buffer.hpp +++ b/ark/model/model_buffer.hpp @@ -17,19 +17,18 @@ class ModelBuffer { // (remote_rank, tag) using TagInfo = std::pair; - ModelBuffer(int rank = -1); + ModelBuffer(int rank = -1, bool is_external = false); - ModelBuffer(size_t id, int rank, const std::vector &send_tags, + ModelBuffer(size_t id, int rank, bool is_external, + const std::vector &send_tags, const std::vector &recv_tags); - // externally managed buffer - ModelBuffer(void *data, size_t size, int32_t device_id); - ModelBuffer(size_t id, void *data, size_t size, int32_t device_id); - size_t id() const { return id_; } int rank() const { return rank_; } + bool is_external() const { return is_external_; } + const std::set &send_tags() const { return send_tags_; } const std::set &recv_tags() const { return recv_tags_; } @@ -48,22 +47,13 @@ class ModelBuffer { static std::shared_ptr deserialize(const Json &serialized); - // external buffer management - size_t external_data_size() const { return external_data_size_; } - void *external_data() const { return external_data_; } - int32_t device_id() const { return device_id_; } - bool is_external() const { return is_external_; } - private: static size_t curr_id; size_t id_; int rank_; + bool is_external_; std::set send_tags_; std::set recv_tags_; - void *external_data_ = nullptr; - size_t external_data_size_ = 0; - int32_t device_id_; 
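// After this change a serialized buffer carries only identity and tag
// metadata, e.g. (illustrative values):
//
//   {"Id": 3, "Rank": -1, "IsExternal": true, "SendTags": [], "RecvTags": []}
//
// The external pointer itself is never serialized; it is re-resolved through
// the ExternalBufferRegistry (or launch-time placeholder_data) instead.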
- bool is_external_ = false; }; } // namespace ark diff --git a/ark/model/model_op.cpp b/ark/model/model_op.cpp index 5db8576e8..8f222b75d 100644 --- a/ark/model/model_op.cpp +++ b/ark/model/model_op.cpp @@ -16,6 +16,7 @@ #include "ops/ops_math.hpp" #include "ops/ops_matmul.hpp" #include "ops/ops_noop.hpp" +#include "ops/ops_placeholder.hpp" #include "ops/ops_reduce.hpp" #include "ops/ops_refer.hpp" #include "ops/ops_reshape.hpp" @@ -78,6 +79,7 @@ const ModelOpType ModelOpT::from_name(const std::string &type_name) { MODEL_OP_TYPE_REGISTER(Sqrt); MODEL_OP_TYPE_REGISTER(Sub); MODEL_OP_TYPE_REGISTER(Tensor); + MODEL_OP_TYPE_REGISTER(Placeholder); MODEL_OP_TYPE_REGISTER(Transpose); MODEL_OP_TYPE_REGISTER(SendPacket); MODEL_OP_TYPE_REGISTER(RecvPacket); diff --git a/ark/model_buffer_manager.hpp b/ark/model_buffer_manager.hpp deleted file mode 100644 index 4baaec7fe..000000000 --- a/ark/model_buffer_manager.hpp +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_MODEL_BUFFER_MANAGER_HPP_ -#define ARK_MODEL_BUFFER_MANAGER_HPP_ - -#include -#include - -namespace ark { -// Manages externally allocated buffers not in the ARK memory space. -class ModelBufferManager { - public: - static ModelBufferManager& get_instance() { - static ModelBufferManager instance; - return instance; - } - - void register_buffer(size_t id, void* data, size_t size) { - buffers_[id] = std::make_tuple(data, size); - } - - void* get_buffer(size_t id) { - auto it = buffers_.find(id); - if (it != buffers_.end()) { - return std::get<0>(it->second); - } - return nullptr; - } - - size_t get_buffer_size(size_t id) { - auto it = buffers_.find(id); - if (it != buffers_.end()) { - return std::get<1>(it->second); - } - return 0; - } - - const std::unordered_map>& get_buffers() - const { - return buffers_; - } - - void clear_buffers() { buffers_.clear(); } - - bool is_empty() const { return buffers_.empty(); } - - private: - // Maps buffer IDs to pointers and sizes. - std::unordered_map> buffers_; - ModelBufferManager() {} - ModelBufferManager(const ModelBufferManager&) = delete; - ModelBufferManager& operator=(const ModelBufferManager&) = delete; -}; -} // namespace ark - -#endif // ARK_MODEL_BUFFER_MANAGER_HPP_ diff --git a/ark/ops/ops_placeholder.cpp b/ark/ops/ops_placeholder.cpp new file mode 100644 index 000000000..73c1c1b25 --- /dev/null +++ b/ark/ops/ops_placeholder.cpp @@ -0,0 +1,49 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include "ops_placeholder.hpp" + +#include "external_buffer_registry.hpp" +#include "logging.hpp" +#include "ops_common.hpp" + +namespace ark { + +ModelOpPlaceholder::ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape, + ModelDataType data_type, + const Dims &strides, const Dims &offsets, + const Dims &padded_shape, void *data) + : ModelOp("Placeholder", true) { + if (!buffer) { + buffer = std::make_shared(-1, true); + } + + ExternalBufferRegistry::get_instance().set(buffer->id(), data); + + ModelTensorRef tensor = std::make_shared( + data_type, buffer, shape, strides, offsets, padded_shape); + + result_tensors_.emplace_back(tensor); + + verify(); +} + +Tensor Model::placeholder(const Dims &shape, const DataType &data_type, + const Dims &strides, const Dims &offsets, + const Dims &padded_shape, int rank, void *data, + const std::string &name) { + if (rank != -1) { + if (rank == this->rank()) { + rank = -1; + } else if (rank < 0 || rank >= this->world_size()) { + ERR(ModelError, "Invalid rank %d", rank); + } + } + return impl_ + ->create_op( + name, std::make_shared(rank, true), shape, + data_type.ref(), strides, offsets, padded_shape, data) + ->result_tensors()[0]; +} + +} // namespace ark diff --git a/ark/ops/ops_placeholder.hpp b/ark/ops/ops_placeholder.hpp new file mode 100644 index 000000000..14ae53144 --- /dev/null +++ b/ark/ops/ops_placeholder.hpp @@ -0,0 +1,23 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_OPS_PLACEHOLDER_HPP_ +#define ARK_OPS_PLACEHOLDER_HPP_ + +#include "ark/model.hpp" +#include "model/model_op.hpp" + +namespace ark { + +class ModelOpPlaceholder : public ModelOp { + public: + ModelOpPlaceholder() = default; + ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape, + ModelDataType data_type, const Dims &strides, + const Dims &offsets, const Dims &padded_shape, + void *data = nullptr); +}; + +} // namespace ark + +#endif // ARK_OPS_PLACEHOLDER_HPP_ diff --git a/ark/ops/ops_placeholder_test.cpp b/ark/ops/ops_placeholder_test.cpp new file mode 100644 index 000000000..e91629fc8 --- /dev/null +++ b/ark/ops/ops_placeholder_test.cpp @@ -0,0 +1,103 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
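// The two tests below exercise both binding modes: a device pointer supplied
// when the placeholder is created, and a pointer supplied only later through
// launch-time placeholder data.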
+ +#include "ark/executor.hpp" +#include "gpu/gpu.hpp" +#include "logging.hpp" +#include "model/model_op.hpp" +#include "ops_test_common.hpp" + +ark::unittest::State test_ops_placeholder() { + ark::Model model; + ark::Dims shape{10, 1}; + + // Allocate GPU memory for the external buffer + float *d_ext_buffer = nullptr; + UNITTEST_EQ(ark::gpuMalloc(&d_ext_buffer, shape.nelems() * sizeof(float)), + ark::gpuSuccess); + + // Initialize GPU Memory + std::vector h_ext_buffer(shape.nelems()); + std::iota(h_ext_buffer.begin(), h_ext_buffer.end(), 1.0f); + UNITTEST_EQ(ark::gpuMemcpy(d_ext_buffer, h_ext_buffer.data(), + shape.nelems() * sizeof(float), + ark::gpuMemcpyHostToDevice), + ark::gpuSuccess); + + // Associate the initialized device buffer with a tensor produced from a + // placeholder operation + ark::Tensor tns = + model.placeholder(shape, ark::FP32, {}, {}, {}, -1, d_ext_buffer); + + ark::Tensor res = model.add(tns, 1.0); + + ark::DefaultExecutor exe(model); + + exe.launch(); + exe.run(1); + exe.stop(); + + UNITTEST_EQ(exe.tensor_address(tns), d_ext_buffer); + + // Copy tensor data from GPU to CPU + std::vector h_res(shape.nelems(), 0.0f); + exe.tensor_read(res, h_res); + + for (auto i = 0; i < shape.nelems(); ++i) { + UNITTEST_EQ(h_res[i], i + 2); + } + + UNITTEST_EQ(ark::gpuFree(d_ext_buffer), ark::gpuSuccess); + + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_placeholder_delayed_binding() { + ark::Model model; + ark::Dims shape{10, 1}; + + float *d_ext_buffer = nullptr; + UNITTEST_EQ(ark::gpuMalloc(&d_ext_buffer, shape.nelems() * sizeof(float)), + ark::gpuSuccess); + + std::vector h_ext_buffer(shape.nelems()); + std::iota(h_ext_buffer.begin(), h_ext_buffer.end(), 1.0f); + UNITTEST_EQ(ark::gpuMemcpy(d_ext_buffer, h_ext_buffer.data(), + shape.nelems() * sizeof(float), + ark::gpuMemcpyHostToDevice), + ark::gpuSuccess); + + // Create a placeholder tensor without binding the buffer yet + ark::Tensor tns = + model.placeholder(shape, ark::FP32, {}, {}, {}, -1, nullptr); + + ark::Tensor res = model.add(tns, 1.0); + + ark::DefaultExecutor exe(model); + + // Delay the binding by providing the tensor-to-address mapping at launch + std::unordered_map tensor_bindings; + tensor_bindings[tns] = reinterpret_cast(d_ext_buffer); + + exe.launch(tensor_bindings); + exe.run(1); + exe.stop(); + + // Copy tensor data from GPU to CPU + std::vector h_res(shape.nelems(), 0.0f); + exe.tensor_read(res, h_res); + + for (auto i = 0; i < shape.nelems(); ++i) { + UNITTEST_EQ(h_res[i], i + 2); + } + UNITTEST_EQ(ark::gpuFree(d_ext_buffer), ark::gpuSuccess); + + return ark::unittest::SUCCESS; +} + +int main() { + ark::init(); + UNITTEST(test_ops_placeholder); + UNITTEST(test_placeholder_delayed_binding); + return ark::unittest::SUCCESS; +} diff --git a/python/ark/data_type.py b/python/ark/data_type.py index 8ab982106..25456be54 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -2,12 +2,9 @@ # Licensed under the MIT license. import numpy +from .torch import torch from . import _ark_core -try: - import torch -except ImportError: - from . import torch_mock as torch _REGISTRY_DATA_TYPE = { "fp32": {"np": numpy.float32, "torch": torch.float32}, diff --git a/python/ark/model.py b/python/ark/model.py index 87af88f49..2a977b8f3 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -31,6 +31,13 @@ def get_world_size(): """ return _ModelState.world_size + @staticmethod + def get_device_id(): + """ + Get the device id. 
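# Sketch of the new device-id plumbing (hypothetical two-GPU setup):
#
#   ark.Model.set_device_id(1)   # build and launch on GPU 1
#   rt = ark.Runtime()
#   rt.launch()                  # device_id defaults to Model.get_device_id()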
+ """ + return _ModelState.device_id + @staticmethod def set_rank(rank: int): """ @@ -45,6 +52,15 @@ def set_world_size(world_size: int): """ _ModelState.world_size = world_size + @staticmethod + def set_device_id(device_id: int): + """ + Set the device id. + """ + if device_id < 0: + raise ValueError("device_id must be non-negative") + _ModelState.device_id = device_id + @staticmethod def reset(): """ @@ -81,3 +97,4 @@ class _ModelState: model: Model = None rank: int = 0 world_size: int = 1 + device_id: int = 0 diff --git a/python/ark/module.py b/python/ark/module.py index 49d2ddf00..b5744f100 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -4,19 +4,12 @@ import logging import numpy as np from typing import Any, Dict, Union -from .tensor import Tensor, Parameter +from .tensor import Parameter +from .torch import torch, _no_torch from .runtime import Runtime -from .init import init from .model import Model - -try: - import torch - - _no_torch = False -except ImportError: - from . import torch_mock as torch - - _no_torch = True +from .data_type import DataType +from .ops import placeholder class Module: @@ -43,7 +36,10 @@ def __setattr__(self, __name: str, __value: Any) -> None: elif isinstance(__value, Parameter): self.register_parameter(__name, __value) elif not _no_torch and isinstance(__value, torch.nn.Parameter): - __value = Parameter(__value) + shape, dtype = list(__value.shape), DataType.from_torch( + __value.dtype + ) + __value = Parameter(placeholder(shape, dtype, data=__value), True) self.register_parameter(__name, __value) super().__setattr__(__name, __value) @@ -151,14 +147,16 @@ def forward(ctx, ark_module, *args, **kwargs): input_requires_grad = 0 for arg in args: if isinstance(arg, torch.Tensor): - input_args.append(Tensor.from_torch(arg)) + shape, dtype = list(arg.shape), DataType.from_torch(arg.dtype) + input_args.append(placeholder(shape, dtype, data=arg)) if arg.requires_grad: input_requires_grad += 1 else: input_args.append(arg) for k, v in kwargs.items(): if isinstance(v, torch.Tensor): - input_kwargs[k] = Tensor.from_torch(v) + shape, dtype = list(arg.shape), DataType.from_torch(arg.dtype) + input_kwargs[k] = placeholder(shape, dtype, data=v) if v.requires_grad: input_requires_grad += 1 else: @@ -180,7 +178,12 @@ def backward(ctx, *grad_outputs): PyTorch parameters. """ Model.reset() - ark_grad_outputs = [Tensor.from_torch(grad) for grad in grad_outputs] + # i think we should support placeholder initialization + # with just pytorch tensor + ark_grad_outputs = [] + for grad in grad_outputs: + shape, dtype = list(grad.shape), DataType.from_torch(grad.dtype) + ark_grad_outputs.append(placeholder(shape, dtype, data=grad)) grads = ctx.ark_module.backward(*ark_grad_outputs) grad_inputs, grad_weights = ( grads[: ctx.num_inp_grad], diff --git a/python/ark/ops.py b/python/ark/ops.py index f8b75a70b..50f800b10 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -1,9 +1,10 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
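# Sketch of what the __setattr__ hook above now does with a torch parameter
# (assumes torch with CUDA; the module name is illustrative):

import torch
import ark


class Scale(ark.Module):
    def __init__(self, weight: torch.nn.Parameter):
        super().__init__()
        # Wrapped as Parameter(placeholder(shape, dtype, data=weight), True):
        # ARK reads the parameter's existing device memory, no copy is made.
        self.weight = weight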
-from typing import List, Iterable, Union +from typing import List, Iterable, Union, Optional -from .tensor import Dims, Tensor, Parameter, NullTensor +from .tensor import Dims, Tensor, Parameter, NullTensor, _cpp_tensor +from .torch import torch, _no_torch from .data_type import DataType, fp32 from .model import Model @@ -12,42 +13,6 @@ def _is_list_or_tuple(obj): return isinstance(obj, list) or isinstance(obj, tuple) -def _tensor( - shape: Iterable[int], - dtype: DataType = fp32, - strides: Iterable[int] = [], - offsets: Iterable[int] = [], - padded_shape: Iterable[int] = [], - rank: int = -1, - name: str = "", -) -> Tensor: - if not _is_list_or_tuple(shape): - raise ValueError("shape should be a list or tuple of integers") - if not _is_list_or_tuple(strides): - raise ValueError("strides should be a list or tuple of integers") - if not _is_list_or_tuple(offsets): - raise ValueError("offsets should be a list or tuple of integers") - if not _is_list_or_tuple(padded_shape): - raise ValueError("padded_shape should be a list or tuple of integers") - # only support tensors with up to 4 dimensions - if ( - len(shape) > 4 - or len(strides) > 4 - or len(offsets) > 4 - or len(padded_shape) > 4 - ): - raise ValueError("Only support tensors with up to 4 dimensions") - return Model.get_model().tensor( - Dims(shape), - dtype.ctype(), - Dims(strides), - Dims(offsets), - Dims(padded_shape), - rank, - name, - ) - - def add( input: Union[Tensor, float], other: Union[Tensor, float], @@ -258,6 +223,35 @@ def noop(input: Tensor, name: str = "noop"): Model.get_model().noop(input._tensor, name) +def placeholder( + shape: Iterable[int], + dtype: DataType = fp32, + strides: Iterable[int] = [], + offsets: Iterable[int] = [], + padded_shape: Iterable[int] = [], + rank: int = -1, + data: Union[int, torch.Tensor] = 0, + name: str = "placeholder", +) -> Tensor: + """ """ + if not _no_torch and isinstance(data, torch.Tensor): + # Should we support initializing shape dtype stride offset and padded_shape + # just by passing in a torch.Tensor? + data = data.data_ptr() + return Tensor( + Model.get_model().placeholder( + Dims(shape), + dtype.ctype(), + Dims(strides), + Dims(offsets), + Dims(padded_shape), + rank, + data, + name, + ) + ) + + def reduce_max( input: Tensor, axis: int, @@ -488,7 +482,9 @@ def tensor( tensor = ark.tensor([1, 2], dtype=ark.fp16) """ return Tensor( - _tensor(shape, dtype, strides, offsets, padded_shape, rank, name) + _cpp_tensor( + shape, dtype, strides, offsets, padded_shape, rank, None, name + ) ) @@ -554,7 +550,7 @@ def parameter( Construct a parameter with given shape and data type. """ return Parameter( - _tensor(shape, dtype, strides, offsets, padded_shape, name) + _cpp_tensor(shape, dtype, strides, offsets, padded_shape, None, name) ) @@ -630,6 +626,7 @@ def all_reduce( __all__ = [ "tensor", + "placeholder", "parameter", "reshape", "identity", diff --git a/python/ark/runtime.py b/python/ark/runtime.py index fa953a873..6f20516a8 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -5,7 +5,10 @@ from enum import Enum from ._ark_core import _Executor +from .torch import torch from .planner import Planner, Plan +from .model import Model +from typing import Dict class _RuntimeState: @@ -21,6 +24,8 @@ class Runtime: Convenience class for running a model. """ + _loop_mode: bool = True + class State(Enum): """ Runtime states. 
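# Usage sketch for the placeholder wrapper above (assumes torch with a CUDA
# device available):

import torch
import ark

ark.init()
x = torch.randn(4, 4, device="cuda:0")
t = ark.placeholder([4, 4], ark.DataType.from_torch(x.dtype), data=x)
y = ark.add(t, t)  # t behaves like any other ARK tensor from here on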
@@ -70,16 +75,19 @@ def running(self) -> bool: def launch( self, plan: Plan = None, - device_id: int = 0, + device_id: int = -1, stream: int = 0, loop_mode: bool = True, + tensor_mappings: Dict = {}, ): """ Create an executor and schedule the ARK model. The scheduler will generate the CUDA kernels. The GPU context and the connection between GPUs will be initialized. The executor will compile the cuda kernels and launch the ARK runtime. """ - if device_id < 0: + if device_id == -1: + device_id = Model.get_device_id() + elif device_id < 0: logging.error(f"Invalid device_id: {device_id}") raise ValueError(f"Invalid device_id: {device_id}") plan = Planner(device_id).plan() if plan is None else plan @@ -87,7 +95,13 @@ def launch( if self.launched(): # Stop the current running model self.stop() - + for ark_tensor in list(tensor_mappings.keys()): + torch_tensor = tensor_mappings[ark_tensor] + if not isinstance(torch_tensor, torch.Tensor): + raise ValueError("Must bind PyTorch tensor") + internal_ark_tensor = ark_tensor._tensor + tensor_mappings[internal_ark_tensor] = torch_tensor.data_ptr() + del tensor_mappings[ark_tensor] # Recompile if the previous launch was not compiled with the same info # or if this is the first launch if ( @@ -95,19 +109,32 @@ def launch( or device_id != self.executor.device_id() ): self.executor.compile(plan_str, device_id) - - self.executor.launch(stream, loop_mode) + self.executor.launch(stream, loop_mode, tensor_mappings) self.state = Runtime.State.LaunchedNotRunning + Runtime._loop_mode = loop_mode - def run(self, iter=1, non_blocking=False): + def run(self, iter=1, non_blocking=False, tensor_mappings={}): """ Run the ARK program for iter iterations and wait for the kernel to finish. """ + if Runtime._loop_mode and tensor_mappings: + raise ValueError( + "`loop_mode` argument when calling `runtime.launch` " + "must be set to false in order to pass non-empty " + "tensor mappings in `runtime.run`." + ) if self.state != Runtime.State.LaunchedNotRunning: logging.error(f"ARK runtime is not launched") raise RuntimeError(f"ARK runtime is not launched") self.state = Runtime.State.Running - self.executor.run(iter) + for ark_tensor in list(tensor_mappings.keys()): + torch_tensor = tensor_mappings[ark_tensor] + if not isinstance(torch_tensor, torch.Tensor): + raise ValueError("Must bind PyTorch tensor") + internal_ark_tensor = ark_tensor._tensor + tensor_mappings[internal_ark_tensor] = torch_tensor.data_ptr() + del tensor_mappings[ark_tensor] + self.executor.run(iter, tensor_mappings) if not non_blocking: self.wait() diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 45a54d169..5fa361bef 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -2,21 +2,14 @@ # Licensed under the MIT license. import numpy as np -from typing import Callable, List, Union, Type +from typing import Callable, Iterable, List, Union, Type from ._ark_core import _Dims, _Tensor, _NullTensor -from .data_type import DataType +from .torch import torch, _no_torch +from .data_type import DataType, fp32 from .runtime import Runtime from .model import Model -try: - import torch - - _no_torch = False -except ImportError: - from . 
import torch_mock as torch - - _no_torch = True NullTensor = _NullTensor @@ -46,6 +39,32 @@ def __init__( self.initializer: Initializer = initializer self.requires_grad = requires_grad + def __hash__(self): + return self._tensor.id() + + def __eq__(self, other): + if not isinstance(other, Tensor): + return False + return self._tensor.id() == other._tensor.id() + + @classmethod + def __torch_function__(cls, func, types, args=(), kwargs=None): + if kwargs is None: + kwargs = {} + new_args = [] + for arg in args: + if isinstance(arg, Tensor): + new_args.append(Tensor.to_torch(arg)) + else: + new_args.append(arg) + new_kwargs = {} + for key, value in kwargs.items(): + if isinstance(value, Tensor): + new_kwargs[key] = Tensor.to_torch(value) + else: + new_kwargs[key] = value + return func(*new_args, **new_kwargs) + def shape(self) -> List[int]: """ Returns the shape of the tensor. @@ -137,7 +156,8 @@ def from_dlpack(ext_tensor) -> "Tensor": """ Copies the tensor from a DLPack tensor to the device. """ - return Tensor(_Tensor(ext_tensor)) + # return Tensor(_Tensor(ext_tensor)) + raise NotImplementedError("from_dlpack is not implemented yet") def to_torch(self) -> torch.Tensor: """ @@ -162,7 +182,14 @@ def from_torch(tensor: torch.Tensor) -> "Tensor": raise ValueError("Torch tensor must be contiguous.") elif tensor.device.type == "cpu": raise ValueError("Torch tensor must be on a device.") - ark_tensor = Tensor.from_dlpack(torch.utils.dlpack.to_dlpack(tensor)) + # TODO: support strides and offsets + ark_tensor = Tensor( + _cpp_tensor( + shape=list(tensor.shape), + dtype=DataType.from_torch(tensor.dtype), + data=tensor.data_ptr(), + ) + ) # Share ownership of the memory with the torch tensor ark_tensor.__torch_buffer__ = tensor return ark_tensor @@ -216,33 +243,36 @@ def initialize(self) -> "Tensor": return self -class Parameter(Tensor, torch.nn.Parameter): +class Parameter(Tensor): """ A tensor as a parameter. """ def __init__( self, - tensor: Union[_Tensor, "torch.nn.Parameter"], + tensor: _Tensor, + from_torch: bool, ): """ Initializes a new instance of the Parameter class. + Args: + _tensor (_ark_core._Tensor): The underlying _Tensor object. 
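# Sketch of the interop that __torch_function__ above enables (requires a
# launched runtime so the tensor has device memory to view):

import torch
import ark

ark.init()
t = ark.tensor([2, 2], ark.fp16)
y = ark.add(t, t)
rt = ark.Runtime()
rt.launch()
doubled = torch.mul(t, 2)  # t is converted through t.to_torch()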
+ from_torch: Indicates if the Parameter is tied to a torch.nn.Paramter """ - if not _no_torch and isinstance(tensor, torch.nn.Parameter): - ark_tensor = Tensor.from_torch(tensor) - core_tensor = ark_tensor._tensor + if not _no_torch and from_torch: + _tensor = tensor._tensor self.torch_param = tensor self.staged_tensor = None Tensor.__init__( self, - core_tensor, + _tensor, requires_grad=tensor.requires_grad, ) elif isinstance(tensor, _Tensor): - core_tensor = tensor + _tensor = tensor self.torch_param = None self.staged_tensor = None - Tensor.__init__(self, core_tensor, requires_grad=False) + Tensor.__init__(self, _tensor, requires_grad=False) else: raise TypeError( "tensor must be an ARK tensor or a torch.nn.Parameter" @@ -263,3 +293,57 @@ def update_gradient(self, ark_tensor: Tensor): if ark_tensor is None or not isinstance(ark_tensor, Tensor): raise ValueError("cannot use non-ARK tensor to update ARK gradient") self.staged_tensor = ark_tensor + + +def _is_list_or_tuple(obj): + return isinstance(obj, list) or isinstance(obj, tuple) + + +def _cpp_tensor( + shape: Iterable[int], + dtype: DataType = fp32, + strides: Iterable[int] = [], + offsets: Iterable[int] = [], + padded_shape: Iterable[int] = [], + rank: int = -1, + data: int = None, + name: str = "", +) -> Tensor: + if not _is_list_or_tuple(shape): + raise ValueError("shape should be a list or tuple of integers") + if not _is_list_or_tuple(strides): + raise ValueError("strides should be a list or tuple of integers") + if not _is_list_or_tuple(offsets): + raise ValueError("offsets should be a list or tuple of integers") + if not _is_list_or_tuple(padded_shape): + raise ValueError("padded_shape should be a list or tuple of integers") + # only support tensors with up to 4 dimensions + if ( + len(shape) > 4 + or len(strides) > 4 + or len(offsets) > 4 + or len(padded_shape) > 4 + ): + raise ValueError("Only support tensors with up to 4 dimensions") + if data is not None: + cpp_tensor = Model.get_model().placeholder( + Dims(shape), + dtype.ctype(), + Dims(strides), + Dims(offsets), + Dims(padded_shape), + rank, + data, + name, + ) + else: + cpp_tensor = Model.get_model().tensor( + Dims(shape), + dtype.ctype(), + Dims(strides), + Dims(offsets), + Dims(padded_shape), + rank, + name, + ) + return cpp_tensor diff --git a/python/ark/torch/__init__.py b/python/ark/torch/__init__.py new file mode 100644 index 000000000..c1b6db3a2 --- /dev/null +++ b/python/ark/torch/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +try: + import torch + + _no_torch = False +except ImportError: + from . 
import mock as torch + + _no_torch = True diff --git a/python/ark/torch_mock.py b/python/ark/torch/mock.py similarity index 100% rename from python/ark/torch_mock.py rename to python/ark/torch/mock.py diff --git a/python/executor_py.cpp b/python/executor_py.cpp index 5b4e7959f..3ee851c27 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -8,6 +8,7 @@ #include #include +#include #include "gpu/gpu_memory.hpp" #include "logging.hpp" @@ -134,7 +135,8 @@ DLTensor SharedTensor::dl_tensor() const { } // namespace ark -static py::capsule tensor_to_dlpack(ark::Executor &self, const ark::Tensor &tensor) { +static py::capsule tensor_to_dlpack(ark::Executor &self, + const ark::Tensor &tensor) { auto shared_tensor = new ark::SharedTensor(self, tensor); DLManagedTensor *dl_managed_tensor = new DLManagedTensor(); dl_managed_tensor->dl_tensor = shared_tensor->dl_tensor(); @@ -146,8 +148,9 @@ static py::capsule tensor_to_dlpack(ark::Executor &self, const ark::Tensor &tens } }; const char *capsule_name = "dltensor"; - PyObject *dl_capsule = PyCapsule_New(static_cast(dl_managed_tensor), - capsule_name, [](PyObject *capsule) { + PyObject *dl_capsule = PyCapsule_New( + static_cast(dl_managed_tensor), capsule_name, + [](PyObject *capsule) { const char *name = PyCapsule_GetName(capsule); auto *dl_managed_tensor = static_cast( PyCapsule_GetPointer(capsule, name)); @@ -171,11 +174,37 @@ void register_executor(py::module &m) { .def("name", &ark::Executor::name) .def("compile", &ark::Executor::compile, py::arg("device_id"), py::arg("plan"), py::arg("name") = "executor") - .def("launch", [](ark::Executor *self, uintptr_t stream, bool loop_mode) { - self->launch(reinterpret_cast(stream), loop_mode); - }, - py::arg("stream") = 0, py::arg("loop_mode") = true) - .def("run", &ark::Executor::run, py::arg("iter")) + .def( + "launch", + [](ark::Executor *self, uintptr_t stream, bool loop_mode, + const std::unordered_map + &placeholder_data) { + std::unordered_map tensor_ptr_map; + for (const auto &[tensor, addr] : placeholder_data) { + tensor_ptr_map[tensor] = reinterpret_cast(addr); + } + + self->launch(reinterpret_cast(stream), loop_mode, + tensor_ptr_map); + }, + py::arg("stream") = 0, py::arg("loop_mode") = true, + py::arg("placeholder_data") = + std::unordered_map()) + + .def( + "run", + [](ark::Executor *self, int iter, + const std::unordered_map + &placeholder_data) { + std::unordered_map tensor_ptr_map; + for (const auto &[tensor, addr] : placeholder_data) { + tensor_ptr_map[tensor] = reinterpret_cast(addr); + } + self->run(iter, tensor_ptr_map); + }, + py::arg("iter"), + py::arg("placeholder_data") = + std::unordered_map()) .def("wait", &ark::Executor::wait, py::arg("max_spin_count") = -1) .def("stop", &ark::Executor::stop, py::arg("max_spin_count") = -1) .def("barrier", &ark::Executor::barrier) diff --git a/python/model_py.cpp b/python/model_py.cpp index c224a3d5b..704222c63 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -1,6 +1,7 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
+#include #include #include #include @@ -8,8 +9,65 @@ #include #include +#include "logging.hpp" + namespace py = pybind11; +struct DLTensorMetadata { + void *data_ptr; + int32_t device_id; + DLDeviceType device_type; + int32_t ndim; + DLDataType dtype; + std::vector shape; + std::vector strides; + uint64_t byte_offset; +}; + +static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor *dl_tensor) { + DLTensorMetadata metadata; + metadata.data_ptr = dl_tensor->dl_tensor.data; + metadata.device_id = dl_tensor->dl_tensor.device.device_id; + metadata.device_type = dl_tensor->dl_tensor.device.device_type; + metadata.ndim = dl_tensor->dl_tensor.ndim; + metadata.dtype = dl_tensor->dl_tensor.dtype; + metadata.shape.assign( + dl_tensor->dl_tensor.shape, + dl_tensor->dl_tensor.shape + dl_tensor->dl_tensor.ndim); + if (dl_tensor->dl_tensor.strides != nullptr) { + metadata.strides.assign( + dl_tensor->dl_tensor.strides, + dl_tensor->dl_tensor.strides + dl_tensor->dl_tensor.ndim); + } + metadata.byte_offset = dl_tensor->dl_tensor.byte_offset; + return metadata; +} + +static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) { + if (dl_dtype.lanes != 1) { + ERR(ark::UnsupportedError, "unsupported data type"); + } + ark::DataType ark_dtype; + if (dl_dtype.code == kDLFloat && dl_dtype.bits == 32) { + ark_dtype = ark::FP32; + } else if (dl_dtype.code == kDLFloat && dl_dtype.bits == 16) { + ark_dtype = ark::FP16; + } else if (dl_dtype.code == kDLBfloat && dl_dtype.bits == 16) { + ark_dtype = ark::BF16; + } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 32) { + ark_dtype = ark::INT32; + } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 32) { + ark_dtype = ark::UINT32; + } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 8) { + ark_dtype = ark::INT8; + } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 8) { + ark_dtype = ark::UINT8; + } else { + ERR(ark::UnsupportedError, "unsupported data type"); + } + return ark_dtype; +} + void register_model(py::module &m) { py::class_(m, "_Model") .def(py::init(), py::arg("rank"), py::arg("world_size")) @@ -71,6 +129,19 @@ void register_model(py::module &m) { py::arg("input"), py::arg("other"), py::arg("output"), py::arg("name")) .def("noop", &ark::Model::noop, py::arg("input"), py::arg("name")) + .def( + "placeholder", + [](ark::Model &model, const ark::Dims &shape, + const ark::DataType &data_type, const ark::Dims &strides, + const ark::Dims &offsets, const ark::Dims &padded_shape, + int rank, uintptr_t data, const std::string &name) { + return model.placeholder(shape, data_type, strides, offsets, + padded_shape, rank, + reinterpret_cast(data), name); + }, + py::arg("shape"), py::arg("data_type"), py::arg("strides"), + py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"), + py::arg("data"), py::arg("name")) .def("reduce_max", &ark::Model::reduce_max, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), py::arg("name")) @@ -104,14 +175,9 @@ void register_model(py::module &m) { const std::string &>(&ark::Model::sub), py::arg("input"), py::arg("other"), py::arg("output"), py::arg("name")) - .def("tensor", - py::overload_cast( - &ark::Model::tensor), - py::arg("shape"), py::arg("data_type"), py::arg("strides"), - py::arg("offsets"), py::arg("padded_shape"), py::arg("rank"), - py::arg("name")) + .def("tensor", &ark::Model::tensor, py::arg("shape"), + py::arg("data_type"), py::arg("strides"), py::arg("offsets"), + py::arg("padded_shape"), py::arg("rank"), py::arg("name")) .def("transpose", &ark::Model::transpose, 
py::arg("input"), py::arg("permutation"), py::arg("output"), py::arg("name")) .def("all_reduce", &ark::Model::all_reduce, py::arg("input"), diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp index 5abb35c66..5c28563de 100644 --- a/python/tensor_py.cpp +++ b/python/tensor_py.cpp @@ -1,87 +1,16 @@ // Copyright (c) Microsoft Corporation. // Licensed under the MIT license. -#include #include #include #include #include -#include "logging.hpp" - namespace py = pybind11; -struct DLTensorMetadata { - void* data_ptr; - int32_t device_id; - DLDeviceType device_type; - int32_t ndim; - DLDataType dtype; - std::vector shape; - std::vector strides; - uint64_t byte_offset; -}; - -static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor* dl_tensor) { - DLTensorMetadata metadata; - metadata.data_ptr = dl_tensor->dl_tensor.data; - metadata.device_id = dl_tensor->dl_tensor.device.device_id; - metadata.device_type = dl_tensor->dl_tensor.device.device_type; - metadata.ndim = dl_tensor->dl_tensor.ndim; - metadata.dtype = dl_tensor->dl_tensor.dtype; - metadata.shape.assign( - dl_tensor->dl_tensor.shape, - dl_tensor->dl_tensor.shape + dl_tensor->dl_tensor.ndim); - if (dl_tensor->dl_tensor.strides != nullptr) { - metadata.strides.assign( - dl_tensor->dl_tensor.strides, - dl_tensor->dl_tensor.strides + dl_tensor->dl_tensor.ndim); - } - metadata.byte_offset = dl_tensor->dl_tensor.byte_offset; - return metadata; -} - -static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) { - if (dl_dtype.lanes != 1) { - ERR(ark::UnsupportedError, "unsupported data type"); - } - ark::DataType ark_dtype; - if (dl_dtype.code == kDLFloat && dl_dtype.bits == 32) { - ark_dtype = ark::FP32; - } else if (dl_dtype.code == kDLFloat && dl_dtype.bits == 16) { - ark_dtype = ark::FP16; - } else if (dl_dtype.code == kDLBfloat && dl_dtype.bits == 16) { - ark_dtype = ark::BF16; - } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 32) { - ark_dtype = ark::INT32; - } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 32) { - ark_dtype = ark::UINT32; - } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 8) { - ark_dtype = ark::INT8; - } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 8) { - ark_dtype = ark::UINT8; - } else { - ERR(ark::UnsupportedError, "unsupported data type"); - } - return ark_dtype; -} - void register_tensor(py::module& m) { py::class_(m, "_Tensor") - .def(py::init([](py::capsule capsule) { - DLManagedTensor* dl_tensor = (DLManagedTensor*)capsule; - if (!dl_tensor) { - ERR(ark::InvalidUsageError, - "Capsule does not contain a DLManagedTensor"); - } - DLTensorMetadata metadata = extractDLTensorMetadata(dl_tensor); - int32_t device_id = metadata.device_id; - void* data_ptr = metadata.data_ptr; - auto shape = metadata.shape; - - return ark::Tensor(data_ptr, device_id, shape, from_dl_dtype(metadata.dtype)); - })) .def("id", &ark::Tensor::id) .def("shape", &ark::Tensor::shape) .def("strides", &ark::Tensor::strides) diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py index 833b88662..7dbd48151 100644 --- a/python/unittest/test_conversion.py +++ b/python/unittest/test_conversion.py @@ -37,9 +37,9 @@ def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType): input_tensor.from_numpy(input_tensor_host) other_tensor.from_numpy(other_tensor_host) - input_view = input_tensor.get_torch_view() - other_view = other_tensor.get_torch_view() - output_view = output_tensor.get_torch_view() + input_view = input_tensor.to_torch() + other_view = other_tensor.to_torch() + 
output_view = output_tensor.to_torch() runtime.run() @@ -50,7 +50,7 @@ def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType): output_tensor_host = output_tensor.to_numpy() runtime.stop() - runtime.delete_all_runtimes() + runtime.reset() assert np.allclose(input_tensor_host, input_view_numpy) assert np.allclose(other_tensor_host, other_view_numpy) @@ -83,9 +83,9 @@ def test_ark_to_torch_aliasing(dtype: ark.DataType): input_tensor.from_numpy(input_tensor_host) other_tensor.from_numpy(other_tensor_host) - input_view = input_tensor.get_torch_view() - other_view = other_tensor.get_torch_view() - output_view = output_tensor.get_torch_view() + input_view = input_tensor.to_torch() + other_view = other_tensor.to_torch() + output_view = output_tensor.to_torch() # make changes to the views input_view[1, 1] = 20 other_view[0, 0] = 30 @@ -149,8 +149,16 @@ def test_bin_op(dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims): input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") expected_output = torch_op(input_tensor, other_tensor).cpu().numpy() - input_ark_view = ark.Tensor.from_torch(input_tensor) - other_ark_view = ark.Tensor.from_torch(other_tensor) + input_ark_view = ark.placeholder( + shape=tensor_dims, + dtype=ark.DataType.from_torch(dtype), + data=input_tensor.data_ptr(), + ) + other_ark_view = ark.placeholder( + shape=tensor_dims, + dtype=ark.DataType.from_torch(dtype), + data=other_tensor.data_ptr(), + ) output = ark_op(input_ark_view, other_ark_view) runtime = ark.Runtime() runtime.launch() @@ -170,7 +178,11 @@ def test_unary_op(dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims): ark.init() input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") expected_output = torch_op(input_tensor).cpu().numpy() - input_ark_view = ark.Tensor.from_torch(input_tensor) + input_ark_view = ark.placeholder( + shape=tensor_dims, + dtype=ark.DataType.from_torch(dtype), + data=input_tensor, + ) output = ark_op(input_ark_view) runtime = ark.Runtime() runtime.launch() @@ -189,8 +201,12 @@ def test_torch_to_ark_aliasing(dtype, tensor_dims): input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - input_ark_view = ark.Tensor.from_torch(input_tensor) - other_ark_view = ark.Tensor.from_torch(other_tensor) + input_ark_view = ark.placeholder( + tensor_dims, dtype=ark.DataType.from_torch(dtype), data=input_tensor + ) + other_ark_view = ark.placeholder( + tensor_dims, dtype=ark.DataType.from_torch(dtype), data=other_tensor + ) output = ark.add(input_ark_view, other_ark_view) # Perform in place operations @@ -205,3 +221,61 @@ def test_torch_to_ark_aliasing(dtype, tensor_dims): runtime.stop() runtime.reset() assert np.allclose(output_host, expected_output) + + +# Staged View Tests + + +@pytest.mark.parametrize( + "dtype, ark_op, torch_op, tensor_dims", + [(torch.float16, ark.add, torch.add, (2, 3))], +) +def test_bin_op_staged( + dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims +): + ark.init() + input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") + other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") + expected_output = torch_op(input_tensor, other_tensor).cpu().numpy() + input_ark_view = ark.placeholder( + shape=tensor_dims, dtype=ark.DataType.from_torch(dtype) + ) + other_ark_view = ark.placeholder( + shape=tensor_dims, dtype=ark.DataType.from_torch(dtype) + ) + 
output = ark_op(input_ark_view, other_ark_view) + runtime = ark.Runtime() + tensor_mapping = { + input_ark_view: input_tensor, + other_ark_view: other_tensor, + } + runtime.launch(tensor_mappings=tensor_mapping) + runtime.run() + output_host = output.to_numpy() + runtime.stop() + runtime.reset() + assert np.allclose(output_host, expected_output) + + +@pytest.mark.parametrize( + "dtype, ark_op, torch_op, tensor_dims", + [(torch.float16, ark.exp, torch.exp, (3, 3))], +) +def test_unary_op_staged( + dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims +): + ark.init() + input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") + expected_output = torch_op(input_tensor).cpu().numpy() + input_ark_view = ark.placeholder( + shape=tensor_dims, dtype=ark.DataType.from_torch(dtype) + ) + output = ark_op(input_ark_view) + runtime = ark.Runtime() + tensor_mapping = {input_ark_view: input_tensor} + runtime.launch(loop_mode=False) + runtime.run(tensor_mappings=tensor_mapping) + output_host = output.to_numpy() + runtime.stop() + runtime.reset() + assert np.allclose(output_host, expected_output) From 55f5c51a4b6827f728f79416dee2a574c8cca129 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sun, 25 Aug 2024 20:37:25 -0700 Subject: [PATCH 074/106] PyTorch tracer (#243) Co-authored-by: Noli Gerawork --- ark/api/executor.cpp | 1025 ++++++++++++++++------------ ark/api/tensor.cpp | 21 + ark/buffer_registry.cpp | 34 + ark/buffer_registry.hpp | 41 ++ ark/codegen.cpp | 61 +- ark/codegen.hpp | 5 +- ark/external_buffer_registry.cpp | 29 - ark/external_buffer_registry.hpp | 31 - ark/gpu/gpu_event.cpp | 14 +- ark/gpu/gpu_event.hpp | 3 +- ark/gpu/gpu_manager.cpp | 3 +- ark/gpu/gpu_manager.hpp | 4 +- ark/include/ark/executor.hpp | 14 +- ark/include/ark/tensor.hpp | 6 + ark/include/kernels/gemm_cutlass.h | 785 ++------------------- ark/model/model_buffer.cpp | 21 +- ark/model/model_buffer.hpp | 8 + ark/model/model_tensor.cpp | 10 + ark/model/model_tensor.hpp | 6 + ark/ops/ops_communication_test.cpp | 4 +- ark/ops/ops_placeholder.cpp | 4 +- examples/ffn/Makefile | 23 - examples/ffn/ffn.cc | 450 ------------ examples/ffn/main.py | 73 ++ python/ark/ops.py | 6 +- python/ark/runtime.py | 13 +- python/ark/tensor.py | 17 + python/ark/torch/tracer.py | 355 ++++++++++ python/executor_py.cpp | 15 +- python/model_py.cpp | 55 -- python/tensor_py.cpp | 13 +- 31 files changed, 1298 insertions(+), 1851 deletions(-) create mode 100644 ark/buffer_registry.cpp create mode 100644 ark/buffer_registry.hpp delete mode 100644 ark/external_buffer_registry.cpp delete mode 100644 ark/external_buffer_registry.hpp delete mode 100644 examples/ffn/Makefile delete mode 100644 examples/ffn/ffn.cc create mode 100644 examples/ffn/main.py create mode 100644 python/ark/torch/tracer.py diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index 50ec4c629..b73dbb9cd 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -14,9 +14,9 @@ #include "ark/data_type.hpp" #include "ark/model.hpp" #include "ark/planner.hpp" +#include "buffer_registry.hpp" #include "codegen.hpp" #include "env.h" -#include "external_buffer_registry.hpp" #include "file_io.h" #include "gpu/gpu.hpp" #include "gpu/gpu_event.hpp" @@ -142,266 +142,338 @@ static size_t tensor_stride_bytes(const Json &tensor) { return nelems * DataType::from_name(tensor["DataType"]).bytes(); } -class Executor::Impl { +class CommResource { public: - Impl() : plan_json_(), device_id_(-1) {}; - ~Impl(); + CommResource(int device_id, int rank, int world_size); - int device_id() const { 
return device_id_; } + int rank() const { return rank_; } - Stream stream() const { return reinterpret_cast(stream_raw_); } + int world_size() const { return world_size_; } - std::shared_ptr buffer() const { - return buffers_.empty() ? nullptr : buffers_.back(); + std::shared_ptr bootstrap() { + return comm_->bootstrap(); } - std::string plan() const { return plan_json_.dump_pretty(); } + std::shared_ptr comm() { return comm_; } - const std::string &name() const { return name_; } - - void compile(const std::string &plan, int device_id, - const std::string &name); - void launch(Stream stream, bool loop_mode, - const std::unordered_map &placeholder_data); - void run(int iter, - const std::unordered_map &placeholder_data); - void wait(int64_t max_spin_count); - float stop(int64_t max_spin_count); - void barrier(); + std::shared_ptr proxy_service() { + return proxy_service_; + } - void *tensor_address(const Tensor &tensor) const; + struct ConnectionResource { + std::shared_ptr connection; + std::vector> + proxy_channels; + std::vector> sm_channels; + }; - void tensor_read(const Tensor &tensor, void *data, size_t bytes, - Stream stream, bool is_d2d) const; - void tensor_write(const Tensor &tensor, const void *data, size_t bytes, - Stream stream, bool is_d2d) const; + struct RankResource { + int remote_rank; + std::shared_ptr ipc; + std::shared_ptr eth; + std::shared_ptr ib; + }; - protected: - friend class DefaultExecutor; + const std::shared_ptr resource(int rank) const { + auto it = rank_to_resource_.find(rank); + if (it == rank_to_resource_.end()) { + return nullptr; + } + return it->second; + } - gpuStream stream_raw_; - bool loop_mode_; + void connect(const PlanJson &plan_json, std::shared_ptr buffer); private: - void init(const PlanJson &plan_json, int device_id, - const std::string &name); - void init_communicator(); - bool add_kernel_arg(size_t buf_id, bool is_external); - std::vector add_kernel_addr( - const std::unordered_map &placeholder_data); - - std::map init_buffers(const Json &plan_json); - std::map init_buffer_addrs( - std::shared_ptr buffer, - const std::map &buffer_id_to_offset); - std::set init_remote_ranks(const Json &plan_json) const; - void init_channels(const std::set &remote_ranks); - - PlanJson plan_json_; int device_id_; - std::string name_; - int rank_; int world_size_; - - std::string kernel_name_; - - bool is_launched_ = false; - bool is_recording_ = false; - float elapsed_msec_ = -1; - - std::map> buffer_id_to_kernel_arg_; - std::map buffer_id_to_offset_; - std::map buffer_id_to_addr_; - size_t total_bytes_; - std::shared_ptr codegen_; - std::shared_ptr timer_begin_; - std::shared_ptr timer_end_; - std::list> buffers_; - std::shared_ptr flag_; - std::shared_ptr stream_; - std::shared_ptr kernel_; - - // For communication std::shared_ptr comm_; std::shared_ptr proxy_service_; - std::map>> - rank_to_proxy_channels_; - std::map>> - rank_to_sm_channels_; + std::map> rank_to_resource_; }; -Executor::Impl::~Impl() { - if (is_launched_) stop(-1); +CommResource::CommResource(int device_id, int rank, int world_size) + : device_id_(device_id), rank_(rank), world_size_(world_size) { + auto bootstrap = std::make_shared(rank, world_size); + std::stringstream ip_port; + ip_port << get_host(0) << ":" << get_env().mscclpp_port; + bootstrap->initialize(ip_port.str()); + comm_ = std::make_shared(bootstrap); + proxy_service_ = std::make_shared(); } -void Executor::Impl::init(const PlanJson &plan_json, int device_id, - const std::string &name) { - if (device_id < 0) { - 
ERR(InvalidUsageError, "Invalid device ID ", device_id); +void CommResource::connect(const PlanJson &plan_json, + std::shared_ptr buffer) { + int rank = plan_json["Rank"]; + std::set remote_ranks; + for (auto &task_info : plan_json["TaskInfos"]) { + for (auto &op : task_info["Ops"]) { + for (auto &tns : op["ReadTensors"]) { + auto buffer = ModelBuffer::deserialize(tns["Buffer"]); + if (buffer->rank() != rank && buffer->rank() != -1) { + remote_ranks.insert(buffer->rank()); + } + } + for (auto &tns : op["WriteTensors"]) { + auto buffer = ModelBuffer::deserialize(tns["Buffer"]); + if (buffer->rank() != rank && buffer->rank() != -1) { + remote_ranks.insert(buffer->rank()); + } + } + for (auto &tns : op["ResultTensors"]) { + auto buffer = ModelBuffer::deserialize(tns["Buffer"]); + if (buffer->rank() != rank && buffer->rank() != -1) { + remote_ranks.insert(buffer->rank()); + } + } + } } + if (remote_ranks.empty()) return; - plan_json_ = plan_json; - device_id_ = device_id; - name_ = name; - buffer_id_to_offset_.clear(); - buffer_id_to_kernel_arg_.clear(); - total_bytes_ = 0; + int num_ranks_per_node = get_env().num_ranks_per_host; + auto rank_to_node = [&](int r) { return r / num_ranks_per_node; }; + int this_node = rank_to_node(rank); - rank_ = plan_json_["Rank"].get(); - world_size_ = plan_json_["WorldSize"].get(); + const mscclpp::Transport IBs[] = { + mscclpp::Transport::IB0, mscclpp::Transport::IB1, + mscclpp::Transport::IB2, mscclpp::Transport::IB3, + mscclpp::Transport::IB4, mscclpp::Transport::IB5, + mscclpp::Transport::IB6, mscclpp::Transport::IB7}; - if (rank_ < 0 || rank_ >= world_size_) { - ERR(InvalidUsageError, "Invalid rank ", rank_, " with world size ", - world_size_); + mscclpp::TransportFlags all_transports = + mscclpp::Transport::CudaIpc | mscclpp::Transport::Ethernet; + if (!get_env().disable_ib) { + all_transports |= IBs[device_id_]; } - if (world_size_ > 1 && !comm_) { - init_communicator(); + mscclpp::RegisteredMemory regmem = + comm_->registerMemory(buffer->ref(), buffer->bytes(), all_transports); + + using ConnectionFuture = + mscclpp::NonblockingFuture>; + std::map rank_to_ipc_connection_future; + std::map rank_to_eth_connection_future; + std::map rank_to_ib_connection_future; + std::map> + rank_to_remote_regmem_future; + + for (auto remote_rank : remote_ranks) { + auto it = rank_to_resource_.find(remote_rank); + if (it != rank_to_resource_.end()) { + // connection already set + continue; + } + auto resource = std::make_shared(); + rank_to_resource_[remote_rank] = resource; + int remote_node = rank_to_node(remote_rank); + if (remote_node == this_node) { + rank_to_ipc_connection_future[remote_rank] = comm_->connectOnSetup( + remote_rank, 0, mscclpp::Transport::CudaIpc); + resource->ipc = std::make_shared(); + } + if ((remote_node != this_node) && get_env().disable_ib) { + rank_to_eth_connection_future[remote_rank] = comm_->connectOnSetup( + remote_rank, 0, mscclpp::Transport::Ethernet); + resource->eth = std::make_shared(); + } + if (!get_env().disable_ib) { + rank_to_ib_connection_future[remote_rank] = + comm_->connectOnSetup(remote_rank, 0, IBs[device_id_]); + resource->ib = std::make_shared(); + } + comm_->sendMemoryOnSetup(regmem, remote_rank, 0); + rank_to_remote_regmem_future[remote_rank] = + comm_->recvMemoryOnSetup(remote_rank, 0); } + comm_->setup(); - auto gpu_manager = GpuManager::get_instance(device_id_); + for (auto &[remote_rank, future] : rank_to_ipc_connection_future) { + rank_to_resource_[remote_rank]->ipc->connection = future.get(); + } + for (auto 
&[remote_rank, future] : rank_to_eth_connection_future) { + rank_to_resource_[remote_rank]->eth->connection = future.get(); + } + for (auto &[remote_rank, future] : rank_to_ib_connection_future) { + rank_to_resource_[remote_rank]->ib->connection = future.get(); + } - if (!gpu_manager->info().arch->belongs_to( - Arch::from_name(plan_json.at("Architecture")))) { - LOG(WARN, "Architecture name of the plan `", - plan_json.at("Architecture").get(), - "` is not compatible with the GPU architecture `", - gpu_manager->info().arch->name(), "`."); + mscclpp::MemoryId regmem_id = proxy_service_->addMemory(regmem); + std::map rank_to_remote_regmem; + std::map rank_to_remote_regmem_id; + for (auto &[remote_rank, future] : rank_to_remote_regmem_future) { + rank_to_remote_regmem[remote_rank] = future.get(); + rank_to_remote_regmem_id[remote_rank] = + proxy_service_->addMemory(rank_to_remote_regmem[remote_rank]); } - buffer_id_to_offset_ = init_buffers(plan_json_); + for (auto &[remote_rank, resource] : rank_to_resource_) { + auto add_proxy_channel = + [&](std::shared_ptr conn_resource) { + if (!conn_resource) return; + conn_resource->proxy_channels.push_back( + std::make_shared( + proxy_service_->proxyChannel( + proxy_service_->buildAndAddSemaphore( + *comm_, conn_resource->connection)), + rank_to_remote_regmem_id[remote_rank], regmem_id)); + }; + // NOTE: We can create multiple proxy channels here if we need in the + // future + add_proxy_channel(resource->ipc); + add_proxy_channel(resource->eth); + add_proxy_channel(resource->ib); + } + comm_->setup(); - std::string buffer_id_to_offset_str; - for (const auto &kv : buffer_id_to_offset_) { - buffer_id_to_offset_str += - std::to_string(kv.first) + ": " + std::to_string(kv.second) + ", "; + std::map>> + sm_semaphores; + for (auto &[remote_rank, resource] : rank_to_resource_) { + // NOTE: We can create multiple semaphores here if we need in the future + sm_semaphores[remote_rank].push_back( + std::make_shared( + *comm_, resource->ipc->connection)); } + comm_->setup(); - timer_begin_ = gpu_manager->create_event(); - timer_end_ = gpu_manager->create_event(); - if (total_bytes_ > 0) { - buffers_.push_back(gpu_manager->malloc(total_bytes_, 65536)); - buffer_id_to_addr_ = - init_buffer_addrs(buffers_.back(), buffer_id_to_offset_); + for (auto &[remote_rank, resource] : rank_to_resource_) { + // NOTE: We can create multiple sm channels here if we need in the + // future + resource->ipc->sm_channels.push_back( + std::make_shared( + sm_semaphores[remote_rank][0], + rank_to_remote_regmem[remote_rank], regmem.data(), nullptr)); } +} - codegen_ = std::make_shared(plan_json_, buffer_id_to_offset_, - buffer_id_to_kernel_arg_); +class PlanResourceKey { + public: + PlanResourceKey(const std::string &plan, int device_id, + const std::string &name) + : plan_(plan), device_id_(device_id), name_(name) {} - flag_ = gpu_manager->malloc_host( - sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); + bool operator<(const PlanResourceKey &other) const { + return std::tie(plan_, device_id_, name_) < + std::tie(other.plan_, other.device_id_, other.name_); + } - int threads_per_block = static_cast( - codegen_->num_warps_per_proc() * gpu_manager->info().threads_per_warp); - int num_sm = static_cast(codegen_->num_procs()); - size_t smem_block_total = - static_cast(gpu_manager->info().smem_block_total); + private: + std::string plan_; + int device_id_; + std::string name_; +}; - if (world_size_ > 1 && total_bytes_ > 0) { - auto remote_ranks = init_remote_ranks(plan_json_); - 
init_channels(remote_ranks);
- }
+class PlanResource {
+ public:
+ PlanResource(const PlanJson &plan_json, int device_id,
+ const std::string &name,
+ std::shared_ptr<CommResource> &comm_resource);
 
- kernel_ = std::shared_ptr<GpuKernel>(
- new GpuKernel(device_id_, codegen_->code(), {threads_per_block, 1, 1},
- {num_sm, 1, 1}, std::max(smem_block_total, size_t(4))));
-}
+ const PlanJson &plan_json() const { return plan_json_; }
 
-void Executor::Impl::init_communicator() {
- auto bootstrap =
- std::make_shared<mscclpp::TcpBootstrap>(rank_, world_size_);
- std::stringstream ip_port;
- ip_port << get_host(0) << ":" << get_env().mscclpp_port;
- bootstrap->initialize(ip_port.str());
- comm_ = std::make_shared<mscclpp::Communicator>(bootstrap);
-}
+ int device_id() const { return device_id_; }
 
-std::map<size_t, void *> Executor::Impl::init_buffer_addrs(
- std::shared_ptr<GpuMemory> buffer,
- const std::map<size_t, size_t> &buffer_id_to_offset) {
- std::map<size_t, void *> buffer_id_to_addr;
- // Reuse existing buffer addresses for new plans that use previous tensors
- // from earlier plans
- if (!buffer_id_to_addr_.empty()) {
- buffer_id_to_addr = buffer_id_to_addr_;
- }
- for (const auto &[id, offset] : buffer_id_to_offset) {
- buffer_id_to_addr[id] = buffer->ref(offset);
- }
- return buffer_id_to_addr;
-}
+ const std::string &name() const { return name_; }
 
-bool Executor::Impl::add_kernel_arg(size_t buf_id, bool is_external) {
- bool reused_buffer =
- buffer_id_to_addr_.find(buf_id) != buffer_id_to_addr_.end();
- if (!is_external && !reused_buffer) {
- return false;
- }
- auto &ext_buf_reg = ExternalBufferRegistry::get_instance();
- const std::string name = "extern_buf_" + std::to_string(buf_id);
- if (reused_buffer) {
- // The buffer is being reused from a previous plan
- void *buf_addr = buffer_id_to_addr_[buf_id];
- buffer_id_to_kernel_arg_[buf_id] = std::make_pair(name, buf_addr);
- } else {
- // The buffer is external (can be either staged or non-staged)
- buffer_id_to_kernel_arg_[buf_id] =
- std::make_pair(name, ext_buf_reg.get(buf_id));
+ std::shared_ptr<GpuMemory> buffer() const { return buffer_; }
+
+ void launch_kernel(const std::string &name, const std::vector<void *> &args,
+ gpuStream stream);
+
+ private:
+ void verify_plan();
+ void init_comm_resource();
+ void init_internal_buffers();
+ void init_comm_connections();
+ void init_kernel();
+
+ PlanJson plan_json_;
+ int device_id_;
+ std::string name_;
+ std::shared_ptr<CommResource> &comm_resource_;
+
+ int rank_;
+ int world_size_;
+ std::shared_ptr<GpuMemory> buffer_;
+ std::map<size_t, size_t> internal_buffer_id_to_offset_;
+ // Extra buffers: external buffers, or buffers that are allocated by other
+ // plans
+ std::set<size_t> extra_buffer_ids_;
+ std::shared_ptr<GpuKernel> kernel_;
+};
+
+PlanResource::PlanResource(const PlanJson &plan_json, int device_id,
+ const std::string &name,
+ std::shared_ptr<CommResource> &comm_resource)
+ : plan_json_(plan_json),
+ device_id_(device_id),
+ name_(name),
+ comm_resource_(comm_resource) {
+ if (device_id < 0) {
+ ERR(InvalidUsageError, "Invalid device ID ", device_id);
 }
- return true;
+ // Verify that `plan_json` describes a valid plan
+ verify_plan();
+
+ // Construct `comm_resource_` if needed
+ init_comm_resource();
+
+ // Allocate memory for internal buffers and construct
+ // `internal_buffer_id_to_offset_` and `extra_buffer_ids_`.
+ init_internal_buffers();
+
+ // Create connections and channels to remote ranks
+ init_comm_connections();
+
+ // Construct `kernel_`.
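// [Editor's sketch, not part of the original patch] The five init steps
// above are what Executor::Impl::compile() (further below) relies on: it
// caches one PlanResource per (plan, device_id, name) key, and all plans
// share a single CommResource. A minimal, hypothetical driving sequence,
// where make_plan_json() is an assumed helper:
//
//   std::shared_ptr<CommResource> comm;  // created lazily by the first plan
//   PlanResource a(make_plan_json(0), /*device_id=*/0, "a", comm);
//   PlanResource b(make_plan_json(1), /*device_id=*/0, "b", comm);
//   // `b` finds buffers that `a` registered in BufferRegistry and keeps
//   // them in `extra_buffer_ids_` instead of re-allocating them.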
+ init_kernel();
}
 
-std::vector<void *> Executor::Impl::add_kernel_addr(
- const std::unordered_map<Tensor, void *> &placeholder_data) {
- std::unordered_map<size_t, void *> buffer_id_to_placeholder;
- for (const auto &[tensor, ptr] : placeholder_data) {
- buffer_id_to_placeholder[tensor.ref()->buffer()->id()] = ptr;
+void PlanResource::verify_plan() {
+ rank_ = plan_json_["Rank"];
+ world_size_ = plan_json_["WorldSize"];
+ if (rank_ < 0 || rank_ >= world_size_) {
+ ERR(InvalidUsageError, "Invalid rank ", rank_, " with world size ",
+ world_size_);
 }
+ auto gpu_manager = GpuManager::get_instance(device_id_);
+ if (!gpu_manager->info().arch->belongs_to(
+ Arch::from_name(plan_json_.at("Architecture")))) {
+ LOG(WARN, "Architecture name of the plan `",
+ plan_json_.at("Architecture").get<std::string>(),
+ "` is not compatible with the GPU architecture `",
+ gpu_manager->info().arch->name(), "`.");
+ }
+}
 
- auto &ext_buf_reg = ExternalBufferRegistry::get_instance();
- std::vector<void *> kernel_arg_addrs;
- kernel_arg_addrs.reserve(buffer_id_to_kernel_arg_.size());
-
- for (const auto &[buf_id, _] : buffer_id_to_kernel_arg_) {
- void *buf_addr = nullptr;
- // Check for reused tensor
- if (auto it = buffer_id_to_addr_.find(buf_id);
- it != buffer_id_to_addr_.end()) {
- buf_addr = it->second;
- }
- // Check for external tensor (non-staged)
- else if (void *ext_buf_addr = ext_buf_reg.get(buf_id);
- ext_buf_addr != nullptr) {
- buf_addr = ext_buf_addr;
- }
- // Check for external tensor (staged)
- else if (auto it = buffer_id_to_placeholder.find(buf_id);
- it != buffer_id_to_placeholder.end()) {
- buf_addr = it->second;
- }
- if (buf_addr == nullptr) {
- ERR(InvalidUsageError, "Buffer with id ", buf_id,
- " did not receive initializing data.");
+void PlanResource::init_comm_resource() {
+ if (comm_resource_) {
+ if (comm_resource_->rank() != rank_) {
+ ERR(InvalidUsageError,
+ "Rank should be consistent across all plans. "
+ "Expected ",
+ rank_, " but got ", comm_resource_->rank());
 }
- gpuPointerAttributes attr;
- GLOG(gpuPointerGetAttributes(&attr, buf_addr));
- if (attr.device != device_id_) {
- ERR(InvalidUsageError, "Data for buffer id ", buf_id,
- " is on a different GPU: ", attr.device, " vs ", device_id_);
+ if (comm_resource_->world_size() != world_size_) {
+ ERR(InvalidUsageError,
+ "World size should be consistent across all "
+ "plans. 
Expected ", + world_size_, " but got ", comm_resource_->world_size()); } - kernel_arg_addrs.push_back(buf_addr); + } else if (world_size_ > 1) { + comm_resource_ = + std::make_shared(device_id_, rank_, world_size_); } - return kernel_arg_addrs; } -std::map Executor::Impl::init_buffers(const Json &plan_json) { +void PlanResource::init_internal_buffers() { class BufferInfo { public: BufferInfo(const std::shared_ptr buffer) : buffer(buffer), bytes(0), is_input(true), is_output(true) {} - // ID of this buffer + // Underlying ModelBuffer const std::shared_ptr buffer; // Total bytes of this buffer @@ -422,17 +494,17 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { std::set task_ids; }; - std::map buffer_id_to_offset; std::map> buffer_id_to_info; auto get_or_create_buffer_info = [&](const Json &buffer_json) { auto buffer = ModelBuffer::deserialize(buffer_json); - if (buffer_id_to_info.find(buffer->id()) == buffer_id_to_info.end()) { + auto it = buffer_id_to_info.find(buffer->id()); + if (it == buffer_id_to_info.end()) { auto buf_info = std::make_shared(buffer); buffer_id_to_info[buffer->id()] = buf_info; return buf_info; } - return buffer_id_to_info[buffer->id()]; + return it->second; }; auto retrieve_buffer_info = [&](const Json &tensor, size_t task_id, @@ -447,7 +519,7 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { buf_info->task_ids.insert(task_id); }; - for (auto &task_info : plan_json["TaskInfos"]) { + for (auto &task_info : plan_json_["TaskInfos"]) { for (auto &op : task_info["Ops"]) { size_t task_id = task_info["Id"].get(); for (auto &tns : op["ReadTensors"]) { @@ -469,37 +541,50 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { std::map> remote_rank_to_send_tag_to_buffer_id; std::map> remote_rank_to_recv_tag_to_buffer_id; - auto &ext_buf_reg = ExternalBufferRegistry::get_instance(); + auto is_remote = [&](const std::shared_ptr &buffer) { + return buffer->rank() != rank_ && buffer->rank() != -1; + }; // TODO: improve memory planning size_t offset = 0; - for (auto &kv : buffer_id_to_info) { - auto &buf_info = kv.second; - int r = buf_info->buffer->rank(); - const size_t buf_id = buf_info->buffer->id(); - if (r != rank_ && r != -1) { + for (auto &[buf_id, buf_info] : buffer_id_to_info) { + auto &buffer = buf_info->buffer; + if (is_remote(buffer)) { // this is a remote buffer - for (const auto &tag_info : buf_info->buffer->send_tags()) { - remote_rank_to_send_tag_to_buffer_id[buf_info->buffer->rank()] - [tag_info.second] = buf_id; + if (buffer->is_external()) { + ERR(InvalidUsageError, + "Communication with external buffers is not supported"); + } + int r = buffer->rank(); + for (const auto &tag_info : buffer->send_tags()) { + // This remote buffer will send data to local buffers + remote_rank_to_send_tag_to_buffer_id[r][tag_info.second] = + buf_id; } - for (const auto &tag_info : buf_info->buffer->recv_tags()) { - remote_rank_to_recv_tag_to_buffer_id[buf_info->buffer->rank()] - [tag_info.second] = buf_id; + for (const auto &tag_info : buffer->recv_tags()) { + // This remote buffer will receive data from local buffers + remote_rank_to_recv_tag_to_buffer_id[r][tag_info.second] = + buf_id; } continue; } - if (add_kernel_arg(buf_id, buf_info->buffer->is_external())) { - continue; + auto info = BufferRegistry::get_instance().get(buf_id); + if (info || buffer->is_external()) { + // This buffer is external or has been already allocated by a + // previous plan. 
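// [Editor's note, not part of the original patch] Buffers collected in
// `extra_buffer_ids_` receive no offset in this plan's allocation;
// codegen (codegen.cpp below) instead emits one "_ext_buf_<id>" kernel
// parameter per entry, and PlanResource::launch_kernel() resolves the
// current address from BufferRegistry at launch time.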
+ extra_buffer_ids_.insert(buf_id); } else { - buffer_id_to_offset[buf_id] = offset; - for (const auto &tag_info : buf_info->buffer->send_tags()) { + // Assign an offset to this internal local buffer + internal_buffer_id_to_offset_[buf_id] = offset; + for (const auto &tag_info : buffer->send_tags()) { + // This local buffer will send data to remote ranks remote_rank_to_send_tags_and_offsets[tag_info.first] .first.push_back(tag_info.second); remote_rank_to_send_tags_and_offsets[tag_info.first] .second.push_back(offset); } - for (const auto &tag_info : buf_info->buffer->recv_tags()) { + for (const auto &tag_info : buffer->recv_tags()) { + // This local buffer will receive data from remote ranks remote_rank_to_recv_tags_and_offsets[tag_info.first] .first.push_back(tag_info.second); remote_rank_to_recv_tags_and_offsets[tag_info.first] @@ -508,7 +593,17 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { offset += buf_info->bytes; } } - total_bytes_ = offset; + size_t total_bytes = offset; + + // Allocate memory for internal local buffers + if (total_bytes > 0) { + buffer_ = + GpuManager::get_instance(device_id_)->malloc(total_bytes, 65536); + for (auto &[buf_id, buf_offset] : internal_buffer_id_to_offset_) { + BufferRegistry::get_instance().set(buf_id, buffer_->ref(buf_offset), + device_id_, false); + } + } // // Send each tag (SendTag or RecvTag) and the corresponding offset to @@ -550,7 +645,7 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { auto &tags = tags_and_offsets.first; auto &offsets = tags_and_offsets.second; int len = tags.size(); - auto bootstrap = comm_->bootstrap(); + auto bootstrap = comm_resource_->bootstrap(); bootstrap->send(&len, sizeof(int), remote_rank, 0); bootstrap->send(tags.data(), tags.size() * sizeof(int), remote_rank, 1); bootstrap->send(offsets.data(), offsets.size() * sizeof(size_t), @@ -563,7 +658,7 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { auto &tags = tags_and_offsets.first; auto &offsets = tags_and_offsets.second; int len = tags.size(); - auto bootstrap = comm_->bootstrap(); + auto bootstrap = comm_resource_->bootstrap(); bootstrap->send(&len, sizeof(int), remote_rank, 3); bootstrap->send(tags.data(), tags.size() * sizeof(int), remote_rank, 4); bootstrap->send(offsets.data(), offsets.size() * sizeof(size_t), @@ -575,20 +670,21 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { std::vector tags; std::vector offsets; int len; - auto bootstrap = comm_->bootstrap(); + auto bootstrap = comm_resource_->bootstrap(); bootstrap->recv(&len, sizeof(int), remote_rank, 0); tags.resize(len); offsets.resize(len); bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 1); bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 2); for (int i = 0; i < len; ++i) { - const size_t buf_id = - buffer_id_to_info[send_tag_to_buffer_id[tags[i]]]->buffer->id(); - void *buf_data = ext_buf_reg.get(buf_id); - if (buf_data == nullptr) { - buffer_id_to_offset[send_tag_to_buffer_id[tags[i]]] = - offsets[i]; + auto it = send_tag_to_buffer_id.find(tags[i]); + if (it == send_tag_to_buffer_id.end()) { + LOG(WARN, "Send tag ", tags[i], " from remote rank ", + remote_rank, " is unexpected"); + continue; } + size_t buf_id = it->second; + internal_buffer_id_to_offset_[buf_id] = offsets[i]; } } for (auto &kv : remote_rank_to_recv_tag_to_buffer_id) { @@ -597,270 +693,287 @@ std::map Executor::Impl::init_buffers(const Json &plan_json) { std::vector tags; std::vector offsets; int len; - auto bootstrap = 
comm_->bootstrap(); + auto bootstrap = comm_resource_->bootstrap(); bootstrap->recv(&len, sizeof(int), remote_rank, 3); tags.resize(len); offsets.resize(len); bootstrap->recv(tags.data(), len * sizeof(int), remote_rank, 4); bootstrap->recv(offsets.data(), len * sizeof(size_t), remote_rank, 5); for (int i = 0; i < len; ++i) { - const size_t buf_id = - buffer_id_to_info[recv_tag_to_buffer_id[tags[i]]]->buffer->id(); - void *buf_data = ext_buf_reg.get(buf_id); - if (buf_data == nullptr) { - buffer_id_to_offset[recv_tag_to_buffer_id[tags[i]]] = - offsets[i]; + auto it = recv_tag_to_buffer_id.find(tags[i]); + if (it == recv_tag_to_buffer_id.end()) { + LOG(WARN, "Recv tag ", tags[i], " from remote rank ", + remote_rank, " is unexpected"); + continue; } + size_t buf_id = it->second; + internal_buffer_id_to_offset_[buf_id] = offsets[i]; } } - return buffer_id_to_offset; } -std::set Executor::Impl::init_remote_ranks(const Json &plan_json) const { - std::set remote_ranks; - for (auto &task_info : plan_json["TaskInfos"]) { - for (auto &op : task_info["Ops"]) { - for (auto &tns : op["ReadTensors"]) { - auto buffer = ModelBuffer::deserialize(tns["Buffer"]); - if (buffer->rank() != rank_ && buffer->rank() != -1) { - remote_ranks.insert(buffer->rank()); - } - } - for (auto &tns : op["WriteTensors"]) { - auto buffer = ModelBuffer::deserialize(tns["Buffer"]); - if (buffer->rank() != rank_ && buffer->rank() != -1) { - remote_ranks.insert(buffer->rank()); - } - } - for (auto &tns : op["ResultTensors"]) { - auto buffer = ModelBuffer::deserialize(tns["Buffer"]); - if (buffer->rank() != rank_ && buffer->rank() != -1) { - remote_ranks.insert(buffer->rank()); - } - } - } +void PlanResource::init_comm_connections() { + if (comm_resource_ && buffer_) { + comm_resource_->connect(plan_json_, buffer_); } - return remote_ranks; } -void Executor::Impl::init_channels(const std::set &remote_ranks) { - if (!proxy_service_) { - proxy_service_ = std::make_shared(); - } +void PlanResource::init_kernel() { + auto gpu_manager = GpuManager::get_instance(device_id_); + auto codegen = std::make_shared( + plan_json_, internal_buffer_id_to_offset_, extra_buffer_ids_); + int num_sm = static_cast(codegen->num_procs()); + int threads_per_block = static_cast( + codegen->num_warps_per_proc() * gpu_manager->info().threads_per_warp); + size_t smem_block_total = + static_cast(gpu_manager->info().smem_block_total); - int num_ranks_per_node = get_env().num_ranks_per_host; - auto rank_to_node = [&](int rank) { return rank / num_ranks_per_node; }; - int this_node = rank_to_node(rank_); + kernel_ = std::shared_ptr( + new GpuKernel(device_id_, codegen->code(), {threads_per_block, 1, 1}, + {num_sm, 1, 1}, std::max(smem_block_total, size_t(4)))); + kernel_->compile(); - const mscclpp::Transport IBs[] = { - mscclpp::Transport::IB0, mscclpp::Transport::IB1, - mscclpp::Transport::IB2, mscclpp::Transport::IB3, - mscclpp::Transport::IB4, mscclpp::Transport::IB5, - mscclpp::Transport::IB6, mscclpp::Transport::IB7}; + if (world_size_ <= 1) return; - mscclpp::TransportFlags all_transports = - mscclpp::Transport::CudaIpc | mscclpp::Transport::Ethernet; - if (!get_env().disable_ib) { - all_transports |= IBs[device_id_]; + auto get_global_rt = [&](const std::string &symbol) { + return reinterpret_cast(kernel_->get_global(symbol)); + }; + void *proxy_chan_addr = get_global_rt("ARK_PROXY_CHANS"); + void *proxy_secondary_chan_addr = + get_global_rt("ARK_PROXY_SECONDARY_CHANS"); + void *sm_chan_addr = get_global_rt("ARK_SM_CHANS"); + std::vector proxy_handles( + 
world_size_); + std::vector + proxy_secondary_handles(world_size_); + std::vector sm_handles(world_size_); + for (int i = 0; i < world_size_; i++) { + if (i == rank_) continue; + auto resource = comm_resource_->resource(i); + if (!resource) continue; + std::vector p_hdls; + if (resource->ipc) { + sm_handles[i] = resource->ipc->sm_channels[0]->deviceHandle(); + p_hdls.push_back(resource->ipc->proxy_channels[0]->deviceHandle()); + } + if (resource->ib) { + p_hdls.push_back(resource->ib->proxy_channels[0]->deviceHandle()); + } + if (resource->eth) { + p_hdls.push_back(resource->eth->proxy_channels[0]->deviceHandle()); + } + if (p_hdls.size() > 0) { + proxy_handles[i] = p_hdls[0]; + } + if (p_hdls.size() > 1) { + proxy_secondary_handles[i] = p_hdls[1]; + } } - mscclpp::RegisteredMemory regmem = comm_->registerMemory( - buffers_.back()->ref(), buffers_.back()->bytes(), all_transports); - - std::map>>> - rank_to_connections_future; - std::map> - rank_to_remote_regmem_future; + auto tmp_stream = gpu_manager->create_stream(); + GLOG(gpuSetDevice(device_id_)); + GLOG(gpuMemcpyAsync(proxy_chan_addr, proxy_handles.data(), + proxy_handles.size() * + sizeof(mscclpp::SimpleProxyChannel::DeviceHandle), + gpuMemcpyHostToDevice, tmp_stream->get())); + GLOG(gpuMemcpyAsync(proxy_secondary_chan_addr, + proxy_secondary_handles.data(), + proxy_secondary_handles.size() * + sizeof(mscclpp::SimpleProxyChannel::DeviceHandle), + gpuMemcpyHostToDevice, tmp_stream->get())); + GLOG(gpuMemcpyAsync( + sm_chan_addr, sm_handles.data(), + sm_handles.size() * sizeof(mscclpp::SmChannel::DeviceHandle), + gpuMemcpyHostToDevice, tmp_stream->get())); + GLOG(gpuStreamSynchronize(tmp_stream->get())); +} - for (auto remote_rank : remote_ranks) { - int remote_node = rank_to_node(remote_rank); - auto add_connection = [&](int remote_rank, - mscclpp::Transport transport) { - rank_to_connections_future[remote_rank].push_back( - comm_->connectOnSetup(remote_rank, 0, transport)); - }; - if (remote_node == this_node) { - add_connection(remote_rank, mscclpp::Transport::CudaIpc); - if (!get_env().disable_ib) { - add_connection(remote_rank, IBs[device_id_]); - } - } else { - add_connection(remote_rank, get_env().disable_ib - ? mscclpp::Transport::Ethernet - : IBs[device_id_]); +void PlanResource::launch_kernel(const std::string &name, + const std::vector &args, + gpuStream stream) { + std::vector kernel_args = args; + for (size_t id : extra_buffer_ids_) { + auto info = BufferRegistry::get_instance().get(id); + if (!info) { + ERR(InternalError, "External buffer not found."); + } else if (info->data == nullptr) { + ERR(InvalidUsageError, "External buffer data is nullptr."); } - comm_->sendMemoryOnSetup(regmem, remote_rank, 0); - rank_to_remote_regmem_future[remote_rank] = - comm_->recvMemoryOnSetup(remote_rank, 0); + kernel_args.push_back(&(info->data)); } - comm_->setup(); + kernel_->launch(name, stream, kernel_args); +} - std::map>> - rank_to_connections; - for (auto &kv : rank_to_connections_future) { - for (auto &future : kv.second) { - rank_to_connections[kv.first].push_back(future.get()); - } +class Executor::Impl { + public: + Impl(){}; + ~Impl(); + + int device_id() const { + return foreground_plan_resource_ + ? 
foreground_plan_resource_->device_id() + : -1; } - for (auto &kv : rank_to_connections) { - for (auto &conn : kv.second) { - rank_to_proxy_channels_[kv.first].push_back( - std::make_shared( - proxy_service_->proxyChannel( - proxy_service_->buildAndAddSemaphore(*comm_, conn)), - proxy_service_->addMemory( - rank_to_remote_regmem_future[kv.first].get()), - proxy_service_->addMemory(regmem))); - } + + Stream stream() const { return reinterpret_cast(stream_raw_); } + + std::shared_ptr buffer() const { + return foreground_plan_resource_ ? foreground_plan_resource_->buffer() + : nullptr; } - comm_->setup(); - std::map>> - sm_semaphores; - for (auto &kv : rank_to_connections) { - for (auto &conn : kv.second) { - if (conn->transport() != mscclpp::Transport::CudaIpc) continue; - sm_semaphores[kv.first].push_back( - std::make_shared(*comm_, - conn)); - } + std::string plan() const { + return foreground_plan_resource_ + ? foreground_plan_resource_->plan_json().dump_pretty() + : ""; } - comm_->setup(); - for (auto &kv : sm_semaphores) { - for (auto &sem : kv.second) { - rank_to_sm_channels_[kv.first].push_back( - std::make_shared( - sem, rank_to_remote_regmem_future[kv.first].get(), - regmem.data(), nullptr)); - } + std::string name() const { + return foreground_plan_resource_ ? foreground_plan_resource_->name() + : ""; } + + void compile(const std::string &plan, int device_id, + const std::string &name); + void launch(const std::unordered_map &placeholder_data, + Stream stream, bool loop_mode, bool record); + void run(int iter, + const std::unordered_map &placeholder_data); + void wait(int64_t max_spin_count); + float stop(int64_t max_spin_count); + void barrier(); + + void *tensor_address(const Tensor &tensor) const; + + void tensor_read(const Tensor &tensor, void *data, size_t bytes, + Stream stream, bool is_d2d) const; + void tensor_write(const Tensor &tensor, const void *data, size_t bytes, + Stream stream, bool is_d2d) const; + + protected: + friend class DefaultExecutor; + + gpuStream stream_raw_; + bool loop_mode_; + + private: + std::shared_ptr get_buffer_info( + const Tensor &tensor) const; + + std::map> plan_resources_; + std::shared_ptr foreground_plan_resource_; + std::shared_ptr comm_resource_; + + bool is_launched_ = false; + bool is_recording_ = false; + float elapsed_msec_ = -1; + + std::shared_ptr timer_begin_; + std::shared_ptr timer_end_; + std::shared_ptr flag_; + std::shared_ptr stream_; +}; + +Executor::Impl::~Impl() { + if (is_launched_) stop(-1); } void Executor::Impl::compile(const std::string &plan, int device_id, const std::string &name) { if (is_launched_) { ERR(InvalidUsageError, "Need to stop before re-compiling."); - return; } - try { - auto plan_json = Json::parse(plan); - init(plan_json, device_id, name); - } catch (const ::nlohmann::json::parse_error &e) { - ERR(InvalidUsageError, "Failed to parse the plan JSON: ", e.what()); + int prev_device_id = -1; + if (foreground_plan_resource_) { + prev_device_id = foreground_plan_resource_->device_id(); + } + if (prev_device_id != device_id) { + auto gpu_manager = GpuManager::get_instance(device_id); + timer_begin_ = gpu_manager->create_event(); + timer_end_ = gpu_manager->create_event(); + flag_ = gpu_manager->malloc_host( + sizeof(int), gpuHostAllocMapped | gpuHostAllocWriteCombined); + stream_ = gpu_manager->create_stream(); + } + PlanResourceKey key(plan, device_id, name); + auto it = plan_resources_.find(key); + if (it == plan_resources_.end()) { + try { + auto plan_json = Json::parse(plan); + auto resource = 
std::make_shared( + plan_json, device_id, name, comm_resource_); + plan_resources_[key] = resource; + foreground_plan_resource_ = resource; + } catch (const ::nlohmann::json::parse_error &e) { + ERR(InvalidUsageError, "Failed to parse the plan JSON: ", e.what()); + } + } else { + foreground_plan_resource_ = it->second; } - kernel_->compile(); } void Executor::Impl::launch( - Stream stream, bool loop_mode, - const std::unordered_map &placeholder_data) { - if ((kernel_ == nullptr) || !kernel_->is_compiled()) { + const std::unordered_map &placeholder_data, Stream stream, + bool loop_mode, bool record) { + if (!foreground_plan_resource_) { ERR(InvalidUsageError, "Need to compile first before launch."); } if (is_launched_) { LOG(WARN, "Ignore launching twice."); return; } - if (stream) { - stream_raw_ = reinterpret_cast(stream); - } else { - stream_ = GpuManager::get_instance(device_id_)->create_stream(); - stream_raw_ = stream_->get(); - } - loop_mode_ = loop_mode; - - if (loop_mode_) { - kernel_name_ = "ark_loop_kernel"; - } else { - kernel_name_ = "ark_kernel"; - } - - auto get_global_rt = [&](const std::string &symbol) { - return reinterpret_cast(kernel_->get_global(symbol)); - }; - if (world_size_ > 1) { - void *proxy_chan_addr = get_global_rt("ARK_PROXY_CHANS"); - void *proxy_secondary_chan_addr = - get_global_rt("ARK_PROXY_SECONDARY_CHANS"); - void *sm_chan_addr = get_global_rt("ARK_SM_CHANS"); - std::vector proxy_handles( - world_size_); - std::vector - proxy_secondary_handles(world_size_); - std::vector sm_handles(world_size_); - for (int i = 0; i < world_size_; i++) { - auto it = rank_to_proxy_channels_.find(i); - if (it != rank_to_proxy_channels_.end() && it->second.size() > 0) { - proxy_handles[i] = it->second[0]->deviceHandle(); - if (it->second.size() > 1) { - proxy_secondary_handles[i] = it->second[1]->deviceHandle(); - } - } - auto it2 = rank_to_sm_channels_.find(i); - if (it2 != rank_to_sm_channels_.end() && it2->second.size() > 0) { - sm_handles[i] = it2->second[0]->deviceHandle(); - } + for (const auto &[tensor, ptr] : placeholder_data) { + if (tensor.ref()->data(ptr) != ptr) { + ERR(InvalidUsageError, + "Placeholder data must be external tensors."); } - GLOG(gpuSetDevice(device_id_)); - GLOG(gpuMemcpyAsync( - proxy_chan_addr, proxy_handles.data(), - proxy_handles.size() * - sizeof(mscclpp::SimpleProxyChannel::DeviceHandle), - gpuMemcpyHostToDevice, stream_raw_)); - GLOG(gpuMemcpyAsync( - proxy_secondary_chan_addr, proxy_secondary_handles.data(), - proxy_secondary_handles.size() * - sizeof(mscclpp::SimpleProxyChannel::DeviceHandle), - gpuMemcpyHostToDevice, stream_raw_)); - GLOG(gpuMemcpyAsync( - sm_chan_addr, sm_handles.data(), - sm_handles.size() * sizeof(mscclpp::SmChannel::DeviceHandle), - gpuMemcpyHostToDevice, stream_raw_)); - GLOG(gpuStreamSynchronize(stream_raw_)); } + stream_raw_ = stream ? reinterpret_cast(stream) : stream_->get(); + loop_mode_ = loop_mode; elapsed_msec_ = -1; - timer_begin_->record(stream_raw_); - if (world_size_ > 1) { - proxy_service_->startProxy(); + if (record) { + timer_begin_->record(stream_raw_); + is_recording_ = true; + } + if (comm_resource_) { + comm_resource_->proxy_service()->startProxy(); } if (loop_mode_) { // Initialize loop flags. atomicStoreRelaxed(flag_->ref(), 0); - void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref(); + auto buffer = foreground_plan_resource_->buffer(); + void *buf_ptr = buffer ? 
buffer->ref() : nullptr; void *flag_ptr = flag_->ref(); std::vector args = {&buf_ptr, &flag_ptr}; - auto addr_args = add_kernel_addr(placeholder_data); - for (auto &ptr : addr_args) { - args.push_back(&ptr); - } - kernel_->launch(kernel_name_, stream_raw_, args); + foreground_plan_resource_->launch_kernel("ark_loop_kernel", args, + stream_raw_); } - is_recording_ = true; is_launched_ = true; } void Executor::Impl::run( int iter, const std::unordered_map &placeholder_data) { + for (const auto &[tensor, ptr] : placeholder_data) { + if (tensor.ref()->data(ptr) != ptr) { + ERR(InvalidUsageError, + "Placeholder data must be external tensors."); + } + } if (iter <= 0) return; if (loop_mode_) { while (atomicLoadRelaxed(flag_->ref()) > 0) { } atomicStoreRelaxed(flag_->ref(), iter); } else { - void *buf_ptr = (buffers_.empty()) ? nullptr : buffers_.back()->ref(); + auto buffer = foreground_plan_resource_->buffer(); + void *buf_ptr = buffer ? buffer->ref() : nullptr; int i = 0; std::vector args = {&buf_ptr, reinterpret_cast(&i)}; - auto addr_arg = add_kernel_addr(placeholder_data); - for (auto &ptr : addr_arg) { - args.push_back(&ptr); - } for (; i < iter; i++) { - kernel_->launch(kernel_name_, stream_raw_, args); + foreground_plan_resource_->launch_kernel("ark_kernel", args, + stream_raw_); } } } @@ -912,37 +1025,42 @@ float Executor::Impl::stop(int64_t max_spin_count) { is_recording_ = false; } is_launched_ = false; - if (world_size_ > 1) { - proxy_service_->stopProxy(); + if (comm_resource_) { + comm_resource_->proxy_service()->stopProxy(); } return elapsed_msec_; } void Executor::Impl::barrier() { - if (world_size_ > 1) { - comm_->bootstrap()->barrier(); + if (comm_resource_) { + comm_resource_->bootstrap()->barrier(); } } -void *Executor::Impl::tensor_address(const Tensor &tensor) const { +std::shared_ptr Executor::Impl::get_buffer_info( + const Tensor &tensor) const { size_t buffer_id = tensor.ref()->buffer()->id(); - auto &ext_buf_reg = ExternalBufferRegistry::get_instance(); - void *ext_data = ext_buf_reg.get(buffer_id); - if (ext_data) { - return ext_data; - } - if (buffer_id_to_addr_.find(buffer_id) == buffer_id_to_addr_.end()) { - ERR(InvalidUsageError, "Tensor has an unknown buffer ID ", buffer_id, - ". This is likely caused by accessing a tensor that is optimized " + auto &buf_reg = BufferRegistry::get_instance(); + auto info = buf_reg.get(buffer_id); + if (!info || !(info->data)) { + ERR(InvalidUsageError, + "Tensor has no allocated memory. 
" + "This is likely caused by accessing a tensor that is optimized " "out by the compiler or not used in any plan passed to the " "executor."); } - return buffer_id_to_addr_.at(buffer_id); + return info; +} + +void *Executor::Impl::tensor_address(const Tensor &tensor) const { + return get_buffer_info(tensor)->data; } void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes, Stream stream, bool is_d2d) const { - GLOG(gpuSetDevice(device_id_)); + auto info = get_buffer_info(tensor); + size_t device_id = info->device_id; + GLOG(gpuSetDevice(device_id)); std::shared_ptr copy_stream; gpuStream copy_stream_raw; if (stream) { @@ -953,7 +1071,7 @@ void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes, "may cause a deadlock."); } } else { - copy_stream = GpuManager::get_instance(device_id_)->create_stream(); + copy_stream = GpuManager::get_instance(device_id)->create_stream(); copy_stream_raw = copy_stream->get(); } size_t tensor_data_bytes = @@ -963,7 +1081,7 @@ void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes, ") mismatches the tensor data bytes (", tensor_data_bytes, ")."); } auto kind = (is_d2d) ? gpuMemcpyDeviceToDevice : gpuMemcpyDeviceToHost; - void *src = tensor_address(tensor); + void *src = info->data; if (tensor.strides() == tensor.shape()) { GLOG(gpuMemcpyAsync(data, src, bytes, kind, copy_stream_raw)); } else { @@ -993,7 +1111,9 @@ void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes, void Executor::Impl::tensor_write(const Tensor &tensor, const void *data, size_t bytes, Stream stream, bool is_d2d) const { - GLOG(gpuSetDevice(device_id_)); + auto info = get_buffer_info(tensor); + size_t device_id = info->device_id; + GLOG(gpuSetDevice(device_id)); std::shared_ptr copy_stream; gpuStream copy_stream_raw; if (stream) { @@ -1004,7 +1124,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data, "may cause a deadlock."); } } else { - copy_stream = GpuManager::get_instance(device_id_)->create_stream(); + copy_stream = GpuManager::get_instance(device_id)->create_stream(); copy_stream_raw = copy_stream->get(); } size_t tensor_data_bytes = @@ -1016,7 +1136,7 @@ void Executor::Impl::tensor_write(const Tensor &tensor, const void *data, size_t tensor_bytes = tensor.strides().nelems() * tensor.data_type().bytes(); auto kind = (is_d2d) ? 
gpuMemcpyDeviceToDevice : gpuMemcpyHostToDevice; - void *dst = tensor_address(tensor); + void *dst = info->data; if (tensor.strides() == tensor.shape()) { GLOG(gpuMemcpyAsync(dst, data, tensor_bytes, kind, copy_stream_raw)); } else { @@ -1057,7 +1177,7 @@ std::shared_ptr Executor::buffer() const { return impl_->buffer(); } std::string Executor::plan() const { return impl_->plan(); } -const std::string &Executor::name() const { return impl_->name(); } +std::string Executor::name() const { return impl_->name(); } void Executor::compile(const std::string &plan, int device_id, const std::string &name) { @@ -1065,9 +1185,9 @@ void Executor::compile(const std::string &plan, int device_id, } void Executor::launch( - Stream stream, bool loop_mode, - const std::unordered_map &placeholder_data) { - impl_->launch(stream, loop_mode, placeholder_data); + const std::unordered_map &placeholder_data, Stream stream, + bool loop_mode, bool record) { + impl_->launch(placeholder_data, stream, loop_mode, record); } void Executor::run(int iter, @@ -1104,8 +1224,8 @@ void Executor::tensor_write(const Tensor &tensor, const void *data, DefaultExecutor::DefaultExecutor( const Model &model, int device_id, Stream stream, const std::vector &config_rules, - const std::string &name, bool loop_mode) - : Executor() { + const std::string &name, bool loop_mode, bool record) + : Executor(), record_(record) { device_id = (device_id < 0) ? (model.rank() % get_env().num_ranks_per_host) : device_id; Planner planner(model, device_id); @@ -1119,8 +1239,9 @@ DefaultExecutor::DefaultExecutor( void DefaultExecutor::launch( const std::unordered_map &placeholder_data) { - Executor::launch(reinterpret_cast(impl_->stream_raw_), - impl_->loop_mode_, placeholder_data); + Executor::launch(placeholder_data, + reinterpret_cast(impl_->stream_raw_), + impl_->loop_mode_, record_); } } // namespace ark diff --git a/ark/api/tensor.cpp b/ark/api/tensor.cpp index fc44b4a58..103fb8896 100644 --- a/ark/api/tensor.cpp +++ b/ark/api/tensor.cpp @@ -68,6 +68,27 @@ Dims Tensor::torch_strides() const { return Dims(); } +void *Tensor::data() const { + if (ref_) { + return ref_->data(); + } + return nullptr; +} + +void *Tensor::data(void *data) { + if (ref_) { + return ref_->data(data); + } + return nullptr; +} + +bool Tensor::is_external() const { + if (ref_) { + return ref_->is_external(); + } + return false; +} + std::ostream &operator<<(std::ostream &os, const Tensor &tensor) { if (tensor.is_null()) { os << "null"; diff --git a/ark/buffer_registry.cpp b/ark/buffer_registry.cpp new file mode 100644 index 000000000..00c5ea28e --- /dev/null +++ b/ark/buffer_registry.cpp @@ -0,0 +1,34 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
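// [Editor's sketch, not part of the original patch] Minimal intended usage
// of this registry, with a hypothetical buffer id and device pointer; it
// subsumes the old two-argument ExternalBufferRegistry::set(id, data) call
// (that file is deleted below):
//
//   auto &reg = ark::BufferRegistry::get_instance();
//   // device_id < 0 asks set() to discover the device of `dev_ptr` via
//   // gpuPointerGetAttributes().
//   reg.set(/*id=*/42, dev_ptr, /*device_id=*/-1, /*is_external=*/true);
//   if (auto info = reg.get(42)) {
//       void *data = info->data;       // raw address
//       int device = info->device_id;  // owning GPU
//       bool ext = info->is_external;  // externally managed?
//   }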
+ +#include "buffer_registry.hpp" + +#include "gpu/gpu_logging.hpp" + +namespace ark { + +BufferRegistry &BufferRegistry::get_instance() { + static BufferRegistry instance; + return instance; +} + +void BufferRegistry::set(size_t id, void *data, int device_id, + bool is_external) { + if (data != nullptr && device_id < 0) { + gpuPointerAttributes attr; + GLOG(gpuPointerGetAttributes(&attr, data)); + device_id = attr.device; + } + buffers_[id] = + std::make_shared(data, device_id, is_external); +} + +std::shared_ptr BufferRegistry::get(size_t id) const { + auto it = buffers_.find(id); + if (it != buffers_.end()) { + return it->second; + } + return nullptr; +} + +} // namespace ark diff --git a/ark/buffer_registry.hpp b/ark/buffer_registry.hpp new file mode 100644 index 000000000..81a26e722 --- /dev/null +++ b/ark/buffer_registry.hpp @@ -0,0 +1,41 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_BUFFER_REGISTRY_HPP_ +#define ARK_BUFFER_REGISTRY_HPP_ + +#include +#include + +namespace ark { + +/// Manages addresses of all allocated buffers including externally managed +/// buffers. +class BufferRegistry { + public: + struct Info { + Info(void *data, int device_id, bool is_external) + : data(data), device_id(device_id), is_external(is_external) {} + void *data; + int device_id; + bool is_external; + }; + + ~BufferRegistry() {} + + static BufferRegistry &get_instance(); + + void set(size_t id, void *data, int device_id, bool is_external); + + std::shared_ptr get(size_t id) const; + + private: + std::unordered_map> buffers_; + BufferRegistry() {} + BufferRegistry(const BufferRegistry &) = delete; + BufferRegistry &operator=(const BufferRegistry &) = delete; +}; + +} // namespace ark + +#endif // ARK_BUFFER_REGISTRY_HPP_ diff --git a/ark/codegen.cpp b/ark/codegen.cpp index 04c5887fc..23045b0c7 100644 --- a/ark/codegen.cpp +++ b/ark/codegen.cpp @@ -7,8 +7,8 @@ #include #include "ark/data_type.hpp" +#include "buffer_registry.hpp" #include "env.h" -#include "external_buffer_registry.hpp" #include "file_io.h" #include "logging.hpp" #include "model/model_buffer.hpp" @@ -56,9 +56,7 @@ class CodeGenerator::Impl { public: Impl(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::map> - &buffer_id_to_kernel_arg, - const std::string &name); + const std::set &extra_buffer_ids, const std::string &name); ~Impl() = default; private: @@ -83,7 +81,7 @@ class CodeGenerator::Impl { friend class CodeGenerator; std::map buffer_id_to_offset_; - std::map> buffer_id_to_kernel_arg_; + std::set extra_buffer_ids_; std::string name_; int rank_; int world_size_; @@ -94,11 +92,10 @@ class CodeGenerator::Impl { CodeGenerator::Impl::Impl(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::map> - &buffer_id_to_kernel_arg, + const std::set &extra_buffer_ids, const std::string &name) : buffer_id_to_offset_(buffer_id_to_offset), - buffer_id_to_kernel_arg_(buffer_id_to_kernel_arg), + extra_buffer_ids_(extra_buffer_ids), name_(name) { rank_ = plan.at("Rank"); world_size_ = plan.at("WorldSize"); @@ -191,8 +188,8 @@ CodeGenerator::Impl::Impl(const PlanJson &plan, // Generate the global arguments std::stringstream global_args_ss, function_args_ss, arg_types_ss; - for (const auto &[buf_id, kernel_arg] : buffer_id_to_kernel_arg_) { - const auto &arg_name = kernel_arg.first; + for (auto buf_id : extra_buffer_ids_) { + std::string arg_name = "_ext_buf_" + std::to_string(buf_id); global_args_ss << "void *" << arg_name << ", "; function_args_ss << arg_name << 
", "; arg_types_ss << "void *, "; @@ -263,6 +260,7 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) { } ss << "__device__ void t" << task_json["Id"] << "(char *_buf, int _idx, int _spw, @GLOBAL_ARGS@) {\n"; + auto &buf_reg = BufferRegistry::get_instance(); op_idx = 0; for (auto &op_json : task_json["Ops"]) { auto op = ModelOp::deserialize(op_json); @@ -273,29 +271,36 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) { if (arg.type_name() == "TENSOR") { auto tns = arg.value(); size_t buffer_id = tns->buffer()->id(); - auto it = buffer_id_to_kernel_arg_.find(buffer_id); - if (it == buffer_id_to_kernel_arg_.end()) { - size_t buffer_offset = buffer_id_to_offset_.at(buffer_id); + auto it = buffer_id_to_offset_.find(buffer_id); + auto buf_info = buf_reg.get(buffer_id); + if ((buf_info && buf_info->is_external) || + (it == buffer_id_to_offset_.end())) { + ss << "(" << tns->data_type()->type_str() << "*)_ext_buf_" + << buffer_id; + } else { + size_t buffer_offset; + buffer_offset = it->second; size_t offset = buffer_offset + ModelOffset(tns).value(); ss << "(" << tns->data_type()->type_str() << "*)&_buf[" << offset << "]"; - } else { - const auto &name = it->second.first; - ss << "(" << tns->data_type()->type_str() << "*)" << name; } } else if (arg.type_name() == "OFFSET") { auto moff = arg.value(); size_t buffer_id = moff.buffer_id(); - auto it = buffer_id_to_kernel_arg_.find(buffer_id); - if (it == buffer_id_to_kernel_arg_.end()) { - size_t buffer_offset = buffer_id_to_offset_.at(buffer_id); + auto buf_info = buf_reg.get(buffer_id); + if (buf_info && buf_info->is_external) { + size_t offset = moff.value(); + ss << "(uint64_t)((char*)_ext_buf_" << buffer_id << " + " + << offset << ")"; + } else { + size_t buffer_offset; + auto it = buffer_id_to_offset_.find(buffer_id); + if (it == buffer_id_to_offset_.end()) { + ERR(InternalError, "buffer ID not found: ", buffer_id); + } + buffer_offset = it->second; size_t offset = buffer_offset + moff.value(); ss << offset; - } else { - const auto &name = it->second.first; - size_t offset = moff.value(); - ss << "(uint64_t)((char*)" << name << " + " << offset - << ")"; } } else { ss << arg.serialize().begin().value(); @@ -496,11 +501,9 @@ std::string CodeGenerator::Impl::sync_process_range(const Range &range, CodeGenerator::CodeGenerator( const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::map> - &buffer_id_to_kernel_arg, - const std::string &name) - : impl_(std::make_shared(plan, buffer_id_to_offset, - buffer_id_to_kernel_arg, name)) {} + const std::set &extra_buffer_ids, const std::string &name) + : impl_(std::make_shared(plan, buffer_id_to_offset, extra_buffer_ids, + name)) {} std::string CodeGenerator::code() const { return impl_->code_; } diff --git a/ark/codegen.hpp b/ark/codegen.hpp index 0fccc46e3..9f5947deb 100644 --- a/ark/codegen.hpp +++ b/ark/codegen.hpp @@ -6,8 +6,8 @@ #include #include +#include #include -#include #include "model/model_json.hpp" @@ -17,8 +17,7 @@ class CodeGenerator { public: CodeGenerator(const PlanJson &plan, const std::map &buffer_id_to_offset, - const std::map> - &buffer_id_to_kernel_arg, + const std::set &extra_buffer_ids, const std::string &name = "ark_kernel"); ~CodeGenerator() = default; diff --git a/ark/external_buffer_registry.cpp b/ark/external_buffer_registry.cpp deleted file mode 100644 index 912050d0d..000000000 --- a/ark/external_buffer_registry.cpp +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include "external_buffer_registry.hpp" - -#include "logging.hpp" - -namespace ark { - -ExternalBufferRegistry &ExternalBufferRegistry::get_instance() { - static ExternalBufferRegistry instance; - return instance; -} - -void ExternalBufferRegistry::set(const size_t id, void *data) { - buffers_[id] = data; -} - -void *ExternalBufferRegistry::get(const size_t id) const { - auto it = buffers_.find(id); - if (it != buffers_.end()) { - return it->second; - } - return nullptr; -} - -void ExternalBufferRegistry::clear() { buffers_.clear(); } - -} // namespace ark diff --git a/ark/external_buffer_registry.hpp b/ark/external_buffer_registry.hpp deleted file mode 100644 index ab199bafc..000000000 --- a/ark/external_buffer_registry.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -#ifndef ARK_EXTERNAL_BUFFER_REGISTRY_HPP_ -#define ARK_EXTERNAL_BUFFER_REGISTRY_HPP_ - -#include - -namespace ark { -// Manages externally allocated buffers (buffers corresponding to Tensors that -// are the output of a `placeholder` operation) outside of ARK's memory space. -class ExternalBufferRegistry { - public: - static ExternalBufferRegistry &get_instance(); - - void set(const size_t id, void *data); - - void *get(const size_t id) const; - - void clear(); - - private: - // Maps buffer IDs to pointers and sizes. - std::unordered_map buffers_; - ExternalBufferRegistry() {} - ExternalBufferRegistry(const ExternalBufferRegistry &) = delete; - ExternalBufferRegistry &operator=(const ExternalBufferRegistry &) = delete; -}; -} // namespace ark - -#endif // ARK_EXTERNAL_BUFFER_REGISTRY_HPP_ diff --git a/ark/gpu/gpu_event.cpp b/ark/gpu/gpu_event.cpp index 06779b91a..9f91e384d 100644 --- a/ark/gpu/gpu_event.cpp +++ b/ark/gpu/gpu_event.cpp @@ -7,21 +7,25 @@ #include "gpu/gpu_manager.hpp" namespace ark { + class GpuEvent::Impl { public: - Impl(bool disable_timing); + Impl(int device_id, bool disable_timing); ~Impl(); Impl(const Impl&) = delete; Impl& operator=(const Impl&) = delete; + int device_id() const { return device_id_; } void record(gpuStream stream); float elapsed_msec(const GpuEvent& other) const; private: + int device_id_; gpuEvent event_; }; -GpuEvent::Impl::Impl(bool disable_timing) { +GpuEvent::Impl::Impl(int device_id, bool disable_timing) + : device_id_(device_id) { unsigned int flags = 0; if (disable_timing) { flags |= gpuEventDisableTiming; @@ -41,8 +45,10 @@ float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const { return elapsed; } -GpuEvent::GpuEvent(bool disable_timing) - : pimpl_(std::make_shared(disable_timing)) {} +GpuEvent::GpuEvent(int device_id, bool disable_timing) + : pimpl_(std::make_shared(device_id, disable_timing)) {} + +int GpuEvent::device_id() const { return pimpl_->device_id(); } void GpuEvent::record(gpuStream stream) { pimpl_->record(stream); } diff --git a/ark/gpu/gpu_event.hpp b/ark/gpu/gpu_event.hpp index bd2a7c952..2180f1320 100644 --- a/ark/gpu/gpu_event.hpp +++ b/ark/gpu/gpu_event.hpp @@ -19,13 +19,14 @@ class GpuEvent { GpuEvent(const GpuEvent &) = delete; GpuEvent &operator=(const GpuEvent &) = delete; + int device_id() const; void record(gpuStream stream); float elapsed_msec(const GpuEvent &other) const; protected: friend class GpuManager; - GpuEvent(bool disable_timing = false); + GpuEvent(int device_id, bool disable_timing = false); private: class Impl; diff --git a/ark/gpu/gpu_manager.cpp b/ark/gpu/gpu_manager.cpp index 2b5be490b..9c49cfbc6 100644 --- a/ark/gpu/gpu_manager.cpp +++ 
b/ark/gpu/gpu_manager.cpp @@ -118,7 +118,8 @@ std::shared_ptr GpuManager::malloc_host(size_t bytes, } std::shared_ptr GpuManager::create_event(bool disable_timing) const { - return std::shared_ptr(new GpuEvent(disable_timing)); + return std::shared_ptr( + new GpuEvent(pimpl_->gpu_id_, disable_timing)); } std::shared_ptr GpuManager::create_stream() const { diff --git a/ark/gpu/gpu_manager.hpp b/ark/gpu/gpu_manager.hpp index eeeda4d94..71f47e670 100644 --- a/ark/gpu/gpu_manager.hpp +++ b/ark/gpu/gpu_manager.hpp @@ -16,7 +16,7 @@ namespace ark { class GpuManager { public: - static std::shared_ptr get_instance(int gpu_id); + static std::shared_ptr get_instance(int device_id); GpuManager(const GpuManager &) = delete; ~GpuManager() = default; @@ -54,7 +54,7 @@ class GpuManager { }; private: - GpuManager(int gpu_id); + GpuManager(int device_id); class Impl; std::shared_ptr pimpl_; diff --git a/ark/include/ark/executor.hpp b/ark/include/ark/executor.hpp index fafc9066c..2e97ffe78 100644 --- a/ark/include/ark/executor.hpp +++ b/ark/include/ark/executor.hpp @@ -39,16 +39,17 @@ class Executor { /// Return the plan string. std::string plan() const; - const std::string &name() const; + /// Return the name of the executor. + std::string name() const; /// Compile the model. This must be called before `launch()`. void compile(const std::string &plan, int device_id, const std::string &name = "executor"); /// Launch the executor. This must be called after `compile()`. - void launch( - Stream stream = nullptr, bool loop_mode = true, - const std::unordered_map &placeholder_data = {}); + void launch(const std::unordered_map &placeholder_data = {}, + Stream stream = nullptr, bool loop_mode = true, + bool record = false); /// Run the executor for `iter` iterations. void run( @@ -108,11 +109,14 @@ class DefaultExecutor : public Executor { Stream stream = nullptr, const std::vector &config_rules = {}, const std::string &name = "DefaultExecutor", - bool loop_mode = true); + bool loop_mode = true, bool record = false); /// Launch the default executor. 
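// [Editor's sketch, not part of the original patch] Typical end-to-end use
// of the revised Executor API, assuming Executor's default constructor and
// hypothetical plan/tensor/pointer values:
//
//   ark::Executor exe;
//   exe.compile(plan_json_str, /*device_id=*/0);
//   exe.launch({{placeholder_tensor, dev_ptr}}, /*stream=*/nullptr,
//              /*loop_mode=*/true, /*record=*/true);
//   exe.run(100);
//   exe.wait(/*max_spin_count=*/-1);
//   float msec = exe.stop(/*max_spin_count=*/-1);
//   // stop() returns a valid elapsed time only when record == true.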
void launch( const std::unordered_map &placeholder_data = {}); + + private: + bool record_; }; } // namespace ark diff --git a/ark/include/ark/tensor.hpp b/ark/include/ark/tensor.hpp index c2d9dbe94..aa8dcaa68 100644 --- a/ark/include/ark/tensor.hpp +++ b/ark/include/ark/tensor.hpp @@ -52,6 +52,12 @@ class Tensor { const DataType &data_type() const; Dims torch_strides() const; + + void *data() const; + + void *data(void *data); + + bool is_external() const; }; const Tensor NullTensor; diff --git a/ark/include/kernels/gemm_cutlass.h b/ark/include/kernels/gemm_cutlass.h index 80d377290..c5e8c7579 100644 --- a/ark/include/kernels/gemm_cutlass.h +++ b/ark/include/kernels/gemm_cutlass.h @@ -45,754 +45,53 @@ struct GemmThreadblockSwizzle { } }; -template -struct GemmConfiguration; - -//////////////////////////////////////////////////////////////////////////////// -/// SM70 FP16 -//////////////////////////////////////////////////////////////////////////////// - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, cutlass::gemm::GemmShape<8, 8, 4>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 2>; -}; - -//////////////////////////////////////////////////////////////////////////////// -/// SM80 FP16 -//////////////////////////////////////////////////////////////////////////////// - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / 
cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<256, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<64, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 6>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, 
cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<256, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<256, 64, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 6>; -}; - -template -struct GemmConfiguration> { - using 
ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 6>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::half_t; - using ElementAccumulator = cutlass::half_t; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 10>; -}; - -//////////////////////////////////////////////////////////////////////////////// -/// SM80 BF16 -//////////////////////////////////////////////////////////////////////////////// - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<256, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 128, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, 
ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<256, 64, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<64, 256, 64>, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<64, 128, 64>, - cutlass::gemm::GemmShape<32, 64, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 64, 64>, - cutlass::gemm::GemmShape<64, 32, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<64, 64, 64>, - cutlass::gemm::GemmShape<32, 32, 64>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 6>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template 
-struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; +template +struct InstructionShape; - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<256, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; +template +struct InstructionShape { + using value = cutlass::gemm::GemmShape<8, 8, 4>; }; -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; +template +struct InstructionShape { + static constexpr int K = std::is_same_v ? 8 : 16; + using value = cutlass::gemm::GemmShape<16, 8, K>; }; -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<256, 64, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<64, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 4>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<64, 128, 32>, - cutlass::gemm::GemmShape<32, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 6>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = 
cutlass::gemm::device::Gemm< - cutlass::bfloat16_t, LayoutA, cutlass::bfloat16_t, LayoutB, - ElementOutput, LayoutC, ElementAccumulator, - cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 64, 32>, - cutlass::gemm::GemmShape<64, 32, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 6>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = cutlass::bfloat16_t; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - cutlass::half_t, LayoutA, cutlass::half_t, LayoutB, ElementOutput, - LayoutC, ElementAccumulator, cutlass::arch::OpClassTensorOp, - cutlass::arch::Sm80, cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16, 8, 16>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 10>; -}; - -//////////////////////////////////////////////////////////////////////////////// -/// SM80 FP32 -//////////////////////////////////////////////////////////////////////////////// - -template -struct GemmConfiguration< - UnitOp, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, float, LayoutA, - float, LayoutB, float, LayoutC, cutlass::gemm::GemmShape<128, 256, 32>> { - using ElementOutput = float; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, float, LayoutB, ElementOutput, LayoutC, - ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 256, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 8>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration< - UnitOp, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, float, LayoutA, - float, LayoutB, float, LayoutC, cutlass::gemm::GemmShape<128, 128, 32>> { - using ElementOutput = float; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, float, LayoutB, ElementOutput, LayoutC, - ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<128, 128, 32>, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<16, 8, 8>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, - ElementAccumulator, ElementAccumulator>, - ark::GemmThreadblockSwizzle, 3>; -}; - -template -struct GemmConfiguration> { - using ElementOutput = float; - using ElementAccumulator = float; - - using Gemm = cutlass::gemm::device::Gemm< - float, LayoutA, float, LayoutB, ElementOutput, LayoutC, - ElementAccumulator, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm80, - cutlass::gemm::GemmShape<64, 64, 32>, - cutlass::gemm::GemmShape<32, 32, 32>, - cutlass::gemm::GemmShape<16, 8, 8>, - cutlass::epilogue::thread::LinearCombination< - ElementOutput, 128 / cutlass::sizeof_bits::value, +template +struct GemmConfiguration { + // Supports float, half, and bfloat16. 
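+    // A single generic configuration replaces the fixed per-tile-shape
+    // specializations removed above: the warp tiling is derived from
+    // UnitOp::NumWarps (NumWarpsN is a power of two, NumWarpsM covers the
+    // rest), and the MMA instruction shape is chosen per architecture and
+    // element type through the InstructionShape helper.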
+ static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v, + "ElementA must be float, half, or bfloat16"); + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v, + "ElementB must be float, half, or bfloat16"); + static_assert(std::is_same_v || + std::is_same_v || + std::is_same_v, + "ElementC must be float, half, or bfloat16"); + using ElementAccumulator = typename std::conditional_t< + std::is_same_v, float, ElementC>; + static constexpr int NumWarps = UnitOp::NumWarps; + static constexpr int NumWarpsN = + 1 << math::div_up::value, 2>::value; + static constexpr int NumWarpsM = NumWarps / NumWarpsN; + using WarpShape = + cutlass::gemm::GemmShape; + using InstShape = typename InstructionShape::value; + using Gemm = cutlass::gemm::device::Gemm< + ElementA, LayoutA, ElementB, LayoutB, ElementC, LayoutC, + ElementAccumulator, OperatorClass, ArchTag, Shape, WarpShape, InstShape, + cutlass::epilogue::thread::LinearCombination< + ElementC, 128 / cutlass::sizeof_bits::value, ElementAccumulator, ElementAccumulator>, ark::GemmThreadblockSwizzle, 3>; }; diff --git a/ark/model/model_buffer.cpp b/ark/model/model_buffer.cpp index 5e2409537..a54b6e81f 100644 --- a/ark/model/model_buffer.cpp +++ b/ark/model/model_buffer.cpp @@ -3,7 +3,7 @@ #include "model_buffer.hpp" -#include "external_buffer_registry.hpp" +#include "buffer_registry.hpp" #include "logging.hpp" namespace ark { @@ -19,6 +19,9 @@ ModelBuffer::ModelBuffer(size_t id, int rank, bool is_external, const std::vector &send_tags, const std::vector &recv_tags) : id_(id), rank_(rank), is_external_(is_external) { + if (is_external && (!send_tags.empty() || !recv_tags.empty())) { + ERR(ModelError, "External buffer cannot have send or receive tags"); + } for (const auto &info : send_tags) { send_tags_.insert(info); } @@ -35,6 +38,22 @@ void ModelBuffer::tag_recv(int remote_rank, int tag) { recv_tags_.insert(TagInfo{remote_rank, tag}); } +void *ModelBuffer::data() const { + auto info = BufferRegistry::get_instance().get(id_); + if (info) { + return info->data; + } + return nullptr; +} + +void *ModelBuffer::data(void *data) { + if (is_external_) { + BufferRegistry::get_instance().set(id_, data, -1, true); + return data; + } + return nullptr; +} + Json ModelBuffer::serialize() const { Json j; j["Id"] = id_; diff --git a/ark/model/model_buffer.hpp b/ark/model/model_buffer.hpp index 8b66356b1..d52f2bf26 100644 --- a/ark/model/model_buffer.hpp +++ b/ark/model/model_buffer.hpp @@ -43,6 +43,14 @@ class ModelBuffer { // but the same tag can only be used for one receiving buffer. void tag_recv(int remote_rank, int tag); + // Return the underlying data pointer if this buffer is allocated. + // Otherwise, return nullptr. + void *data() const; + + // Set the underlying data pointer if this buffer is externally managed. + // Return the input data pointer. Otherwise, return nullptr. 
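+    // Intended for externally managed buffers (e.g. those backing
+    // `placeholder` tensors), whose memory is owned by the caller rather
+    // than by ARK's allocator.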
+ void *data(void *data); + Json serialize() const; static std::shared_ptr deserialize(const Json &serialized); diff --git a/ark/model/model_tensor.cpp b/ark/model/model_tensor.cpp index 713fbf62c..068783045 100644 --- a/ark/model/model_tensor.cpp +++ b/ark/model/model_tensor.cpp @@ -92,6 +92,16 @@ size_t ModelTensor::shape_bytes() const { return shape_.nelems() * data_type_->bytes(); } +void *ModelTensor::data() const { + return buffer_->data(); +} + +void *ModelTensor::data(void *data) { + return buffer_->data(data); +} + +bool ModelTensor::is_external() const { return buffer_->is_external(); } + Json ModelTensor::serialize() const { Json j; j["Id"] = id_; diff --git a/ark/model/model_tensor.hpp b/ark/model/model_tensor.hpp index 7c7afac2c..8c892f2b4 100644 --- a/ark/model/model_tensor.hpp +++ b/ark/model/model_tensor.hpp @@ -37,6 +37,12 @@ class ModelTensor { size_t shape_bytes() const; + void *data() const; + + void *data(void *data); + + bool is_external() const; + Json serialize() const; static std::shared_ptr deserialize(const Json &serialized); diff --git a/ark/ops/ops_communication_test.cpp b/ark/ops/ops_communication_test.cpp index 39c466909..de7c42833 100644 --- a/ark/ops/ops_communication_test.cpp +++ b/ark/ops/ops_communication_test.cpp @@ -59,8 +59,8 @@ ark::unittest::State test_communication_send_recv_unidir() { ark::Model model(gpu_id, 2); ark::Tensor tns = model.tensor({1024}, ark::FP16); if (gpu_id == 1) { - tns = model.send(tns, 0, 0); - model.send_done(tns); + auto out_tns = model.send(tns, 0, 0); + model.send_done(out_tns); } if (gpu_id == 0) { tns = model.recv(tns, 1, 0); diff --git a/ark/ops/ops_placeholder.cpp b/ark/ops/ops_placeholder.cpp index 73c1c1b25..b654aac39 100644 --- a/ark/ops/ops_placeholder.cpp +++ b/ark/ops/ops_placeholder.cpp @@ -3,7 +3,7 @@ #include "ops_placeholder.hpp" -#include "external_buffer_registry.hpp" +#include "buffer_registry.hpp" #include "logging.hpp" #include "ops_common.hpp" @@ -18,7 +18,7 @@ ModelOpPlaceholder::ModelOpPlaceholder(ModelBufferRef buffer, const Dims &shape, buffer = std::make_shared(-1, true); } - ExternalBufferRegistry::get_instance().set(buffer->id(), data); + BufferRegistry::get_instance().set(buffer->id(), data, -1, true); ModelTensorRef tensor = std::make_shared( data_type, buffer, shape, strides, offsets, padded_shape); diff --git a/examples/ffn/Makefile b/examples/ffn/Makefile deleted file mode 100644 index 996f8a187..000000000 --- a/examples/ffn/Makefile +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -ARK_ROOT ?= /usr/local/ark -CUDIR ?= /usr/local/cuda - -CXX := g++ -CXXFLAGS := -std=c++17 -Wall -Wextra -INCLUDE := -I$(ARK_ROOT)/include -I $(CUDIR)/include -I$(ARK_ROOT)/include/kernels -LDFLAGS := -L$(CUDIR)/lib64/stubs -Wl,-rpath,$(CUDIR)/lib64 -LDLIBS := -lcuda -lnvidia-ml -lnvrtc -lpthread -lrt -libverbs -lnuma - -all: build/ffn - -build/ffn: build/ffn.o - $(CXX) -o $@ $< -L$(ARK_ROOT)/lib -lark $(LDFLAGS) $(LDLIBS) - -build/ffn.o: ffn.cc - mkdir -p $(@D) - $(CXX) -o $@ $(CXXFLAGS) $(INCLUDE) -c $< - -clean: - rm -r build/ diff --git a/examples/ffn/ffn.cc b/examples/ffn/ffn.cc deleted file mode 100644 index 6eee77a7d..000000000 --- a/examples/ffn/ffn.cc +++ /dev/null @@ -1,450 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "ark.h" -#include "ark_utils.h" - -using namespace std; -using namespace ark; - -void print_tensor(Tensor *tensor, Executor *exe) { - if (tensor == nullptr) { - return; - } - cout << "tensor: " << tensor->name << endl; - size_t tensor_size = tensor->shape_bytes(); - half_t *data = (half_t *)malloc(tensor_size); - exe->tensor_memcpy(data, tensor, tensor_size); - for (int i = 0; i < tensor->size(); ++i) { - cout << data[i] << " "; - } - cout << endl; - delete[] data; -} - -class FullyConnectedLayer { - public: - FullyConnectedLayer(int dim_input, int dim_output, TensorType dtype, - Model &model) - : model{model} { - Tensor *weight = model.tensor({dim_input, dim_output}, dtype); - Tensor *bias = model.tensor({1, dim_output}, dtype); - params = {weight, bias}; - } - - Tensor *forward(Tensor *input) { - this->input = input; - Tensor *weight = params[0]; - Tensor *output1 = model.matmul(input, weight); - Tensor *bias = params[1]; - Tensor *output2 = model.add(output1, bias); - return output2; - } - - Tensor *backward(Tensor *grad) { - Tensor *weight = params[0]; - Tensor *bias = params[1]; - Tensor *grad_output2 = grad; - Tensor *grad_bias = model.tensor(bias->shape, bias->type); - grad_bias = model.scale(grad_output2, 1, grad_bias); - Tensor *grad_output1 = grad_output2; - Tensor *grad_input = model.tensor(input->shape, input->type); - Tensor *grad_weight = model.tensor(weight->shape, weight->type); - grad_input = - model.matmul(grad_output1, weight, nullptr, 1, false, true); - grad_weight = - model.matmul(input, grad_output1, nullptr, 1, true, false); - grads[weight] = grad_weight; - grads[bias] = grad_bias; - return grad_input; - } - - void apply_grads() { - for (auto ¶m : params) { - Tensor *grad = grads[param]; - // the learning rate - Tensor *grad_scale = model.scale(grad, -0.0001); - Tensor *param_identity = model.identity(param); - model.add(param, grad_scale, param_identity); - } - } - - void print_tensors(Executor *exe) { - print_tensor(input, exe); - // print the parameters. 
- for (size_t i = 0; i < params.size(); ++i) { - print_tensor(params[i], exe); - } - } - - Tensor *input; - vector params; - map grads; - Model &model; -}; - -class FFN_Model { - public: - // - FFN_Model(int dim_model, TensorType dtype, Model &model, int layer_num, - int num_gpus, int gpu_id) - : model{model}, num_gpus{num_gpus}, gpu_id{gpu_id} { - for (int i = 0; i < layer_num; ++i) { - FullyConnectedLayer layer{dim_model, dim_model, dtype, model}; - layers.push_back(layer); - } - } - - Model &get_model() { return model; } - - // - Tensor *forward(Tensor *input = nullptr) { - for (size_t i = 0; i < layers.size(); ++i) { - printf("forward layer: %d\n", i); - input = layers[i].forward(input); - } - return input; - } - - // - void backward(Tensor *grad) { - for (int i = layers.size() - 1; i >= 0; --i) { - printf("backward layer: %d\n", i); - grad = layers[i].backward(grad); - } - DimType grads_size = 0; - vector grads; - - for (auto &layer : layers) { - for (auto ¶m : layer.params) { - grads.push_back(layer.grads[param]); - grads_size += layer.grads[param]->size(); - } - } - - // All-reduce gradients - if (num_gpus > 1) { - Tensor *gradients = model.tensor({1, grads_size, 1, 1}, FP16); - Tensor *idn = model.identity(gradients, {grads}); - - model.all_reduce(idn, gpu_id, num_gpus); - } - } - - void print_tensors(Executor *exe) { - for (size_t i = 0; i < layers.size(); ++i) { - printf("layer: %d\n", i); - layers[i].print_tensors(exe); - } - } - - Model &model; - // model parameters. - vector layers; - Tensor *model_input; - int num_gpus; - int gpu_id; -}; - -class LossFn { - public: - LossFn(Model &model) : model{model} {} - - Tensor *forward(Tensor *output, Tensor *ground_truth) { - this->output = output; - printf("loss forward"); - neg_ground_truth = - model.tensor(ground_truth->shape, ground_truth->type); - neg_ground_truth = model.scale(ground_truth, -1, neg_ground_truth); - diff = model.tensor(output->shape, output->type); - model.add(output, neg_ground_truth, diff); - diff1 = model.tensor(diff->shape, diff->type); - model.scale(diff, 1, diff1); - loss_tensor = model.tensor(diff->shape, diff->type); - model.mul(diff, diff1, loss_tensor); - return loss_tensor; - } - - Tensor *backward(Tensor *loss_tensor) { - printf("loss backward"); - grad_diff = model.tensor(diff->shape, diff->type); - model.mul(loss_tensor, diff, grad_diff); - return grad_diff; - } - - void print_tensors(Executor *exe) { - printf("loss_fn.output: "); - print_tensor(this->output, exe); - printf("loss_fn.neg_ground_truth: "); - print_tensor(this->neg_ground_truth, exe); - printf("loss_fn.diff: "); - print_tensor(this->diff, exe); - printf("loss_fn.diff1: "); - print_tensor(this->diff1, exe); - printf("loss_fn.neg_ground_truth: "); - print_tensor(this->neg_ground_truth, exe); - printf("loss_fn.loss_tensor: "); - print_tensor(this->loss_tensor, exe); - printf("loss_fn.grad_diff: "); - print_tensor(this->grad_diff, exe); - } - Tensor *output; - Tensor *loss_tensor; - Tensor *neg_ground_truth; - Tensor *diff; - Tensor *diff1; - Tensor *grad_diff; - Model &model; -}; - -class Trainer { - public: - Trainer(Model &model, int dim_input, int batch_size, int gpu_id, - int num_gpus) - : model{model}, - ffn_model{dim_input, FP16, model, 2, num_gpus, gpu_id}, - loss_fn{model}, - batch_size{batch_size}, - num_gpus{num_gpus}, - gpu_id{gpu_id} { - input = model.tensor({batch_size, dim_input}, FP16); - ground_truth = model.tensor({batch_size, dim_input}, FP16); - output = ffn_model.forward(input); - loss_tensor = loss_fn.forward(output, 
ground_truth); - grad_loss = model.tensor(loss_tensor->shape, loss_tensor->type); - grad_output = loss_fn.backward(grad_loss); - ffn_model.backward(grad_output); - apply_grad(); - - exe = new Executor(gpu_id, gpu_id, (int)num_gpus, model, - "sampleFFN_Model"); - exe->compile(); - } - - void init_data() { - // init the input and ground_truth. - auto data_input = - ark::utils::range_halfs(this->input->shape_bytes(), 1, 0); - exe->tensor_memcpy(this->input, data_input.get(), - this->input->shape_bytes()); - auto data_ground_truth = - ark::utils::range_halfs(this->ground_truth->shape_bytes(), 2, 0); - exe->tensor_memcpy(this->ground_truth, data_ground_truth.get(), - this->ground_truth->shape_bytes()); - // init the grad_loss with 1. - auto data_grad_loss = - ark::utils::range_halfs(this->grad_loss->shape_bytes(), 1, 0); - exe->tensor_memcpy(this->grad_loss, data_grad_loss.get(), - this->grad_loss->shape_bytes()); - // init all the parameters of the model with random values. - for (auto &layer : ffn_model.layers) { - for (auto ¶m : layer.params) { - auto data = ark::utils::rand_halfs(param->shape_bytes(), 1); - exe->tensor_memcpy(param, data.get(), param->shape_bytes()); - } - } - } - - void train(int iter, int print_interval = 1) { - exe->launch(); - if (print_interval == 0) { - // don't print the loss for debug. - exe->run(iter); - } else { - // we only print the loss every print_interval iterations for debug. - for (int i = 0; i < iter; ++i) { - exe->run(1); - exe->wait(); - if (i % print_interval == 0) { - float loss = get_loss(); - cout << "iter: " << i << ", loss: " << loss << endl; - } - } - } - float elapsed_msec = exe->stop(); - cout << "Elapsed: " << elapsed_msec / iter << " ms/iter\n"; - } - - float get_loss() { - size_t tensor_size = this->loss_tensor->shape_bytes(); - half_t *loss = (half_t *)malloc(tensor_size); - exe->tensor_memcpy(loss, this->loss_tensor, tensor_size); - float loss_sum = 0; - for (int i = 0; i < this->loss_tensor->size(); ++i) { - loss_sum += (float)loss[i]; - } - delete[] loss; - return loss_sum; - } - - void apply_grad() { - for (auto &layer : ffn_model.layers) { - layer.apply_grads(); - } - } - - void print_tensors(Executor *exe) { - printf("loss_tensor: "); - print_tensor(this->loss_tensor, exe); - printf("input: "); - print_tensor(this->input, exe); - printf("output: "); - print_tensor(this->output, exe); - printf("ground_truth: "); - print_tensor(this->ground_truth, exe); - printf("ffn_model: "); - this->ffn_model.print_tensors(exe); - printf("loss_fn: "); - this->loss_fn.print_tensors(exe); - } - - Model &model; - Tensor *loss_tensor, *input, *ground_truth, *output; - Tensor *grad_output; - Tensor *grad_loss; - FFN_Model ffn_model; - LossFn loss_fn; - Executor *exe; - int batch_size; - int num_gpus; - int gpu_id; -}; - -struct Args { - int batch_size; - int dims; - int num_gpus; - int iterations; - int print_interval; - int seed; - bool verbose; -}; - -Args parse_args(int argc, const char **argv) { - string prog = argv[0]; - vector args(argv + 1, argv + argc); - - auto print_help = [&prog]() { - cerr << "Usage: " << prog << " [options]\n" - << "Options:\n" - << " -h, --help\t\t\tPrint this help message\n" - << " -b, --batch-size \t\tBatch size\n" - << " -d, --dims \t\tDimensions\n" - << " -g, --num-gpus \t\tNumber of GPUs\n" - << " -i, --iter \t\tNumber of iterations\n" - << " -p, --print-interval \tPrint interval\n" - << " -s, --seed \t\tRandom seed\n" - << " -v, --verbose\t\t\tVerbose output\n"; - exit(0); - }; - - Args ret; - - // Default arguments - 
ret.batch_size = 1; - ret.dims = 64; - ret.num_gpus = 1; - ret.iterations = 10; - ret.print_interval = 1; - ret.seed = -1; - ret.verbose = false; - - for (auto it = args.begin(); it != args.end(); ++it) { - if (*it == "-h" || *it == "--help") { - print_help(); - } else if (*it == "-b" || *it == "--batch-size") { - if (++it == args.end()) { - cerr << "Error: missing argument for " << *(it - 1) << endl; - exit(1); - } - ret.batch_size = stoi(*it); - } else if (*it == "-d" || *it == "--dims") { - if (++it == args.end()) { - cerr << "Error: missing argument for " << *(it - 1) << endl; - exit(1); - } - ret.dims = stoi(*it); - } else if (*it == "-g" || *it == "--num-gpus") { - if (++it == args.end()) { - cerr << "Error: missing argument for " << *(it - 1) << endl; - exit(1); - } - ret.num_gpus = stoi(*it); - } else if (*it == "-i" || *it == "--iter") { - if (++it == args.end()) { - cerr << "Error: missing argument for " << *(it - 1) << endl; - exit(1); - } - ret.iterations = stoi(*it); - } else if (*it == "-p" || *it == "--print-interval") { - if (++it == args.end()) { - cerr << "Error: missing argument for " << *(it - 1) << endl; - exit(1); - } - ret.print_interval = stoi(*it); - } else if (*it == "-s" || *it == "--seed") { - if (++it == args.end()) { - cerr << "Error: missing argument for " << *(it - 1) << endl; - exit(1); - } - ret.seed = stoi(*it); - } else if (*it == "-v" || *it == "--verbose") { - ret.verbose = true; - } else { - cerr << "Error: unknown option " << *it << endl; - print_help(); - } - } - - return ret; -} - -int main(int argc, const char **argv) { - Args args = parse_args(argc, argv); - - cout << "--" << endl - << "batch_size=" << args.batch_size << endl - << "dims=" << args.dims << endl - << "num_gpus=" << args.num_gpus << endl - << "iterations=" << args.iterations << endl - << "print_interval=" << args.print_interval << endl - << "seed=" << args.seed << endl - << "verbose=" << args.verbose << endl - << "--" << endl; - - vector pids; - for (int gpu_id = 0; gpu_id < args.num_gpus; ++gpu_id) { - pids.emplace_back(ark::utils::proc_spawn([&] { - ark::srand(args.seed); - - Model model{gpu_id}; - Trainer trainer{model, args.dims, args.batch_size, gpu_id, - args.num_gpus}; - trainer.init_data(); - // train the model. - trainer.train(args.iterations, args.print_interval); - // trainer.print_tensors(trainer.exe); - return 0; - })); - } - int state = 0; - for (auto pid : pids) { - int ret = ark::utils::proc_wait(pid); - if (ret != 0) { - cerr << "E: Process " << pid << " returned " << ret << endl; - state = 1; - } - } - return state; -} diff --git a/examples/ffn/main.py b/examples/ffn/main.py new file mode 100644 index 000000000..263228d95 --- /dev/null +++ b/examples/ffn/main.py @@ -0,0 +1,73 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
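+#
+# Trains a small feed-forward block twice: first eagerly in PyTorch, then
+# through the ARK torch tracer (`forward_ark`/`backward_ark`), printing the
+# per-epoch loss in both cases.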
+ +import torch +import torch.nn as nn +from ark.torch.tracer import tracer as ark_torch_tracer + + +class FeedForward(nn.Module): + def __init__(self, dim: int, hidden_dim: int): + super().__init__() + self.w1 = nn.Linear(dim, hidden_dim, bias=False) + self.w2 = nn.Linear(hidden_dim, dim, bias=True) + self.w3 = nn.Linear(dim, hidden_dim, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.w2(nn.functional.silu(self.w1(x)) * self.w3(x)) + + +@ark_torch_tracer +class ForwardPass(nn.Module): + def __init__(self, dim: int, hidden_dim: int): + super().__init__() + self.ff = FeedForward(dim, hidden_dim) + + def forward( + self, input: torch.Tensor, target: torch.Tensor + ) -> torch.Tensor: + t = self.ff(input) + return nn.functional.mse_loss(t, target) + + +def main(): + batch_size = 128 + num_batches = 1 + dim = 1024 + hidden_dim = 4096 + num_epochs = 10 + torch.manual_seed(42) + torch.set_default_device("cuda:0") + + model = ForwardPass(dim=dim, hidden_dim=hidden_dim) + + inputs = [torch.randn(batch_size, dim) for _ in range(num_batches)] + targets = [torch.randn(batch_size, dim) for _ in range(num_batches)] + + optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) + + for epoch in range(1): + optimizer.zero_grad() + avg_loss = 0 + for input, target in zip(inputs, targets): + loss = model(input, target) + avg_loss += loss.detach().item() + loss.backward() + optimizer.step() + avg_loss /= num_batches + print(f"Epoch {epoch}, Loss: {avg_loss:.6f}") + + for epoch in range(num_epochs): + optimizer.zero_grad() + avg_loss = 0 + for input, target in zip(inputs, targets): + loss = model.forward_ark(input, target) + avg_loss += loss.to_numpy()[0] + model.backward_ark(loss) + optimizer.step() + avg_loss /= num_batches + print(f"Epoch {epoch}, Loss: {avg_loss:.6f}") + + +if __name__ == "__main__": + main() diff --git a/python/ark/ops.py b/python/ark/ops.py index 50f800b10..222f09f3f 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -76,12 +76,12 @@ def copy( output: Tensor = NullTensor, name: str = "copy", ) -> Tensor: - """Data caopy.""" + """Data copy.""" if output is not NullTensor: output = output._tensor if isinstance(input, Tensor): - intput = intput._tensor - return Tensor(Model.get_model().copy(intput, output, name)) + input = input._tensor + return Tensor(Model.get_model().copy(input, output, name)) def div( diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 6f20516a8..f6b770710 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -109,7 +109,7 @@ def launch( or device_id != self.executor.device_id() ): self.executor.compile(plan_str, device_id) - self.executor.launch(stream, loop_mode, tensor_mappings) + self.executor.launch(tensor_mappings, stream, loop_mode) self.state = Runtime.State.LaunchedNotRunning Runtime._loop_mode = loop_mode @@ -127,14 +127,11 @@ def run(self, iter=1, non_blocking=False, tensor_mappings={}): logging.error(f"ARK runtime is not launched") raise RuntimeError(f"ARK runtime is not launched") self.state = Runtime.State.Running + ph_map = {} for ark_tensor in list(tensor_mappings.keys()): - torch_tensor = tensor_mappings[ark_tensor] - if not isinstance(torch_tensor, torch.Tensor): - raise ValueError("Must bind PyTorch tensor") - internal_ark_tensor = ark_tensor._tensor - tensor_mappings[internal_ark_tensor] = torch_tensor.data_ptr() - del tensor_mappings[ark_tensor] - self.executor.run(iter, tensor_mappings) + t = tensor_mappings[ark_tensor] + ph_map[ark_tensor._tensor] = t.data_ptr() + 
self.executor.run(iter, ph_map) if not non_blocking: self.wait() diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 5fa361bef..a09b0af65 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -89,6 +89,23 @@ def dtype(self) -> DataType: """ return DataType.from_ctype(self._tensor.data_type()) + def data_ptr(self) -> int: + """ + Returns the underlying data pointer. + """ + rt = Runtime.get_runtime() + if not rt.launched(): + raise RuntimeError( + "`Tensor.data_ptr()` is usable only after you call `Runtime.launch()`." + ) + return rt.executor.tensor_address(self._tensor) + + def is_external(self) -> bool: + """ + Returns true if the tensor's data is not managed by ARK. + """ + return self._tensor.is_external() + def to_numpy( self, ndarray: np.ndarray = None, stream: int = 0 ) -> np.ndarray: diff --git a/python/ark/torch/tracer.py b/python/ark/torch/tracer.py new file mode 100644 index 000000000..9570fd977 --- /dev/null +++ b/python/ark/torch/tracer.py @@ -0,0 +1,355 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +try: + import torch +except ImportError: + raise ImportError("torch is required to use this module") + +import logging +from typing import Type, List, Dict, Optional, Callable, Any + +from ..planner import Planner, Plan +from ..tensor import Tensor +from ..runtime import Runtime +from ..model import Model +from .. import ops + + +__all__ = ["tracer"] + + +def handle_aten_add_scalar( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Element-wise addition with a scalar""" + t = tensors[node.args[0].name] + value = node.args[1] + return ops.add(t, value, name=node.name) + + +def handle_aten_sub_tensor( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Element-wise subtraction""" + t1 = tensors[node.args[0].name] + t2 = tensors[node.args[1].name] + return ops.sub(t1, t2, name=node.name) + + +def handle_aten_mul_tensor( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Element-wise multiplication""" + t1 = tensors[node.args[0].name] + t2 = tensors[node.args[1].name] + return ops.mul(t1, t2, name=node.name) + + +def handle_aten_t( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Transpose""" + t = tensors[node.args[0].name] + perm = list(range(len(t.shape()))) + if len(perm) < 2: + raise ValueError(f"Expected at least 2 dimensions, got {len(perm)}") + perm[-2], perm[-1] = perm[-1], perm[-2] + return ops.transpose(t, perm=perm, name=node.name) + + +def handle_aten_mm( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Matrix multiplication""" + input = tensors[node.args[0].name] + weight = tensors[node.args[1].name] + return ops.matmul(input, weight, name=node.name) + + +def handle_aten_addmm( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Matrix multiplication followed by addition""" + bias = tensors[node.args[0].name] + input = tensors[node.args[1].name] + weight = tensors[node.args[2].name] + t = ops.matmul(input, weight) + t = ops.add(t, bias, name=node.name) + return t + + +def handle_aten_silu( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Sigmoid Linear Unit""" + t = tensors[node.args[0].name] + return ops.mul(t, ops.sigmoid(t), name=node.name) + + +def handle_aten_sum_dim_intlist( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Sum with specified dimensions""" + if len(node.args[1]) != 1: + raise NotImplementedError("Multiple 
dimensions are not supported") + t = tensors[node.args[0].name] + axis = node.args[1][0] + keepdims = node.args[2] + return ops.reduce_sum(t, axis=axis, keepdims=keepdims, name=node.name) + + +def handle_aten_view( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Reshape""" + t = tensors[node.args[0].name] + shape = node.args[1] + return ops.reshape(t, shape, name=node.name) + + +def handle_aten_sigmoid( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Sigmoid""" + t = tensors[node.args[0].name] + return ops.sigmoid(t, name=node.name) + + +def handle_aten_empty_like( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Create an empty tensor with the same shape""" + t = tensors[node.args[0].name] + new_t = ops.tensor(t.shape(), dtype=t.dtype()) + new_t = ops.identity(new_t, deps=[t], name=node.name) + return new_t + + +def handle_aten_fill_scalar( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Fill a tensor with a scalar value""" + t = tensors[node.args[0].name] + value = node.args[1] + return ops.copy(value, t, name=node.name) + + +def handle_aten_mse_loss( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Mean Squared Error loss""" + input = tensors[node.args[0].name] + target = tensors[node.args[1].name] + t = ops.sub(input, target) + t = ops.mul(t, t) + t = ops.reshape(t, [-1]) + t = ops.reduce_mean(t, axis=0) + return t + + +def handle_aten_mse_loss_backward( + node: torch.fx.node.Node, tensors: Dict[str, Tensor] +) -> Tensor: + """Backward pass for Mean Squared Error loss""" + grad_output = tensors[node.args[0].name] + input = tensors[node.args[1].name] + target = tensors[node.args[2].name] + grad_input = ops.sub(input, target) + grad_input = ops.mul(grad_input, grad_output) + grad_input = ops.mul(grad_input, 2.0 / grad_input.shape()[0]) + return grad_input + + +_REGISTRY_FUNCTION_HANDLER: Dict[str, Callable] = { + "aten::add.Scalar": handle_aten_add_scalar, + "aten::sub.Tensor": handle_aten_sub_tensor, + "aten::mul.Tensor": handle_aten_mul_tensor, + "aten::t": handle_aten_t, + "aten::mm": handle_aten_mm, + "aten::addmm": handle_aten_addmm, + "aten::silu": handle_aten_silu, + "aten::sum.dim_IntList": handle_aten_sum_dim_intlist, + "aten::view": handle_aten_view, + "aten::sigmoid": handle_aten_sigmoid, + "aten::empty_like": handle_aten_empty_like, + "aten::fill.Scalar": handle_aten_fill_scalar, + "aten::mse_loss": handle_aten_mse_loss, + "aten::mse_loss_backward": handle_aten_mse_loss_backward, +} + + +class Tracer: + def __init__(self): + self.tensors: Dict[str, Tensor] = {} + self.params: List[torch.nn.Parameter] = [] + self.params_idx: int = 0 + self.inputs_fw: List[Tensor] = [] + self.inputs_bw: List[Tensor] = [] + self.outputs_fw: List[Tensor] = [] + self.outputs_bw: List[Tensor] = [] + self.plan_fw: Optional[Plan] = None + self.plan_bw: Optional[Plan] = None + self.device: Optional[torch.device] = None + self.failed: bool = False + self.launched_fw: bool = False + self.launched_bw: bool = False + + def __call__(self, cls: Type[torch.nn.Module]) -> Type[torch.nn.Module]: + cls.forward_torch = cls.forward + + def forward_wrapper(instance: torch.nn.Module, *args, **kwargs) -> Any: + if self.plan_fw is None: + return instance.forward_torch(*args, **kwargs) + rt = Runtime.get_runtime() + if not self.launched_fw: + rt.launch( + plan=self.plan_fw, + device_id=self.device.index, + loop_mode=False, + ) + self.launched_fw = True + self.launched_bw = False + + ph_map = 
{ph: data for ph, data in zip(self.inputs_fw, args)}
+            rt.run(tensor_mappings=ph_map)
+            # TODO: how to get the output tensor(s)?
+            return self.outputs_fw[0]
+
+        def backward_wrapper(instance: torch.nn.Module, *args, **kwargs):
+            if self.plan_bw is None:
+                return instance.forward_torch(*args, **kwargs)
+            rt = Runtime.get_runtime()
+            if not self.launched_bw:
+                rt.launch(
+                    plan=self.plan_bw,
+                    device_id=self.device.index,
+                    loop_mode=False,
+                )
+                self.launched_bw = True
+                self.launched_fw = False
+
+            ph_map = {ph: data for ph, data in zip(self.inputs_bw, args)}
+            rt.run(tensor_mappings=ph_map)
+            for i, param in enumerate(self.params):
+                param.grad = self.outputs_bw[i].to_torch()
+
+        def call_wrapper(instance: torch.nn.Module, *args, **kwargs) -> Any:
+            @torch._dynamo.optimize(self.autograd_trace_)
+            def call(*args, **kwargs):
+                return instance.forward_torch(*args, **kwargs)
+
+            return call(*args, **kwargs)
+
+        cls.forward_ark = forward_wrapper
+        cls.backward_ark = backward_wrapper
+        cls.__call__ = call_wrapper
+        return cls
+
+    def autograd_trace_(
+        self, gm: torch.nn.Module, _: List[torch.Tensor]
+    ) -> Callable:
+        for _, param in gm.named_parameters(remove_duplicate=False):
+            self.params.append(param)
+        for _, param in gm.named_buffers(remove_duplicate=False):
+            self.params.append(param)
+
+        def fw_compiler(gm: torch.fx.GraphModule, _):
+            logging.info("==== FW Starts ====")
+            return self.autograd_trace_impl_(gm, _, True)
+
+        def bw_compiler(gm: torch.fx.GraphModule, _):
+            logging.info("==== BW Starts ====")
+            return self.autograd_trace_impl_(gm, _, False)
+
+        return torch._dynamo.backends.common.aot_autograd(
+            fw_compiler=fw_compiler, bw_compiler=bw_compiler
+        )(gm, _)
+
+    def autograd_trace_impl_(
+        self, gm: torch.fx.GraphModule, _: List[torch.Tensor], is_fw: bool
+    ) -> Callable:
+        Model.reset()
+        if not self.failed:
+            for node in gm.graph.nodes:
+                logging.info("%s %s", node.format_node(), node.meta)
+                if not self.handle_node_(node, is_fw):
+                    self.failed = True
+                    break
+        if not self.failed:
+            Model.set_device_id(self.device.index)
+            if is_fw:
+                self.plan_fw = Planner(self.device.index).plan()
+            else:
+                self.plan_bw = Planner(self.device.index).plan()
+
+        def boxed_fw(args) -> Any:
+            return gm.forward(*args)
+
+        boxed_fw._boxed_call = True
+        return boxed_fw
+
+    def handle_node_(self, node: torch.fx.node.Node, is_fw: bool) -> bool:
+        if node.op == "placeholder":
+            t = self.tensors.get(node.name, None)
+            if t is not None:
+                return True
+            meta = node.meta["tensor_meta"]
+            if len(self.params) > self.params_idx:
+                # placeholder for parameter
+                param = self.params[self.params_idx]
+                self.params_idx += 1
+                if param.dtype != meta.dtype:
+                    raise ValueError(
+                        f"Expected dtype {meta.dtype}, got {param.dtype}"
+                    )
+                if self.device is None:
+                    self.device = param.device
+                elif self.device != param.device:
+                    raise ValueError(
+                        "All parameters must be on the same device. "
+                        f"Expected {self.device}, got {param.device}"
+                    )
+                data = param.data_ptr()
+            else:
+                # no more parameters -- the remaining placeholders are inputs
+                data = 0
+            t = ops.placeholder(
+                shape=meta.shape,
+                dtype=ops.DataType.from_torch(meta.dtype),
+                name=node.name,
+                data=data,
+            )
+            self.tensors[node.name] = t
+            if data == 0:
+                if is_fw:
+                    self.inputs_fw.append(t)
+                else:
+                    self.inputs_bw.append(t)
+        elif node.op == "output":
+            outputs_list = self.outputs_fw if is_fw else self.outputs_bw
+            if outputs_list:
+                raise ValueError("Multiple output nodes are unexpected")
+            for out in node.args[0]:
+                if isinstance(out, torch.fx.node.Node):
+                    if out.name not in self.tensors:
+                        raise ValueError(f"Output tensor {out.name} not found")
+                    outputs_list.append(self.tensors[out.name])
+                else:
+                    outputs_list.append(out)
+        elif node.op == "call_function":
+            target_name = node.target.name()
+            if target_name not in _REGISTRY_FUNCTION_HANDLER:
+                logging.warning(
+                    f"Unsupported function {target_name}. Usage: {node.format_node()}"
+                )
+                return False
+            t = _REGISTRY_FUNCTION_HANDLER[target_name](node, self.tensors)
+            self.tensors[node.name] = t
+        else:
+            raise ValueError(f"Unexpected node {node.format_node()}")
+        return True
+
+
+def tracer(cls: Type[torch.nn.Module]):
+    return Tracer()(cls)
diff --git a/python/executor_py.cpp b/python/executor_py.cpp
index 3ee851c27..150055ae3 100644
--- a/python/executor_py.cpp
+++ b/python/executor_py.cpp
@@ -176,20 +176,23 @@ void register_executor(py::module &m) {
              py::arg("plan"), py::arg("name") = "executor")
         .def(
             "launch",
-            [](ark::Executor *self, uintptr_t stream, bool loop_mode,
+            [](ark::Executor *self,
                const std::unordered_map<ark::Tensor, uintptr_t>
-                   &placeholder_data) {
+                   &placeholder_data,
+               uintptr_t stream, bool loop_mode, bool record) {
                 std::unordered_map<ark::Tensor, void *> tensor_ptr_map;
                 for (const auto &[tensor, addr] : placeholder_data) {
                     tensor_ptr_map[tensor] = reinterpret_cast<void *>(addr);
                 }
-                self->launch(reinterpret_cast(stream), loop_mode,
-                             tensor_ptr_map);
+                self->launch(tensor_ptr_map,
+                             reinterpret_cast(stream), loop_mode,
+                             record);
             },
-            py::arg("stream") = 0, py::arg("loop_mode") = true,
             py::arg("placeholder_data") =
-                std::unordered_map<ark::Tensor, uintptr_t>()
+                std::unordered_map<ark::Tensor, uintptr_t>(),
+            py::arg("stream") = 0, py::arg("loop_mode") = true,
+            py::arg("record") = false)
         .def(
             "run",
diff --git a/python/model_py.cpp b/python/model_py.cpp
index 704222c63..7c661a292 100644
--- a/python/model_py.cpp
+++ b/python/model_py.cpp
@@ -13,61 +13,6 @@
 
 namespace py = pybind11;
 
-struct DLTensorMetadata {
-    void *data_ptr;
-    int32_t device_id;
-    DLDeviceType device_type;
-    int32_t ndim;
-    DLDataType dtype;
-    std::vector<int64_t> shape;
-    std::vector<int64_t> strides;
-    uint64_t byte_offset;
-};
-
-static DLTensorMetadata extractDLTensorMetadata(DLManagedTensor *dl_tensor) {
-    DLTensorMetadata metadata;
-    metadata.data_ptr = dl_tensor->dl_tensor.data;
-    metadata.device_id = dl_tensor->dl_tensor.device.device_id;
-    metadata.device_type = dl_tensor->dl_tensor.device.device_type;
-    metadata.ndim = dl_tensor->dl_tensor.ndim;
-    metadata.dtype = dl_tensor->dl_tensor.dtype;
-    metadata.shape.assign(
-        dl_tensor->dl_tensor.shape,
-        dl_tensor->dl_tensor.shape + dl_tensor->dl_tensor.ndim);
-    if (dl_tensor->dl_tensor.strides != nullptr) {
-        metadata.strides.assign(
-            dl_tensor->dl_tensor.strides,
-            dl_tensor->dl_tensor.strides + dl_tensor->dl_tensor.ndim);
-    }
-    metadata.byte_offset = dl_tensor->dl_tensor.byte_offset;
-    return metadata;
-}
-
-static ark::DataType from_dl_dtype(const DLDataType &dl_dtype) {
-    if (dl_dtype.lanes != 1) {
-        ERR(ark::UnsupportedError, "unsupported data type");
-    }
-    ark::DataType ark_dtype;
-    if (dl_dtype.code == kDLFloat && dl_dtype.bits == 32) {
-        ark_dtype = ark::FP32;
-    } else if (dl_dtype.code == kDLFloat && dl_dtype.bits == 16) {
-        ark_dtype = ark::FP16;
-    } else if (dl_dtype.code == kDLBfloat && dl_dtype.bits == 16) {
-        ark_dtype = ark::BF16;
-    } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 32) {
-        ark_dtype = ark::INT32;
-    } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 32) {
-        ark_dtype = ark::UINT32;
-    } else if (dl_dtype.code == kDLInt && dl_dtype.bits == 8) {
-        ark_dtype = ark::INT8;
-    } else if (dl_dtype.code == kDLUInt && dl_dtype.bits == 8) {
-        ark_dtype = ark::UINT8;
-    } else {
-        ERR(ark::UnsupportedError, "unsupported data type");
-    }
-    return ark_dtype;
-}
-
 void register_model(py::module &m) {
     py::class_<ark::Model>(m, "_Model")
         .def(py::init<int, int>(), py::arg("rank"), py::arg("world_size"))
diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp
index 5c28563de..d2506ed38 100644
--- a/python/tensor_py.cpp
+++ b/python/tensor_py.cpp
@@ -17,7 +17,18 @@ void register_tensor(py::module& m) {
         .def("offsets", &ark::Tensor::offsets)
         .def("padded_shape", &ark::Tensor::padded_shape)
         .def("data_type", &ark::Tensor::data_type)
-        .def("torch_strides", &ark::Tensor::torch_strides);
+        .def("torch_strides", &ark::Tensor::torch_strides)
+        .def("data",
+             [](const ark::Tensor& self) {
+                 return reinterpret_cast<uintptr_t>(self.data());
+             })
+        .def(
+            "data",
+            [](ark::Tensor& self, uintptr_t data) {
+                return self.data(reinterpret_cast<void *>(data));
+            },
+            py::arg("data"))
+        .def("is_external", &ark::Tensor::is_external);
 
     m.attr("_NullTensor") = &ark::NullTensor;
 }

From 7a3191453082a13a24b662d973008517b3c8a980 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 26 Aug 2024 16:58:33 +0000
Subject: [PATCH 075/106] small fix

---
 ark/api/executor.cpp | 3 ++-
 ark/gpu/gpu.hpp      | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index b73dbb9cd..d7e0b1a43 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -307,6 +307,7 @@ void CommResource::connect(const PlanJson &plan_json,
     }
 
     for (auto &[remote_rank, resource] : rank_to_resource_) {
+        auto remote_regmem_id = rank_to_remote_regmem_id[remote_rank];
         auto add_proxy_channel = [&](std::shared_ptr conn_resource) {
             if (!conn_resource) return;
@@ -315,7 +316,7 @@ void CommResource::connect(const PlanJson &plan_json,
                 proxy_service_->proxyChannel(
                     proxy_service_->buildAndAddSemaphore(
                         *comm_, conn_resource->connection)),
-                rank_to_remote_regmem_id[remote_rank], regmem_id));
+                remote_regmem_id, regmem_id));
         };
         // NOTE: We can create multiple proxy channels here if we need in the
         // future
diff --git a/ark/gpu/gpu.hpp b/ark/gpu/gpu.hpp
index 8ff3b2843..dbcd50f3e 100644
--- a/ark/gpu/gpu.hpp
+++ b/ark/gpu/gpu.hpp
@@ -54,7 +54,7 @@ ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunction, CUfunction, hipFunction_t);
 ARK_GPU_DEFINE_TYPE_ALIAS(gpuFunctionAttribute, CUfunction_attribute,
                           hipFunction_attribute);
 ARK_GPU_DEFINE_TYPE_ALIAS(gpuPointerAttributes, cudaPointerAttributes,
-                          hipPointerAttributes);
+                          hipPointerAttribute_t);
 
 // runtime API
 ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuSuccess, cudaSuccess, hipSuccess);

From 135929e8f50012ec2dcbcddd36bf5d7074a482ce Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Fri, 30 Aug 2024 22:19:25 +0000
Subject: [PATCH 076/106] wip

---
 python/CMakeLists.txt                |  7 ++-
 python/ark/model.py                  | 13 ++++
 python/ark/torch/tracer.py           | 91 ++++++++++++++++----------
 python/unittest/test_torch_tracer.py | 25 ++++++++
 4 files changed, 100 insertions(+), 36 deletions(-)
 create mode 100644 python/unittest/test_torch_tracer.py

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 597388e2d..5e03f66fd 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -19,12 +19,13 @@ FetchContent_MakeAvailable(pybind11)
 file(GLOB_RECURSE BIND_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp)
 pybind11_add_module(ark_py ${BIND_SOURCES})
 set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ark)
-add_custom_command(TARGET ark_py POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/ark ${CMAKE_CURRENT_BINARY_DIR}/ark
-)
 target_link_libraries(ark_py PRIVATE ark_static)
 target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS})
 target_include_directories(ark_py PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../ark)
+add_custom_target(py_copy
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/ark ${CMAKE_CURRENT_BINARY_DIR}/ark
+)
+add_dependencies(ark_py py_copy)
 
 if(ARK_USE_CUDA)
     target_include_directories(ark_py SYSTEM PRIVATE
diff --git a/python/ark/model.py b/python/ark/model.py
index 2a977b8f3..54dace706 100644
--- a/python/ark/model.py
+++ b/python/ark/model.py
@@ -70,6 +70,19 @@ def reset():
         _ModelState.rank = 0
         _ModelState.world_size = 1
 
+    def __init__(self, rank: int = 0, world_size: int = 1):
+        """
+        Initialize the model.
+
+        Args:
+            rank: The rank of the model.
+            world_size: The world size of the model.
+        """
+        super().__init__(rank, world_size)
+
+    def __str__(self) -> str:
+        return self.serialize()
+
     def compress(self) -> "Model":
         """
         Compress the model.
diff --git a/python/ark/torch/tracer.py b/python/ark/torch/tracer.py
index 9570fd977..eb73d4e48 100644
--- a/python/ark/torch/tracer.py
+++ b/python/ark/torch/tracer.py
@@ -7,7 +7,7 @@
     raise ImportError("torch is required to use this module")
 
 import logging
-from typing import Type, List, Dict, Optional, Callable, Any
+from typing import List, Dict, Optional, Callable, Any
 
 from ..planner import Planner, Plan
 from ..tensor import Tensor
@@ -28,6 +28,15 @@ def handle_aten_add_scalar(
     return ops.add(t, value, name=node.name)
 
 
+def handle_aten_add_tensor(
+    node: torch.fx.node.Node, tensors: Dict[str, Tensor]
+) -> Tensor:
+    """Element-wise addition"""
+    t1 = tensors[node.args[0].name]
+    t2 = tensors[node.args[1].name]
+    return ops.add(t1, t2, name=node.name)
+
+
 def handle_aten_sub_tensor(
     node: torch.fx.node.Node, tensors: Dict[str, Tensor]
 ) -> Tensor:
     """Element-wise subtraction"""
@@ -163,6 +172,7 @@ def handle_aten_mse_loss_backward(
 
 _REGISTRY_FUNCTION_HANDLER: Dict[str, Callable] = {
     "aten::add.Scalar": handle_aten_add_scalar,
+    "aten::add.Tensor": handle_aten_add_tensor,
     "aten::sub.Tensor": handle_aten_sub_tensor,
     "aten::mul.Tensor": handle_aten_mul_tensor,
     "aten::t": handle_aten_t,
@@ -182,7 +192,7 @@ def handle_aten_mse_loss_backward(
 class Tracer:
     def __init__(self):
         self.tensors: Dict[str, Tensor] = {}
-        self.params: List[torch.nn.Parameter] = []
+        self.params: Optional[List[torch.nn.Parameter]] = None
         self.params_idx: int = 0
         self.inputs_fw: List[Tensor] = []
         self.inputs_bw: List[Tensor] = []
@@ -195,8 +205,15 @@ def __init__(self):
         self.launched_fw: bool = False
         self.launched_bw: bool = False
 
-    def __call__(self, cls: Type[torch.nn.Module]) -> Type[torch.nn.Module]:
-        cls.forward_torch = cls.forward
+    def __call__(self, target: Callable) -> Callable:
+        is_module = isinstance(target, type) and issubclass(target, torch.nn.Module)
+        is_function = callable(target) and not isinstance(target, type)
+        if not is_module and not is_function:
+            raise ValueError("Tracer can only be applied to a subclass of `torch.nn.Module` or a function")
+        if is_function:
+            return torch._dynamo.optimize(self.autograd_trace_)(target)
+
+        target.forward_torch = target.forward
 
         def forward_wrapper(instance: torch.nn.Module, *args, **kwargs) -> Any:
             if self.plan_fw is None:
@@ -235,25 +252,28 @@ def backward_wrapper(instance: torch.nn.Module, *args, **kwargs):
             param.grad = self.outputs_bw[i].to_torch()
 
         def call_wrapper(instance: torch.nn.Module, *args, **kwargs) -> Any:
+            if self.params is None:
+                params = []
+                for _, param in instance.named_parameters(remove_duplicate=False):
+                    params.append(param)
+                for _, param in instance.named_buffers(remove_duplicate=False):
+                    params.append(param)
+                self.params = params
+
             @torch._dynamo.optimize(self.autograd_trace_)
             def call(*args, **kwargs):
                 return instance.forward_torch(*args, **kwargs)
 
             return call(*args, **kwargs)
 
-        cls.forward_ark = forward_wrapper
-        cls.backward_ark = backward_wrapper
-        cls.__call__ = call_wrapper
-        return cls
+        target.forward_ark = forward_wrapper
+        target.backward_ark = backward_wrapper
+        target.__call__ = call_wrapper
+        return target
 
     def autograd_trace_(
         self, gm: torch.nn.Module, _: List[torch.Tensor]
     ) -> Callable:
-        for _, param in gm.named_parameters(remove_duplicate=False):
-            self.params.append(param)
-        for _, param in gm.named_buffers(remove_duplicate=False):
-            self.params.append(param)
-
         def fw_compiler(gm: torch.fx.GraphModule, _):
             logging.info("==== FW Starts ====")
             return self.autograd_trace_impl_(gm, _, True)
@@ -269,25 +289,26 @@ def bw_compiler(gm: torch.fx.GraphModule, _):
     def autograd_trace_impl_(
         self, gm: torch.fx.GraphModule, _: List[torch.Tensor], is_fw: bool
     ) -> Callable:
-        Model.reset()
-        if not self.failed:
-            for node in gm.graph.nodes:
-                logging.info("%s %s", node.format_node(), node.meta)
-                if not self.handle_node_(node, is_fw):
-                    self.failed = True
-                    break
-        if not self.failed:
-            Model.set_device_id(self.device.index)
-            if is_fw:
-                self.plan_fw = Planner(self.device.index).plan()
-            else:
-                self.plan_bw = Planner(self.device.index).plan()
-
-        def boxed_fw(args) -> Any:
-            return gm.forward(*args)
-
-        boxed_fw._boxed_call = True
-        return boxed_fw
+
+        def run(args) -> Any:
+            Model.reset()
+            if not self.failed:
+                for node in gm.graph.nodes:
+                    logging.info("%s %s", node.format_node(), node.meta)
+                    if not self.handle_node_(node, is_fw):
+                        logging.warning(f"Failed to handle node {node.format_node()}")
+                        self.failed = True
+                        break
+            if not self.failed:
+                Model.set_device_id(self.device.index)
+                if is_fw:
+                    self.plan_fw = Planner(self.device.index).plan()
+                else:
+                    self.plan_bw = Planner(self.device.index).plan()
+            return torch.fx.Interpreter(gm).boxed_run(args)
+
+        run._boxed_call = True
+        return run
 
     def handle_node_(self, node: torch.fx.node.Node, is_fw: bool) -> bool:
         if node.op == "placeholder":
             t = self.tensors.get(node.name, None)
@@ -304,6 +325,10 @@ def handle_node_(self, node: torch.fx.node.Node, is_fw: bool) -> bool:
                         f"Expected dtype {meta.dtype}, got {param.dtype}"
                     )
                 if self.device is None:
+                    if param.device.type != "cuda":
+                        raise ValueError(
+                            f"Expected device cuda, got {param.device.type}"
+                        )
                     self.device = param.device
                 elif self.device != param.device:
                     raise ValueError(
@@ -351,5 +376,5 @@ def handle_node_(self, node: torch.fx.node.Node, is_fw: bool) -> bool:
         return True
 
 
-def tracer(cls: Type[torch.nn.Module]):
-    return Tracer()(cls)
+def tracer(target: Callable):
+    return Tracer()(target)
diff --git a/python/unittest/test_torch_tracer.py 
b/python/unittest/test_torch_tracer.py new file mode 100644 index 000000000..e8cb009e6 --- /dev/null +++ b/python/unittest/test_torch_tracer.py @@ -0,0 +1,25 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from unittest_common import ark, pytest_ark + + +@pytest_ark(need_torch=True) +def test_torch_tracer_module(): + import torch + from ark.torch.tracer import tracer + + @tracer + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.param = torch.nn.Parameter(torch.randn(1024, 1024)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.param + + model = TestModule().to("cuda:0") + x = torch.randn(1024, 1024).to("cuda:0") + y = model(x) + y2 = model.forward_ark(x) + assert torch.allclose(y, y2) From 8b4603933c325381496df0edbfaaf642586c145e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 5 Sep 2024 07:04:08 +0000 Subject: [PATCH 077/106] Python logging & changed Python class names --- .dockerignore | 1 + .github/workflows/codeql.yml | 4 +- .github/workflows/ut-cuda.yml | 8 +- .vscode/settings.json | 2 - CMakeLists.txt | 1 + ark/api/log.cpp | 15 ++ ark/include/ark/log.hpp | 18 +++ ark/logging.hpp | 36 ++--- ark/ops/ops_test_common.hpp | 2 +- ark/unittest/unittest_utils.cpp | 6 +- ark/unittest/unittest_utils.h | 240 +++++++++++++++--------------- python/CMakeLists.txt | 20 ++- python/ark/__init__.py | 23 +-- python/ark/data_type.py | 48 ++++-- python/ark/error.py | 16 +- python/ark/init.py | 14 +- python/ark/log.py | 27 ++++ python/ark/model.py | 33 ++-- python/ark/module.py | 9 +- python/ark/ops.py | 117 ++++++++------- python/ark/planner.py | 12 +- python/ark/runtime.py | 29 ++-- python/ark/tensor.py | 12 +- python/ark_py.cpp | 4 +- python/data_type_py.cpp | 2 +- python/dims_py.cpp | 2 +- python/error_py.cpp | 7 +- python/executor_py.cpp | 2 +- python/init_py.cpp | 1 - python/log_py.cpp | 20 +++ python/model_graph_py.cpp | 2 +- python/model_py.cpp | 2 +- python/planner_py.cpp | 8 +- python/tensor_py.cpp | 4 +- python/unittest/common.py | 28 ++++ python/unittest/test.py | 7 +- python/unittest/test_data_type.py | 75 ++++++++++ python/unittest/test_error.py | 9 +- python/unittest/test_model.py | 5 +- python/unittest/test_runtime.py | 16 +- 40 files changed, 549 insertions(+), 338 deletions(-) create mode 100644 ark/api/log.cpp create mode 100644 ark/include/ark/log.hpp create mode 100644 python/ark/log.py create mode 100644 python/log_py.cpp create mode 100644 python/unittest/common.py create mode 100644 python/unittest/test_data_type.py diff --git a/.dockerignore b/.dockerignore index e47f48873..60583dbf9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -6,6 +6,7 @@ build/ *.pyc *.pyo *.pyd +.pytest_cache/ # Git **/.git diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 509ac6d48..7ac2f1649 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -49,7 +49,7 @@ jobs: - name: Build run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. + cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_CUDA=ON -DARK_BUILD_TESTS=OFF .. make build ark_py - name: Perform CodeQL Analysis @@ -95,7 +95,7 @@ jobs: - name: Build run: | mkdir build && cd build - CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF .. 
+ CXX=/opt/rocm/bin/hipcc cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BYPASS_GPU_CHECK=ON -DARK_USE_ROCM=ON -DARK_BUILD_TESTS=OFF .. make -j build ark_py - name: Perform CodeQL Analysis diff --git a/.github/workflows/ut-cuda.yml b/.github/workflows/ut-cuda.yml index 363f1b771..10b0679da 100644 --- a/.github/workflows/ut-cuda.yml +++ b/.github/workflows/ut-cuda.yml @@ -44,7 +44,7 @@ jobs: - name: Build run: | mkdir build && cd build - cmake -DCMAKE_BUILD_TYPE=Debug -DARK_BUILD_PYTHON=ON .. + cmake -DCMAKE_BUILD_TYPE=Debug .. make -j ut ark_py - name: Run C++ UT @@ -71,7 +71,11 @@ jobs: - name: Run Python UT run: | cd build - ARK_ROOT=$PWD pytest --cov=../python/ark --cov-report lcov:py_coverage.info --verbose ../python/unittest/test.py + PYTHONPATH=$PWD/python ARK_ROOT=$PWD python3 -m pytest \ + --cov=python/ark \ + --cov-report lcov:py_coverage.info \ + --verbose \ + ../python/unittest/test.py - name: Report Coverage env: diff --git a/.vscode/settings.json b/.vscode/settings.json index 640196a66..00260f078 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -3,8 +3,6 @@ "cmake.environment": { "ARK_ROOT": "${workspaceFolder}/build", "ARK_IGNORE_BINARY_CACHE": "1", - "ARK_DISABLE_GRAPH_OPT": "0", - "ARK_IPC_LISTEN_PORT_BASE": "42000", // "ARK_LOG_LEVEL": "DEBUG" }, "cmake.ctestArgs": [ diff --git a/CMakeLists.txt b/CMakeLists.txt index 2e80ea1e8..c3b09b0e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ option(ARK_USE_CUDA "Use NVIDIA/CUDA." OFF) option(ARK_USE_ROCM "Use AMD/ROCm." OFF) option(ARK_BYPASS_GPU_CHECK "Bypass GPU check." OFF) option(ARK_BUILD_TESTS "Build unit tests." ON) +option(ARK_BUILD_PYTHON "Build Python module." ON) if(ARK_BYPASS_GPU_CHECK) if(ARK_USE_CUDA) diff --git a/ark/api/log.cpp b/ark/api/log.cpp new file mode 100644 index 000000000..6749c4987 --- /dev/null +++ b/ark/api/log.cpp @@ -0,0 +1,15 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#include "ark/log.hpp" + +#include "logging.hpp" + +namespace ark { + +void log(LogLevel level, const std::string &file, int line, + const std::string &msg) { + _log(level, file, line, msg); +} + +} // namespace ark diff --git a/ark/include/ark/log.hpp b/ark/include/ark/log.hpp new file mode 100644 index 000000000..77f5b48bf --- /dev/null +++ b/ark/include/ark/log.hpp @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +#ifndef ARK_LOG_HPP +#define ARK_LOG_HPP + +#include + +namespace ark { + +typedef enum { DEBUG, INFO, WARN, ERROR } LogLevel; + +void log(LogLevel level, const std::string &file, int line, + const std::string &msg); + +} // namespace ark + +#endif // ARK_LOG_HPP diff --git a/ark/logging.hpp b/ark/logging.hpp index db44dc581..c5d2572ba 100644 --- a/ark/logging.hpp +++ b/ark/logging.hpp @@ -9,11 +9,10 @@ #include #include "ark/error.hpp" +#include "ark/log.hpp" namespace ark { -typedef enum { DEBUG, INFO, WARN, ERROR } LogLevel; - class Logging { public: Logging(const std::string &lv); @@ -46,43 +45,44 @@ void _log_helper(std::stringstream &ss, T value, Args... args) { _log_helper(ss, args...); } -template -inline std::string _log_msg(const std::string &file, int line, T value, +template +inline std::string _log_msg(LogLevel level, bool append_newline, + const std::string &file, int line, T value, Args... 
args) { std::stringstream ss; - _log_header(ss, Level, file, line); + _log_header(ss, level, file, line); _log_helper(ss, value, args...); - if constexpr (AppendNewLine) ss << std::endl; + if (append_newline) ss << std::endl; return ss.str(); } -template -inline void _log(const std::string &file, int line, T value, Args... args) { - if (Level >= get_logging().get_level()) { - std::clog << _log_msg(file, line, value, args...); +template +inline void _log(LogLevel level, const std::string &file, int line, T value, + Args... args) { + if (level >= get_logging().get_level()) { + std::clog << _log_msg(level, true, file, line, value, args...); } - if constexpr (Level == ERROR) { + if (level == ERROR) { throw std::runtime_error("ARK runtime error"); } } template inline void _err(const std::string &file, int line, T value, Args... args) { - throw Exception(_log_msg(file, line, value, args...)); + throw Exception(_log_msg(ERROR, false, file, line, value, args...)); } // Logging. #define LOG(level, ...) \ do { \ - ark::_log(__FILE__, __LINE__, __VA_ARGS__); \ + ark::_log(level, __FILE__, __LINE__, __VA_ARGS__); \ break; \ } while (0) -#define ERR(exception, ...) \ - do { \ - std::string exc_str = " (" #exception ")"; \ - ark::_err(__FILE__, __LINE__, __VA_ARGS__, exc_str); \ - break; \ +#define ERR(exception, ...) \ + do { \ + ark::_err(__FILE__, __LINE__, __VA_ARGS__); \ + break; \ } while (0) #define CHECK(cond) \ diff --git a/ark/ops/ops_test_common.hpp b/ark/ops/ops_test_common.hpp index 3848773e6..12fb88a7b 100644 --- a/ark/ops/ops_test_common.hpp +++ b/ark/ops/ops_test_common.hpp @@ -60,7 +60,7 @@ float reduction_abs_error_bound(float max_abs, int reduction_length) { // If the reduction length is too large, the error will be dominated by // the rounding error of the reduction itself. if (reduction_length > (1 << (NumFracBits + 1))) { - UNITTEST_FEXIT("reduction length is too large"); + UNITTEST_FAIL("reduction length is too large"); } float max_diff = reduction_length * 2 * max_abs * 1.0f / (1 << (NumFracBits + 1)); diff --git a/ark/unittest/unittest_utils.cpp b/ark/unittest/unittest_utils.cpp index 62725939f..4b74f9513 100644 --- a/ark/unittest/unittest_utils.cpp +++ b/ark/unittest/unittest_utils.cpp @@ -16,7 +16,7 @@ // Grep SIGALRM and exit. static void sigalrm_timeout_handler(int) { signal(SIGALRM, SIG_IGN); - UNITTEST_FEXIT("timeout"); + UNITTEST_FAIL("timeout"); } namespace ark { @@ -64,7 +64,7 @@ void wait_all_threads() { int spawn_process(std::function func) { pid_t pid = fork(); if (pid < 0) { - UNITTEST_UEXIT("fork() failed"); + UNITTEST_UNEXPECTED("fork() failed"); } else if (pid == 0) { State ret = func(); std::exit(ret); @@ -82,7 +82,7 @@ void wait_all_processes() { do { pid = wait(&status); if (pid == -1) { - UNITTEST_UEXIT("wait() failed"); + UNITTEST_UNEXPECTED("wait() failed"); } } while (!WIFEXITED(status)); status = WEXITSTATUS(status); diff --git a/ark/unittest/unittest_utils.h b/ark/unittest/unittest_utils.h index 423c8536e..383f49b6d 100644 --- a/ark/unittest/unittest_utils.h +++ b/ark/unittest/unittest_utils.h @@ -80,161 +80,161 @@ std::string get_kernel_code(const std::string &name); } while (0) // Fail the test. -#define UNITTEST_FEXIT(...) UNITTEST_EXIT(ark::unittest::FAILURE, __VA_ARGS__) +#define UNITTEST_FAIL(...) UNITTEST_EXIT(ark::unittest::FAILURE, __VA_ARGS__) -// Unexpected error during test. -#define UNITTEST_UEXIT(...) \ +// Unexpected error occurred inside the unittest framework. +#define UNITTEST_UNEXPECTED(...) 
\ UNITTEST_EXIT(ark::unittest::UNEXPECTED, __VA_ARGS__) // Success. -#define UNITTEST_SEXIT() UNITTEST_EXIT(ark::unittest::SUCCESS, "") +#define UNITTEST_SUCCESS() UNITTEST_EXIT(ark::unittest::SUCCESS, "") // Check if the given condition is true. -#define UNITTEST_TRUE(cond) \ - do { \ - if (cond) { \ - break; \ - } \ - UNITTEST_FEXIT("condition `" #cond "` is false"); \ +#define UNITTEST_TRUE(cond) \ + do { \ + if (cond) { \ + break; \ + } \ + UNITTEST_FAIL("condition `" #cond "` is false"); \ } while (0) // Check if the given condition is false. -#define UNITTEST_FALSE(cond) \ - do { \ - if (cond) { \ - UNITTEST_FEXIT("condition `" #cond "` is true"); \ - } \ - break; \ +#define UNITTEST_FALSE(cond) \ + do { \ + if (cond) { \ + UNITTEST_FAIL("condition `" #cond "` is true"); \ + } \ + break; \ } while (0) // Check if the given expressions are equal. -#define UNITTEST_EQ(exp0, exp1) \ - do { \ - auto _v0 = (exp0); \ - auto _v1 = (exp1); \ - if (_v0 == static_cast(_v1)) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp0 "` (value: ", _v0, \ - ") != `" #exp1 "` (value: ", _v1, ")"); \ +#define UNITTEST_EQ(exp0, exp1) \ + do { \ + auto _v0 = (exp0); \ + auto _v1 = (exp1); \ + if (_v0 == static_cast(_v1)) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp0 "` (value: ", _v0, \ + ") != `" #exp1 "` (value: ", _v1, ")"); \ } while (0) // Check if the given expressions are not equal. -#define UNITTEST_NE(exp0, exp1) \ - do { \ - auto _v0 = (exp0); \ - auto _v1 = (exp1); \ - if (_v0 != static_cast(_v1)) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp0 "` (value: ", _v0, \ - ") == `" #exp1 "` (value: ", _v1, ")"); \ +#define UNITTEST_NE(exp0, exp1) \ + do { \ + auto _v0 = (exp0); \ + auto _v1 = (exp1); \ + if (_v0 != static_cast(_v1)) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp0 "` (value: ", _v0, \ + ") == `" #exp1 "` (value: ", _v1, ")"); \ } while (0) // Check if the `exp0` is less than `exp1`. -#define UNITTEST_LT(exp0, exp1) \ - do { \ - auto _v0 = (exp0); \ - auto _v1 = (exp1); \ - if (_v0 < static_cast(_v1)) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp0 "` (value: ", _v0, \ - ") >= `" #exp1 "` (value: ", _v1, ")"); \ - } while (0) - -// Check if the `exp0` is less than or equal to `exp1`. -#define UNITTEST_LE(exp0, exp1) \ +#define UNITTEST_LT(exp0, exp1) \ do { \ auto _v0 = (exp0); \ auto _v1 = (exp1); \ - if (_v0 <= static_cast(_v1)) { \ + if (_v0 < static_cast(_v1)) { \ break; \ } \ - UNITTEST_FEXIT("`" #exp0 "` (value: ", _v0, \ - ") > `" #exp1 "` (value: ", _v1, ")"); \ + UNITTEST_FAIL("`" #exp0 "` (value: ", _v0, \ + ") >= `" #exp1 "` (value: ", _v1, ")"); \ } while (0) -// Check if the `exp0` is greater than `exp1`. -#define UNITTEST_GT(exp0, exp1) \ - do { \ - auto _v0 = (exp0); \ - auto _v1 = (exp1); \ - if (_v0 > static_cast(_v1)) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp0 "` (value: ", _v0, \ - ") <= `" #exp1 "` (value: ", _v1, ")"); \ +// Check if the `exp0` is less than or equal to `exp1`. +#define UNITTEST_LE(exp0, exp1) \ + do { \ + auto _v0 = (exp0); \ + auto _v1 = (exp1); \ + if (_v0 <= static_cast(_v1)) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp0 "` (value: ", _v0, \ + ") > `" #exp1 "` (value: ", _v1, ")"); \ } while (0) -// Check if the `exp0` is greater than or equal to `exp1`. -#define UNITTEST_GE(exp0, exp1) \ +// Check if the `exp0` is greater than `exp1`. 
+#define UNITTEST_GT(exp0, exp1) \ do { \ auto _v0 = (exp0); \ auto _v1 = (exp1); \ - if (_v0 >= static_cast(_v1)) { \ + if (_v0 > static_cast(_v1)) { \ break; \ } \ - UNITTEST_FEXIT("`" #exp0 "` (value: ", _v0, \ - ") < `" #exp1 "` (value: ", _v1, ")"); \ + UNITTEST_FAIL("`" #exp0 "` (value: ", _v0, \ + ") <= `" #exp1 "` (value: ", _v1, ")"); \ + } while (0) + +// Check if the `exp0` is greater than or equal to `exp1`. +#define UNITTEST_GE(exp0, exp1) \ + do { \ + auto _v0 = (exp0); \ + auto _v1 = (exp1); \ + if (_v0 >= static_cast(_v1)) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp0 "` (value: ", _v0, \ + ") < `" #exp1 "` (value: ", _v1, ")"); \ } while (0) // Check if the given expression throws a given exception. -#define UNITTEST_THROW(exp, exception) \ - do { \ - try { \ - (exp); \ - } catch (const ark::InternalError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp "` unexpectedly throws a InternalError"); \ - } catch (const ark::InvalidUsageError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp \ - "` unexpectedly throws an InvalidUsageError"); \ - } catch (const ark::ModelError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp "` unexpectedly throws a ModelError"); \ - } catch (const ark::PlanError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp "` unexpectedly throws a PlanError"); \ - } catch (const ark::UnsupportedError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp \ - "` unexpectedly throws an UnsupportedError"); \ - } catch (const ark::SystemError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp "` unexpectedly throws a SystemError"); \ - } catch (const ark::GpuError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp "` unexpectedly throws a GpuError"); \ - } catch (const ark::UnitTestError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp "` unexpectedly throws a UnitTestError"); \ - } catch (const ark::BaseError &e) { \ - if (std::is_same::value) { \ - break; \ - } \ - UNITTEST_FEXIT("`" #exp "` unexpectedly throws a BaseError"); \ - } catch (...) 
{ \ - UNITTEST_FEXIT("`" #exp "` throws an unknown exception"); \ - } \ - UNITTEST_FEXIT("`" #exp "` does not throw"); \ +#define UNITTEST_THROW(exp, exception) \ + do { \ + try { \ + (exp); \ + } catch (const ark::InternalError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp "` unexpectedly throws a InternalError"); \ + } catch (const ark::InvalidUsageError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp \ + "` unexpectedly throws an InvalidUsageError"); \ + } catch (const ark::ModelError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp "` unexpectedly throws a ModelError"); \ + } catch (const ark::PlanError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp "` unexpectedly throws a PlanError"); \ + } catch (const ark::UnsupportedError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp \ + "` unexpectedly throws an UnsupportedError"); \ + } catch (const ark::SystemError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp "` unexpectedly throws a SystemError"); \ + } catch (const ark::GpuError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp "` unexpectedly throws a GpuError"); \ + } catch (const ark::UnitTestError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp "` unexpectedly throws a UnitTestError"); \ + } catch (const ark::BaseError &e) { \ + if (std::is_same::value) { \ + break; \ + } \ + UNITTEST_FAIL("`" #exp "` unexpectedly throws a BaseError"); \ + } catch (...) { \ + UNITTEST_FAIL("`" #exp "` throws an unknown exception"); \ + } \ + UNITTEST_FAIL("`" #exp "` does not throw"); \ } while (0) // Log a message. diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index efb9aea3e..2e6ce51a5 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -18,5 +18,23 @@ FetchContent_MakeAvailable(pybind11) file(GLOB_RECURSE BIND_SOURCES CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) pybind11_add_module(ark_py ${BIND_SOURCES}) -set_target_properties(ark_py PROPERTIES OUTPUT_NAME _ark_core) +set_target_properties(ark_py PROPERTIES OUTPUT_NAME core LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/ark) target_link_libraries(ark_py PRIVATE ark_static) +target_include_directories(ark_py SYSTEM PRIVATE ${DLPACK_INCLUDE_DIRS}) +target_include_directories(ark_py PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../ark) +add_custom_target(py_copy + COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/ark ${CMAKE_CURRENT_BINARY_DIR}/ark +) +add_dependencies(ark_py py_copy) + +if(ARK_USE_CUDA) + target_include_directories(ark_py SYSTEM PRIVATE + ${CUDAToolkit_INCLUDE_DIRS} + ) +endif() + +if(ARK_USE_ROCM) + target_include_directories(ark_py SYSTEM PRIVATE + /opt/rocm/include + ) +endif() diff --git a/python/ark/__init__.py b/python/ark/__init__.py index b1d0f7873..24e4acfc4 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -1,19 +1,16 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-import sys import os if os.environ.get("ARK_ROOT", None) is None: os.environ["ARK_ROOT"] = os.path.abspath(os.path.dirname(__file__)) -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) - -import _ark_core +from .core import version from .model import Model -__version__ = _ark_core.version() +__version__ = version() def version(): @@ -21,11 +18,6 @@ def version(): return __version__ -def srand(seed): - """Sets the seed for random number generation.""" - _ark_core.srand(seed) - - def set_rank(rank): """Sets the rank of the current process.""" Model.set_rank(rank) @@ -41,16 +33,7 @@ def set_world_size(world_size): from .module import Module from .runtime import Runtime from .serialize import save, load -from .data_type import ( - DataType, - fp16, - fp32, - int32, - uint32, - int8, - uint8, - byte, -) +from .data_type import * from .ops import * from .planner import * from .error import * diff --git a/python/ark/data_type.py b/python/ark/data_type.py index fe95d0d88..4638cf972 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -2,10 +2,21 @@ # Licensed under the MIT license. import numpy -from . import _ark_core +from . import core +from . import log +__all__ = [ + "DataType", + "fp16", + "fp32", + "int32", + "uint32", + "int8", + "uint8", +] -_REGISTRY_DATA_TYPE = { + +REGISTRY_DATA_TYPE = { "fp32": {"np": numpy.float32}, "fp16": {"np": numpy.float16}, "bf16": {"np": None}, @@ -13,19 +24,16 @@ "uint32": {"np": numpy.uint32}, "int8": {"np": numpy.int8}, "uint8": {"np": numpy.uint8}, - "byte": {"np": numpy.ubyte}, } class MetaDataType(type): def __new__(cls, name, bases, attrs): new_class = super().__new__(cls, name, bases, attrs) - if name in _REGISTRY_DATA_TYPE: - reg = _REGISTRY_DATA_TYPE[name] + if name in REGISTRY_DATA_TYPE: + reg = REGISTRY_DATA_TYPE[name] new_class.to_numpy = staticmethod(lambda: reg["np"]) - new_class.ctype = staticmethod( - lambda: getattr(_ark_core, name.upper()) - ) + new_class.ctype = staticmethod(lambda: getattr(core, name.upper())) new_class.element_size = staticmethod( lambda: new_class.ctype().bytes() ) @@ -49,12 +57,16 @@ def from_numpy(np_type: numpy.dtype) -> "DataType": DataType: The corresponding ark data type. Raises: - ValueError: If there is no defined conversion from numpy data type to ark data type. + InvalidUsageError: If there is no defined conversion from numpy data type to ark data type. """ - for type_name, reg in _REGISTRY_DATA_TYPE.items(): + if not isinstance(np_type, numpy.dtype): + raise log.InvalidUsageError( + f"Expected a numpy data type, but got {type(np_type)}" + ) + for type_name, reg in REGISTRY_DATA_TYPE.items(): if reg["np"] == np_type: return DataType.from_name(type_name) - raise ValueError( + raise log.InvalidUsageError( f"Undefined conversion from numpy data type {np_type}" f" to ark data type." ) @@ -75,16 +87,16 @@ def from_name(type_name: str) -> "DataType": """ ret = globals().get(type_name, None) if ret is None: - raise ValueError(f"Undefined data type {type_name}") + raise log.InvalidUsageError(f"Undefined data type {type_name}") return ret @staticmethod - def from_ctype(ctype: _ark_core._DataType) -> "DataType": + def from_ctype(ctype: core.CoreDataType) -> "DataType": """ Return the corresponding ark data type. Parameters: - ctype (_ark_core._DataType): The cpp type. + ctype (core.CoreDataType): The cpp type. Returns: DataType: The corresponding ark data type. @@ -92,6 +104,10 @@ def from_ctype(ctype: _ark_core._DataType) -> "DataType": Raises: ValueError: If the data type is not defined. 
""" + if not isinstance(ctype, core.CoreDataType): + raise log.InvalidUsageError( + f"Expected a core data type, but got {type(ctype)}" + ) return DataType.from_name(ctype.name().lower()) @staticmethod @@ -105,12 +121,12 @@ def to_numpy() -> numpy.dtype: ... @staticmethod - def ctype() -> _ark_core._DataType: + def ctype() -> core.CoreDataType: """ Return the corresponding cpp type. Returns: - _ark_core._DataType: The corresponding cpp type. + core.CoreDataType: The corresponding cpp type. """ ... diff --git a/python/ark/error.py b/python/ark/error.py index 4ffe6a3f8..b92f4e71c 100644 --- a/python/ark/error.py +++ b/python/ark/error.py @@ -1,14 +1,14 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from _ark_core import _BaseError as BaseError -from _ark_core import _InternalError as InternalError -from _ark_core import _InvalidUsageError as InvalidUsageError -from _ark_core import _ModelError as ModelError -from _ark_core import _PlanError as PlanError -from _ark_core import _UnsupportedError as UnsupportedError -from _ark_core import _SystemError as SystemError -from _ark_core import _GpuError as GpuError +from .core import BaseError +from .core import InternalError +from .core import InvalidUsageError +from .core import ModelError +from .core import PlanError +from .core import UnsupportedError +from .core import SystemError +from .core import GpuError __all__ = [ "BaseError", diff --git a/python/ark/init.py b/python/ark/init.py index be71e8e02..f8e226ad1 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -1,15 +1,17 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import _ark_core +from . import core from .model import Model -from .runtime import _RuntimeState +from .runtime import RuntimeState + +__all__ = ["init"] def init(): """Initializes ARK.""" Model.reset() - if _RuntimeState.executor is not None: - if not _RuntimeState.executor.destroyed(): - _RuntimeState.executor.destroy() - _ark_core.init() + if RuntimeState.executor is not None: + if not RuntimeState.executor.destroyed(): + RuntimeState.executor.destroy() + core.init() diff --git a/python/ark/log.py b/python/ark/log.py new file mode 100644 index 000000000..6dba218ed --- /dev/null +++ b/python/ark/log.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import inspect +from .core import LogLevel, log +from .error import * +from .error import __all__ as error_all + +__all__ = [*error_all, "DEBUG", "INFO", "WARN"] + + +def DEBUG(msg: str) -> None: + frame = inspect.currentframe().f_back + info = inspect.getframeinfo(frame) + log(LogLevel.DEBUG, info.filename, info.lineno, msg) + + +def INFO(msg: str) -> None: + frame = inspect.currentframe().f_back + info = inspect.getframeinfo(frame) + log(LogLevel.INFO, info.filename, info.lineno, msg) + + +def WARN(msg: str) -> None: + frame = inspect.currentframe().f_back + info = inspect.getframeinfo(frame) + log(LogLevel.WARN, info.filename, info.lineno, msg) diff --git a/python/ark/model.py b/python/ark/model.py index e6208fc16..bfd74d5e0 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -2,57 +2,60 @@ # Licensed under the MIT license. from typing import NewType -from _ark_core import _Model +from .core import CoreModel -_ModelState = NewType("_ModelState", None) +__all__ = ["Model"] -class Model(_Model): +ModelState = NewType("ModelState", None) + + +class Model(CoreModel): @staticmethod def get_model(): """ Get the underlying model. 
""" - if _ModelState.model is None: - _ModelState.model = Model(_ModelState.rank, _ModelState.world_size) - return _ModelState.model + if ModelState.model is None: + ModelState.model = Model(ModelState.rank, ModelState.world_size) + return ModelState.model @staticmethod def get_rank(): """ Get the rank of the model. """ - return _ModelState.rank + return ModelState.rank @staticmethod def get_world_size(): """ Get the world size of the model. """ - return _ModelState.world_size + return ModelState.world_size @staticmethod def set_rank(rank: int): """ Set the rank of the model. """ - _ModelState.rank = rank + ModelState.rank = rank @staticmethod def set_world_size(world_size: int): """ Set the world size of the model. """ - _ModelState.world_size = world_size + ModelState.world_size = world_size @staticmethod def reset(): """ Reset the model state. """ - _ModelState.model = None - _ModelState.rank = 0 - _ModelState.world_size = 1 + ModelState.model = None + ModelState.rank = 0 + ModelState.world_size = 1 def compress(self) -> "Model": """ @@ -73,9 +76,9 @@ def serialize(self, pretty: bool = True) -> str: return super().serialize(pretty) -class _ModelState: +class ModelState: """ - The _ModelState class is used to store the state of the model. + The ModelState class is used to store the state of the model. """ model: Model = None diff --git a/python/ark/module.py b/python/ark/module.py index 62b941281..368f36cf7 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -5,6 +5,9 @@ import numpy as np from typing import Any, Dict from .tensor import Parameter +from . import log + +__all__ = ["Module"] class Module: @@ -63,7 +66,7 @@ def load_state_dict( Loads a model from a state_dict and copy the parameters to the device GPU. Must be called after the executor is launched. """ - logging.info("Loading model from state_dict") + log.INFO("Loading model from state_dict") all_keys = set(state_dict.keys()) pd = self.params_dict(prefix) @@ -71,9 +74,7 @@ def load_state_dict( param.from_numpy(state_dict[name]) all_keys.remove(name) if all_keys: - logging.warning( - f"{len(all_keys)} unused parameter(s) in state_dict" - ) + log.WARN(f"{len(all_keys)} unused parameter(s) in state_dict") def state_dict(self, prefix: str = "") -> Dict[str, np.ndarray]: """ diff --git a/python/ark/ops.py b/python/ark/ops.py index 484e248ca..514adcb3c 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -6,9 +6,43 @@ from .tensor import Dims, Tensor, Parameter, NullTensor from .data_type import DataType, fp32 from .model import Model +from . 
import log -def _is_list_or_tuple(obj): +__all__ = [ + "tensor", + "parameter", + "reshape", + "identity", + "sharding", + "reduce_sum", + "reduce_mean", + "reduce_max", + "layernorm", + "softmax", + "transpose", + "matmul", + "exp", + "sqrt", + "rsqrt", + "rope", + "relu", + "gelu", + "sigmoid", + "add", + "sub", + "mul", + "div", + "all_reduce", + "embedding", + "cast", + "constant", + "ones", + "zeros", +] + + +def is_list_or_tuple(obj): return isinstance(obj, list) or isinstance(obj, tuple) @@ -21,14 +55,22 @@ def _tensor( rank: int = -1, name: str = "", ) -> Tensor: - if not _is_list_or_tuple(shape): - raise ValueError("shape should be a list or tuple of integers") - if not _is_list_or_tuple(strides): - raise ValueError("strides should be a list or tuple of integers") - if not _is_list_or_tuple(offsets): - raise ValueError("offsets should be a list or tuple of integers") - if not _is_list_or_tuple(padded_shape): - raise ValueError("padded_shape should be a list or tuple of integers") + if not is_list_or_tuple(shape): + raise log.InvalidUsageError( + "shape should be a list or tuple of integers" + ) + if not is_list_or_tuple(strides): + raise log.InvalidUsageError( + "strides should be a list or tuple of integers" + ) + if not is_list_or_tuple(offsets): + raise log.InvalidUsageError( + "offsets should be a list or tuple of integers" + ) + if not is_list_or_tuple(padded_shape): + raise log.InvalidUsageError( + "padded_shape should be a list or tuple of integers" + ) # only support tensors with up to 4 dimensions if ( len(shape) > 4 @@ -36,7 +78,9 @@ def _tensor( or len(offsets) > 4 or len(padded_shape) > 4 ): - raise ValueError("Only support tensors with up to 4 dimensions") + raise log.InvalidUsageError( + "Only support tensors with up to 4 dimensions" + ) return Model.get_model().tensor( Dims(shape), dtype.ctype(), @@ -190,7 +234,7 @@ def identity( dep_tensors = [] for dep in deps: if not isinstance(dep, Tensor): - raise TypeError("All dependencies should be a tensor") + raise log.InvalidUsageError("All dependencies should be a tensor") dep_tensors.append(dep._tensor) return Tensor(Model.get_model().identity(input._tensor, dep_tensors, name)) @@ -353,11 +397,15 @@ def reshape( # tensors shape is [128, 64] tensor = ark.reshape(tensor, [2, 64, 64]) """ - if not _is_list_or_tuple(shape): - raise ValueError("shape should be a list or tuple of integers") + if not is_list_or_tuple(shape): + raise log.InvalidUsageError( + "shape should be a list or tuple of integers" + ) # only support tensors with up to 4 dimensions if len(shape) > 4: - raise ValueError("Only support tensors with up to 4 dimensions") + raise log.InvalidUsageError( + "Only support tensors with up to 4 dimensions" + ) return Tensor( Model.get_model().reshape(input._tensor, Dims(shape), allowzero, name) ) @@ -495,11 +543,13 @@ def transpose( """ if output is not NullTensor: output = output._tensor - if not _is_list_or_tuple(perm): - raise ValueError("perm should be a list or tuple of integers") + if not is_list_or_tuple(perm): + raise log.InvalidUsageError( + "perm should be a list or tuple of integers" + ) # only support tensors with up to 4 dimensions if len(perm) > 4: - raise ValueError("Only support perm up to 4 dimensions") + raise log.InvalidUsageError("Only support perm up to 4 dimensions") return Tensor( Model.get_model().transpose(input._tensor, perm, output, name) ) @@ -612,39 +662,6 @@ def all_reduce( return Tensor(_tensor) -__all__ = [ - "tensor", - "parameter", - "reshape", - "identity", - "sharding", - 
"reduce_sum", - "reduce_mean", - "reduce_max", - "layernorm", - "softmax", - "transpose", - "matmul", - "exp", - "sqrt", - "rsqrt", - "rope", - "relu", - "gelu", - "sigmoid", - "add", - "sub", - "mul", - "div", - "all_reduce", - "embedding", - "cast", - "constant", - "ones", - "zeros", -] - - # def im2col( # input: Tensor, # kernel_height: int, diff --git a/python/ark/planner.py b/python/ark/planner.py index e7eb2e7ed..3c82719be 100644 --- a/python/ark/planner.py +++ b/python/ark/planner.py @@ -5,10 +5,13 @@ import json from typing import Callable, Dict, List, Any -from _ark_core import _Planner, _PlannerContext +from .core import CorePlanner, CorePlannerContext from .model import Model +__all__ = ["Plan", "PlannerContext", "Planner"] + + def idnt(indent): return " " * indent @@ -162,7 +165,7 @@ def from_file(file_path: str) -> "Plan": return Plan(plan) -class PlannerContext(_PlannerContext): +class PlannerContext(CorePlannerContext): def __init__(self, **kwargs): """ Plan manager for specifying the parallelization and tiling configuration of the operators in the context. @@ -205,7 +208,7 @@ def __exit__(self, exc_type, exc_value, exc_tb): del self -class Planner(_Planner): +class Planner(CorePlanner): def __init__(self, device_id: int = 0): compressed = Model.get_model().compress() super().__init__(compressed, device_id) @@ -225,6 +228,3 @@ def plan(self) -> Plan: Generate an execution plan. """ return Plan.from_str(super().plan(pretty=False)) - - -__all__ = ["Plan", "PlannerContext", "Planner"] diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 495fc1c24..017350103 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -4,20 +4,23 @@ import logging from enum import Enum -from _ark_core import _Executor +from .core import CoreExecutor from .planner import Planner, Plan -class _RuntimeState: +__all__ = ["Executor", "Runtime"] + + +class RuntimeState: """ - The _RuntimeState class is used to store the state of the model. + The RuntimeState class is used to store the state of the model. """ runtime = None executor = None -class Executor(_Executor): +class Executor(CoreExecutor): pass @@ -40,14 +43,14 @@ def get_runtime() -> "Runtime": """ Get the runtime. 
""" - if _RuntimeState.runtime is None: - _RuntimeState.runtime = Runtime() - return _RuntimeState.runtime + if RuntimeState.runtime is None: + RuntimeState.runtime = Runtime() + return RuntimeState.runtime def __init__(self): self.executor: Executor = None self.state: Runtime.State = Runtime.State.Init - _RuntimeState.runtime = self + RuntimeState.runtime = self def __del__(self): self.reset() @@ -92,19 +95,19 @@ def launch( # If the RuntimeState is init, we need to create a new executor and # compile the kernels if self.state == Runtime.State.Init: - if _RuntimeState.executor is not None: - if not _RuntimeState.executor.destroyed(): + if RuntimeState.executor is not None: + if not RuntimeState.executor.destroyed(): logging.warn("Destroying an old executor") - _RuntimeState.executor.destroy() + RuntimeState.executor.destroy() - _RuntimeState.executor = Executor( + RuntimeState.executor = Executor( device_id, stream, "ArkRuntime", str(plan), loop_mode, ) - self.executor = _RuntimeState.executor + self.executor = RuntimeState.executor self.executor.compile() self.executor.launch() self.state = Runtime.State.LaunchedNotRunning diff --git a/python/ark/tensor.py b/python/ark/tensor.py index d69f2aabc..197d92921 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -4,23 +4,23 @@ import numpy as np from typing import List -from _ark_core import _Dims, _Tensor, _NullTensor +from .core import CoreDims, CoreTensor, NullTensor from .data_type import DataType from .runtime import Runtime -NullTensor = _NullTensor +__all__ = ["Dims", "Tensor", "Parameter", "NullTensor"] -class Dims(_Dims): +class Dims(CoreDims): pass class Tensor: - def __init__(self, _tensor: _Tensor): + def __init__(self, _tensor: CoreTensor): """ Initializes a new instance of the Tensor class. Args: - _tensor (_ark_core._Tensor): The underlying _Tensor object. + _tensor (core.CoreTensor): The underlying CoreTensor object. """ self._tensor = _tensor @@ -97,7 +97,7 @@ class Parameter(Tensor): A tensor as a parameter. """ - def __init__(self, _tensor: _Tensor): + def __init__(self, _tensor: CoreTensor): """ Initializes a new instance of the Parameter class. 
""" diff --git a/python/ark_py.cpp b/python/ark_py.cpp index 1bc4255d6..294c9d25d 100644 --- a/python/ark_py.cpp +++ b/python/ark_py.cpp @@ -12,6 +12,7 @@ extern void register_dims(py::module &m); extern void register_error(py::module &m); extern void register_executor(py::module &m); extern void register_init(py::module &m); +extern void register_log(py::module &m); extern void register_model_graph(py::module &m); extern void register_model(py::module &m); extern void register_planner(py::module &m); @@ -19,7 +20,7 @@ extern void register_random(py::module &m); extern void register_tensor(py::module &m); extern void register_version(py::module &m); -PYBIND11_MODULE(_ark_core, m) { +PYBIND11_MODULE(core, m) { m.doc() = "Bind ARK C++ APIs to Python"; register_data_type(m); @@ -27,6 +28,7 @@ PYBIND11_MODULE(_ark_core, m) { register_error(m); register_executor(m); register_init(m); + register_log(m); register_model_graph(m); register_model(m); register_planner(m); diff --git a/python/data_type_py.cpp b/python/data_type_py.cpp index 21a21d694..dcfffb2b4 100644 --- a/python/data_type_py.cpp +++ b/python/data_type_py.cpp @@ -10,7 +10,7 @@ namespace py = pybind11; void register_data_type(py::module &m) { - py::class_(m, "_DataType") + py::class_(m, "CoreDataType") .def("__eq__", &ark::DataType::operator==) .def("__ne__", &ark::DataType::operator!=) .def("is_null", &ark::DataType::is_null) diff --git a/python/dims_py.cpp b/python/dims_py.cpp index 78e732a9c..31e9e0b21 100644 --- a/python/dims_py.cpp +++ b/python/dims_py.cpp @@ -13,7 +13,7 @@ namespace py = pybind11; void register_dims(py::module &m) { m.attr("DIMS_LEN") = py::int_(ark::DIMS_LEN); - py::class_(m, "_Dims") + py::class_(m, "CoreDims") .def(py::init<>()) .def(py::init()) .def(py::init()) diff --git a/python/error_py.cpp b/python/error_py.cpp index b42f79773..ce6e7cc53 100644 --- a/python/error_py.cpp +++ b/python/error_py.cpp @@ -9,12 +9,11 @@ namespace py = pybind11; -#define REGISTER_ERROR_PY(_name) \ - py::register_exception(m, "_" #_name, \ - m.attr("_BaseError").ptr()) +#define REGISTER_ERROR_PY(_name) \ + py::register_exception(m, #_name, m.attr("BaseError").ptr()) void register_error(py::module &m) { - py::register_exception(m, "_BaseError"); + py::register_exception(m, "BaseError"); REGISTER_ERROR_PY(InternalError); REGISTER_ERROR_PY(InvalidUsageError); diff --git a/python/executor_py.cpp b/python/executor_py.cpp index b1e468608..a2195f106 100644 --- a/python/executor_py.cpp +++ b/python/executor_py.cpp @@ -41,7 +41,7 @@ static void tensor_read(ark::Executor *exe, const ark::Tensor &tensor, } void register_executor(py::module &m) { - py::class_(m, "_Executor") + py::class_(m, "CoreExecutor") .def(py::init([](int device_id, uintptr_t stream, const std::string &name, const std::string &plan, bool loop_mode) { diff --git a/python/init_py.cpp b/python/init_py.cpp index ebae7d42a..f7df9e503 100644 --- a/python/init_py.cpp +++ b/python/init_py.cpp @@ -6,7 +6,6 @@ #include #include -#include namespace py = pybind11; diff --git a/python/log_py.cpp b/python/log_py.cpp new file mode 100644 index 000000000..940094191 --- /dev/null +++ b/python/log_py.cpp @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +#include +#include +#include + +#include + +namespace py = pybind11; + +void register_log(py::module &m) { + py::enum_(m, "LogLevel") + .value("DEBUG", ark::LogLevel::DEBUG) + .value("INFO", ark::LogLevel::INFO) + .value("WARN", ark::LogLevel::WARN) + .value("ERROR", ark::LogLevel::ERROR) + .export_values(); + m.def("log", &ark::log); +} diff --git a/python/model_graph_py.cpp b/python/model_graph_py.cpp index 0fd806b7c..b8ac78826 100644 --- a/python/model_graph_py.cpp +++ b/python/model_graph_py.cpp @@ -10,7 +10,7 @@ namespace py = pybind11; void register_model_graph(py::module &m) { - py::class_(m, "_ModelGraph") + py::class_(m, "CoreModelGraph") .def("serialize", &ark::ModelGraph::serialize, py::arg("pretty") = true); } diff --git a/python/model_py.cpp b/python/model_py.cpp index c224a3d5b..b9e7ec54f 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -11,7 +11,7 @@ namespace py = pybind11; void register_model(py::module &m) { - py::class_(m, "_Model") + py::class_(m, "CoreModel") .def(py::init(), py::arg("rank"), py::arg("world_size")) .def("rank", &ark::Model::rank) .def("world_size", &ark::Model::world_size) diff --git a/python/planner_py.cpp b/python/planner_py.cpp index f3462b1c9..f0af0fa35 100644 --- a/python/planner_py.cpp +++ b/python/planner_py.cpp @@ -11,18 +11,18 @@ namespace py = pybind11; void register_planner(py::module &m) { - py::class_(m, "_PlannerContext") + py::class_(m, "CorePlannerContext") .def(py::init()) .def("processor_range", &ark::PlannerContext::processor_range, py::arg("start"), py::arg("end"), py::arg("step") = 1) .def("warp_range", &ark::PlannerContext::warp_range, py::arg("start"), - py::arg("end"), py::arg("step") = 1) + py::arg("end"), py::arg("step") = 1) .def("sram_range", &ark::PlannerContext::sram_range, py::arg("start"), - py::arg("end"), py::arg("step") = 1) + py::arg("end"), py::arg("step") = 1) .def("sync", &ark::PlannerContext::sync, py::arg("sync")) .def("config", &ark::PlannerContext::config, py::arg("config")); - py::class_(m, "_Planner") + py::class_(m, "CorePlanner") .def(py::init()) .def("install_config_rule", [](ark::Planner *self, const py::function &rule) { diff --git a/python/tensor_py.cpp b/python/tensor_py.cpp index fbd909d3d..e85352f53 100644 --- a/python/tensor_py.cpp +++ b/python/tensor_py.cpp @@ -10,7 +10,7 @@ namespace py = pybind11; void register_tensor(py::module &m) { - py::class_(m, "_Tensor") + py::class_(m, "CoreTensor") .def("id", &ark::Tensor::id) .def("shape", &ark::Tensor::shape, py::return_value_policy::reference) .def("strides", &ark::Tensor::strides, @@ -22,5 +22,5 @@ void register_tensor(py::module &m) { .def("data_type", &ark::Tensor::data_type, py::return_value_policy::reference); - m.attr("_NullTensor") = &ark::NullTensor; + m.attr("NullTensor") = &ark::NullTensor; } diff --git a/python/unittest/common.py b/python/unittest/common.py new file mode 100644 index 000000000..0c385e89a --- /dev/null +++ b/python/unittest/common.py @@ -0,0 +1,28 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import pytest +import ark + + +def pytest_ark(need_torch: bool = False): + """ + Decorator for ARK unit tests. 
+ """ + + def decorator(test_func): + if need_torch: + try: + import torch + except ImportError: + return pytest.mark.skip(reason="torch is not installed")( + test_func + ) + + def wrapper(*args, **kwargs): + ark.init() + test_func(*args, **kwargs) + + return wrapper + + return decorator diff --git a/python/unittest/test.py b/python/unittest/test.py index 2d9647e3a..fe2114c71 100644 --- a/python/unittest/test.py +++ b/python/unittest/test.py @@ -1,12 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import sys -import os - -sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)) + "/..") -sys.path.insert(0, os.environ.get("ARK_ROOT", ".") + "/python") - +from test_data_type import * from test_error import * from test_model import * from test_runtime import * diff --git a/python/unittest/test_data_type.py b/python/unittest/test_data_type.py new file mode 100644 index 000000000..a37b68a90 --- /dev/null +++ b/python/unittest/test_data_type.py @@ -0,0 +1,75 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from common import ark, pytest_ark +import pytest +import numpy as np + + +@pytest_ark() +def test_data_type_from_numpy(): + assert ark.DataType.from_numpy(np.dtype(np.float32)) == ark.fp32 + assert ark.DataType.from_numpy(np.dtype(np.float16)) == ark.fp16 + assert ark.DataType.from_numpy(np.dtype(np.int32)) == ark.int32 + assert ark.DataType.from_numpy(np.dtype(np.uint32)) == ark.uint32 + assert ark.DataType.from_numpy(np.dtype(np.int8)) == ark.int8 + assert ark.DataType.from_numpy(np.dtype(np.uint8)) == ark.uint8 + + with pytest.raises(ark.error.InvalidUsageError): + ark.DataType.from_numpy(None) + + +@pytest_ark() +def test_data_type_from_name(): + assert ark.DataType.from_name("fp32") == ark.fp32 + assert ark.DataType.from_name("fp16") == ark.fp16 + assert ark.DataType.from_name("int32") == ark.int32 + assert ark.DataType.from_name("uint32") == ark.uint32 + assert ark.DataType.from_name("int8") == ark.int8 + assert ark.DataType.from_name("uint8") == ark.uint8 + + with pytest.raises(ark.error.InvalidUsageError): + ark.DataType.from_name("unknown") + + +@pytest_ark() +def test_data_type_from_ctype(): + assert ark.DataType.from_ctype(ark.core.FP32) == ark.fp32 + assert ark.DataType.from_ctype(ark.core.FP16) == ark.fp16 + assert ark.DataType.from_ctype(ark.core.INT32) == ark.int32 + assert ark.DataType.from_ctype(ark.core.UINT32) == ark.uint32 + assert ark.DataType.from_ctype(ark.core.INT8) == ark.int8 + assert ark.DataType.from_ctype(ark.core.UINT8) == ark.uint8 + + with pytest.raises(ark.error.InvalidUsageError): + ark.DataType.from_ctype(None) + + +@pytest_ark() +def test_data_type_to_numpy(): + assert ark.fp32.to_numpy() == np.float32 + assert ark.fp16.to_numpy() == np.float16 + assert ark.int32.to_numpy() == np.int32 + assert ark.uint32.to_numpy() == np.uint32 + assert ark.int8.to_numpy() == np.int8 + assert ark.uint8.to_numpy() == np.uint8 + + +@pytest_ark() +def test_data_type_ctype(): + assert ark.fp32.ctype() == ark.core.FP32 + assert ark.fp16.ctype() == ark.core.FP16 + assert ark.int32.ctype() == ark.core.INT32 + assert ark.uint32.ctype() == ark.core.UINT32 + assert ark.int8.ctype() == ark.core.INT8 + assert ark.uint8.ctype() == ark.core.UINT8 + + +@pytest_ark() +def test_data_type_element_size(): + assert ark.fp32.element_size() == 4 + assert ark.fp16.element_size() == 2 + assert ark.int32.element_size() == 4 + assert ark.uint32.element_size() == 4 + assert ark.int8.element_size() == 1 + assert 
ark.uint8.element_size() == 1 diff --git a/python/unittest/test_error.py b/python/unittest/test_error.py index 299e2675e..d5d1b5fe2 100644 --- a/python/unittest/test_error.py +++ b/python/unittest/test_error.py @@ -1,12 +1,13 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import ark +from common import ark, pytest_ark +@pytest_ark() def test_error(): - ark.init() try: - ark.tensor([0]) + raise ark.InternalError("test") except ark.BaseError as e: - assert isinstance(e, ark.ModelError) + assert isinstance(e, ark.InternalError) + assert str(e) == "test" diff --git a/python/unittest/test_model.py b/python/unittest/test_model.py index da8ae399a..ad40d7524 100644 --- a/python/unittest/test_model.py +++ b/python/unittest/test_model.py @@ -1,13 +1,12 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import ark +from common import ark, pytest_ark import json +@pytest_ark() def test_model(): - ark.init() - input_tensor = ark.tensor([64, 64], ark.fp16) other_tensor = ark.tensor([64, 64], ark.fp16) ark.add(input_tensor, other_tensor) diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index d91fd85c5..10a72c082 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -1,18 +1,4 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import ark - - -def test_runtime_relaunch(): - ark.init() - - with ark.Runtime.get_runtime() as rt: - assert rt.launched() == False - rt.launch() - assert rt.launched() == True - - with ark.Runtime.get_runtime() as rt: - assert rt.launched() == False - rt.launch() - assert rt.launched() == True +from common import ark, pytest_ark From 9919d69eb2f51e3fb7c5b82afb45949274af9980 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 5 Sep 2024 07:37:08 +0000 Subject: [PATCH 078/106] revert --- arkprof.py | 7 - examples/ffn/main.py | 73 -- examples/llama/model_7b_b1_s2048.py | 714 ----------- examples/llama/plan_llama2_7b_b1_s2048.json | 1206 ------------------- examples/tutorial/model_test_tutorial.py | 164 --- python/ark/__init__.py | 2 +- python/ark/module.py | 13 - python/unittest/test_conversion.py | 281 ----- python/unittest/test_torch_tracer.py | 25 - python/unittest/unittest_common.py | 28 - 10 files changed, 1 insertion(+), 2512 deletions(-) delete mode 100644 arkprof.py delete mode 100644 examples/ffn/main.py delete mode 100644 examples/llama/model_7b_b1_s2048.py delete mode 100644 examples/llama/plan_llama2_7b_b1_s2048.json delete mode 100644 examples/tutorial/model_test_tutorial.py delete mode 100644 python/unittest/test_conversion.py delete mode 100644 python/unittest/test_torch_tracer.py delete mode 100644 python/unittest/unittest_common.py diff --git a/arkprof.py b/arkprof.py deleted file mode 100644 index 5fb62e118..000000000 --- a/arkprof.py +++ /dev/null @@ -1,7 +0,0 @@ -import ark -import sys - -ark.init() -ark.Profiler(ark.Plan.from_file(sys.argv[1])).run( - iter=1000, profile_processor_groups=False -) diff --git a/examples/ffn/main.py b/examples/ffn/main.py deleted file mode 100644 index 263228d95..000000000 --- a/examples/ffn/main.py +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import torch -import torch.nn as nn -from ark.torch.tracer import tracer as ark_torch_tracer - - -class FeedForward(nn.Module): - def __init__(self, dim: int, hidden_dim: int): - super().__init__() - self.w1 = nn.Linear(dim, hidden_dim, bias=False) - self.w2 = nn.Linear(hidden_dim, dim, bias=True) - self.w3 = nn.Linear(dim, hidden_dim, bias=False) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return self.w2(nn.functional.silu(self.w1(x)) * self.w3(x)) - - -@ark_torch_tracer -class ForwardPass(nn.Module): - def __init__(self, dim: int, hidden_dim: int): - super().__init__() - self.ff = FeedForward(dim, hidden_dim) - - def forward( - self, input: torch.Tensor, target: torch.Tensor - ) -> torch.Tensor: - t = self.ff(input) - return nn.functional.mse_loss(t, target) - - -def main(): - batch_size = 128 - num_batches = 1 - dim = 1024 - hidden_dim = 4096 - num_epochs = 10 - torch.manual_seed(42) - torch.set_default_device("cuda:0") - - model = ForwardPass(dim=dim, hidden_dim=hidden_dim) - - inputs = [torch.randn(batch_size, dim) for _ in range(num_batches)] - targets = [torch.randn(batch_size, dim) for _ in range(num_batches)] - - optimizer = torch.optim.Adam(model.parameters(), lr=1e-3) - - for epoch in range(1): - optimizer.zero_grad() - avg_loss = 0 - for input, target in zip(inputs, targets): - loss = model(input, target) - avg_loss += loss.detach().item() - loss.backward() - optimizer.step() - avg_loss /= num_batches - print(f"Epoch {epoch}, Loss: {avg_loss:.6f}") - - for epoch in range(num_epochs): - optimizer.zero_grad() - avg_loss = 0 - for input, target in zip(inputs, targets): - loss = model.forward_ark(input, target) - avg_loss += loss.to_numpy()[0] - model.backward_ark(loss) - optimizer.step() - avg_loss /= num_batches - print(f"Epoch {epoch}, Loss: {avg_loss:.6f}") - - -if __name__ == "__main__": - main() diff --git a/examples/llama/model_7b_b1_s2048.py b/examples/llama/model_7b_b1_s2048.py deleted file mode 100644 index 73d349ccc..000000000 --- a/examples/llama/model_7b_b1_s2048.py +++ /dev/null @@ -1,714 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -"""LLaMA 2 Transformer model. 
- Correspond to https://github.com/facebookresearch/llama/blob/main/llama/model.py -""" - -import ark -import math -from dataclasses import dataclass -from typing import Optional -import os - - -@dataclass -class ModelArgs: - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = -1 # defined later by tokenizer - multiple_of: int = ( - 256 # make SwiGLU hidden layer size multiple of large power of 2 - ) - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - max_batch_size: int = 32 - max_seq_len: int = 2048 - - -@dataclass -class ModelArgs7B(ModelArgs): - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = -1 # defined later by tokenizer - multiple_of: int = ( - 256 # make SwiGLU hidden layer size multiple of large power of 2 - ) - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - max_batch_size: int = 32 - max_seq_len: int = 2048 - - -@dataclass -class ModelArgs13B(ModelArgs): - dim: int = 5120 - n_layers: int = 40 - n_heads: int = 40 - n_kv_heads: Optional[int] = None - vocab_size: int = -1 # defined later by tokenizer - multiple_of: int = ( - 256 # make SwiGLU hidden layer size multiple of large power of 2 - ) - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - max_batch_size: int = 32 - max_seq_len: int = 2048 - - -@dataclass -class ModelArgs70B(ModelArgs): - dim: int = 8192 - n_layers: int = 80 - n_heads: int = 64 - n_kv_heads: Optional[int] = 8 - vocab_size: int = -1 - multiple_of: int = ( - 4096 # make SwiGLU hidden layer size multiple of large power of 2 - ) - ffn_dim_multiplier: Optional[float] = 1.3 - norm_eps: float = 1e-5 - max_batch_size: int = 32 - max_seq_len: int = 4096 - - -class RMSNorm(ark.Module): - """ - Root mean square layer normalization (RMSNorm). - """ - - def __init__( - self, dim: int, eps: float = 1e-6, dtype: ark.DataType = ark.fp16 - ): - super().__init__() - self.eps = eps - self.dtype = dtype - self.weight = ark.parameter([1, 1, dim], ark.fp32) - - def forward(self, x): - with ark.PlannerContext( - warp_range=[0, 8], - sync=False, - config={ - "NumWarps": 1, - "SramBytes": 0, - "NumTasks": 2048, - "Granularity": 7, - }, - ): - with ark.PlannerContext(config={"Tile": [1, 4096]}): - x = ark.cast(x, ark.fp32) - x2 = ark.mul(x, x) - with ark.PlannerContext(config={"ImplType": "WarpWise"}): - mean = ark.reduce_mean(x2, axis=-1) - with ark.PlannerContext( - config={ - "NumWarps": 1, - "SramBytes": 0, - "Tile": [64, 1], - "NumTasks": 32, - } - ): - rrms = ark.rsqrt(mean) - with ark.PlannerContext( - warp_range=[0, 8], - sync=False, - config={ - "NumWarps": 1, - "SramBytes": 0, - "NumTasks": 2048, - "Tile": [1, 4096], - "Granularity": 7, - }, - ): - x = ark.mul(x, rrms) - x = ark.mul(x, self.weight, x) - return ark.cast(x, self.dtype) - - -class ColumnParallelLinear(ark.Module): - """Linear layer with column parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its second dimension as A = [A_1, ..., A_p]. - Here the weight = A^T, so we need to partition the weight matrix along - its first dimension. 
- - """ - - def __init__( - self, - in_dim: int, - out_dim: int, - dtype: ark.DataType = ark.fp16, - gather_output: bool = True, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - self.in_dim = in_dim - self.out_dim = out_dim - self.dtype = dtype - self.local_rank = local_rank - self.world_size = world_size - self.gather_output = gather_output - - self.weight = ark.parameter([out_dim // world_size, in_dim], dtype) - - def forward(self, x): - if self.world_size == 1 or self.gather_output == False: - return ark.matmul(x, self.weight, transpose_other=True) - # We need to concat the output_tensor_shards along the last dimension - output_tensor = ark.tensor( - [x.shape()[0], x.shape()[1], self.out_dim], self.dtype - ) - output_tensor_shards = ark.sharding( - output_tensor, - axis=2, - dim_per_shard=self.out_dim // self.world_size, - ) - local_result = ark.identity( - output_tensor_shards[self.local_rank], deps=output_tensor_shards - ) - # (batch_size, seq_len, out_dim // world_size) - local_result = ark.matmul( - x, self.weight, local_result, transpose_other=True - ) - gather_input = ark.identity(output_tensor, deps=[local_result]) - # return gather_input - gather_reshape = ark.reshape( - gather_input, [x.shape()[0] * x.shape()[1], self.out_dim] - ) - gather_out = ark.local_all_gather( - gather_reshape, self.local_rank, self.world_size, 1 - ) - return ark.reshape( - gather_out, [x.shape()[0], x.shape()[1], self.out_dim] - ) - - -class RowParallelLinear(ark.Module): - """Linear layer with row parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its first dimension and X along its second dimension as: - - - - | A_1 | - | . | - A = | . | X = [X_1, ..., X_p] - | . | - | A_p | - - - - - Here the weight = A^T, so we need to partition the weight matrix along - its second dimension. 
- """ - - def __init__( - self, - in_dim: int, - out_dim: int, - dtype: ark.DataType = ark.fp16, - input_is_parallel: bool = False, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - self.in_dim = in_dim - self.out_dim = out_dim - self.dtype = dtype - self.local_rank = local_rank - self.world_size = world_size - self.input_is_parallel = input_is_parallel - - self.weight = ark.parameter([out_dim, in_dim // world_size], dtype) - - def forward(self, x): - if self.world_size == 1: - return ark.matmul(x, self.weight, transpose_other=True) - x_ndims = len(x.shape()) - if self.input_is_parallel: - input_parallel = x - else: - x_shards = ark.sharding( - x, x_ndims - 1, self.in_dim // self.world_size - ) - input_parallel = x_shards[self.local_rank] - local_result = ark.matmul( - input_parallel, self.weight, transpose_other=True - ) - reduced_result = ark.local_all_reduce( - local_result, self.local_rank, self.world_size - ) - return reduced_result - - -class ParallelEmbedding(ark.Module): - """Embedding layer.""" - - def __init__( - self, - vocab_size: int, - dim: int, - dtype: ark.DataType, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - self.vocab_size = vocab_size - self.dim = dim - self.weight = ark.parameter([vocab_size, dim // world_size], dtype) - self.out_dim = dim - self.dtype = dtype - self.world_size = world_size - self.local_rank = local_rank - - def forward(self, x): - if self.world_size == 1: - return ark.embedding(x, self.weight) - - output_tensor = ark.tensor( - [x.shape()[0], x.shape()[1], self.out_dim], self.dtype - ) - output_tensor_shards = ark.sharding( - output_tensor, axis=2, dim_per_shard=self.out_dim // self.world_size - ) - local_result = ark.identity( - output_tensor_shards[self.local_rank], deps=output_tensor_shards - ) - local_result = ark.embedding(x, self.weight, local_result) - gather_input = ark.identity(output_tensor, deps=[local_result]) - gather_reshape = ark.reshape( - gather_input, [x.shape()[0] * x.shape()[1], self.out_dim] - ) - gather_out = ark.local_all_gather( - gather_reshape, self.local_rank, self.world_size, 1 - ) - return ark.reshape( - gather_out, [x.shape()[0], x.shape()[1], self.out_dim] - ) - - -class Linear(ark.Module): - """ - Linear layer module with weights and no bias. 
- """ - - def __init__( - self, in_dim: int, out_dim: int, dtype: ark.DataType = ark.fp16 - ): - super().__init__() - self.dtype = dtype - self.weight = ark.parameter([out_dim, in_dim], dtype) - - def forward(self, x): - return ark.matmul(x, self.weight, transpose_other=True) - - -class Silu(ark.Module): - """ - Silu activation function, silu(x) = x * sigmoid(x) - """ - - def __init__(self): - super().__init__() - - def forward(self, x: ark.Tensor): - # We need to specify output tensor so that the sigmoid op will not be an in-place operator - output = ark.tensor(x.shape(), x.dtype()) - x1 = ark.sigmoid(x, output) - return ark.mul(x, x1) - - -class FeedForward(ark.Module): - def __init__( - self, - dim: int, - hidden_dim: int, - multiple_of: int, - ffn_dim_multiplier: Optional[float], - dtype: ark.DataType = ark.fp16, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - hidden_dim = int(2 * hidden_dim / 3) - # custom dim factor multiplier - if ffn_dim_multiplier is not None: - hidden_dim = int(ffn_dim_multiplier * hidden_dim) - hidden_dim = multiple_of * ( - (hidden_dim + multiple_of - 1) // multiple_of - ) - - self.w1 = ColumnParallelLinear( - dim, hidden_dim, dtype, False, local_rank, world_size - ) - self.w2 = RowParallelLinear( - hidden_dim, dim, dtype, True, local_rank, world_size - ) - self.w3 = ColumnParallelLinear( - dim, hidden_dim, dtype, False, local_rank, world_size - ) - - def forward(self, x): - # self.w2(F.silu(self.w1(x)) * self.w3(x)) - with ark.PlannerContext( - warp_range=[0, 8], - sram_range=[0, 49344], - sync=False, - config={ - "NumWarps": 4, - "NumTasks": 688, - }, - ): - with ark.PlannerContext( - config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} - ): - x1 = self.w1(x) - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 128]} - ): - x1 = Silu()(x1) - with ark.PlannerContext( - warp_range=[0, 8], - sram_range=[0, 49344], - sync=False, - config={ - "NumWarps": 4, - "NumTasks": 688, - }, - ): - with ark.PlannerContext( - config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} - ): - x2 = self.w3(x) - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 128]} - ): - x3 = ark.mul(x1, x2) - x4 = self.w2(x3) - return x4 - - -def apply_rotary_emb(xq, xk, freqs_cis): - """ - Apply rotary embeddings to xq and xk. 
- """ - xq_out = ark.rope(xq, freqs_cis) - xk_out = ark.rope(xk, freqs_cis) - return xq_out, xk_out - - -class Softmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - with ark.PlannerContext( - warp_range=[0, 8], - sram_range=[0, 0], - sync=False, - config={ - "NumWarps": 1, - "SramBytes": 0, - "NumTasks": 65536, - }, - ): - with ark.PlannerContext(config={"ImplType": "WarpWise"}): - max = ark.reduce_max(input, axis=-1) - with ark.PlannerContext(config={"Tile": [1, 2048]}): - output = ark.sub(input, max) - output = ark.exp(output) - with ark.PlannerContext(config={"ImplType": "WarpWise"}): - sum = ark.reduce_sum(output, axis=-1) - with ark.PlannerContext(config={"Tile": [1, 2048]}): - output = ark.div(output, sum) - return output - - -class Attention(ark.Module): - def __init__( - self, - args: ModelArgs, - dtype: ark.DataType = ark.fp16, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - self.n_kv_heads = ( - args.n_heads if args.n_kv_heads is None else args.n_kv_heads - ) - model_parallel_size = world_size - self.dtype = dtype - self.n_local_heads = args.n_heads // model_parallel_size - self.n_local_kv_heads = self.n_kv_heads // model_parallel_size - self.n_rep = self.n_local_heads // self.n_local_kv_heads - self.head_dim = args.dim // args.n_heads - self.wq = ColumnParallelLinear( - args.dim, - args.n_heads * self.head_dim, - dtype, - False, - local_rank, - world_size, - ) - self.wk = ColumnParallelLinear( - args.dim, - self.n_kv_heads * self.head_dim, - dtype, - False, - local_rank, - world_size, - ) - self.wv = ColumnParallelLinear( - args.dim, - self.n_kv_heads * self.head_dim, - dtype, - False, - local_rank, - world_size, - ) - self.wo = RowParallelLinear( - args.n_heads * self.head_dim, - args.dim, - dtype, - True, - local_rank, - world_size, - ) - - def forward( - self, - x: ark.Tensor, - start_pos: int, - freqs_cis: ark.Tensor, - mask: Optional[ark.Tensor], - ): - bsz, seqlen, _ = x.shape() - - with ark.PlannerContext( - warp_range=[0, 4], - sram_range=[0, 24672], - sync=False, - config={"NumWarps": 4, "NumTasks": 256}, - ): - with ark.PlannerContext( - config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} - ): - xq = self.wq(x) - xq = ark.reshape( - xq, [bsz, seqlen, self.n_local_heads, self.head_dim] - ) - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 1, 128]} - ): - if freqs_cis is not None: - xq = ark.rope(xq, freqs_cis) - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 128]} - ): - xq = ark.transpose(xq, [0, 2, 1, 3]) - - with ark.PlannerContext( - warp_range=[0, 4], - sram_range=[0, 24672], - sync=False, - config={"NumWarps": 4, "NumTasks": 256}, - ): - with ark.PlannerContext( - config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} - ): - xk = self.wk(x) - xk = ark.reshape( - xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - ) - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 1, 128]} - ): - if freqs_cis is not None: - xk = ark.rope(xk, freqs_cis) - keys = xk - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 128]} - ): - keys = ark.transpose(keys, [0, 2, 1, 3]) - - with ark.PlannerContext( - warp_range=[0, 4], - sram_range=[0, 24672], - sync=False, - config={ - "NumWarps": 4, - "NumTasks": 256, - "SramBytes": 24672, - "TileShapeMNK": [256, 128, 32], - }, - ): - with ark.PlannerContext( - config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} - ): - xv = self.wv(x) - xv = ark.reshape( - xv, [bsz, seqlen, 
self.n_local_kv_heads, self.head_dim] - ) - values = xv - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 1, 128]} - ): - values = ark.transpose(values, [0, 2, 1, 3]) - - with ark.PlannerContext( - warp_range=[0, 8], - sram_range=[0, 49344], - sync=False, - config={ - "NumWarps": 4, - "NumTasks": 4096, - "Granularity": 2, - }, - ): - with ark.PlannerContext( - config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} - ): - scores = ark.matmul(xq, keys, transpose_other=True) - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 128]} - ): - scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) - - if mask is not None: - scores = ark.add(scores, mask) - - scores = Softmax()(scores) - - with ark.PlannerContext( - warp_range=[0, 4], - sram_range=[0, 24672], - sync=False, - config={ - "NumWarps": 4, - "NumTasks": 256, - }, - ): - with ark.PlannerContext( - config={"SramBytes": 24672, "TileShapeMNK": [256, 128, 32]} - ): - output = ark.matmul(scores, values) - with ark.PlannerContext( - config={"SramBytes": 0, "Tile": [256, 1, 128]} - ): - output = ark.transpose(output, [0, 2, 1, 3]) - output = ark.reshape( - output, [bsz, seqlen, self.head_dim * self.n_local_heads] - ) - return self.wo(output) - - -class TransformerBlock(ark.Module): - def __init__( - self, - layer_id: int, - args: ModelArgs, - dtype: ark.DataType = ark.fp16, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - self.n_heads = args.n_heads - self.dim = args.dim - self.head_dim = args.dim // args.n_heads - self.attention = Attention(args, dtype, local_rank, world_size) - self.feed_forward = FeedForward( - dim=args.dim, - hidden_dim=4 * args.dim, - multiple_of=args.multiple_of, - ffn_dim_multiplier=args.ffn_dim_multiplier, - dtype=dtype, - local_rank=local_rank, - world_size=world_size, - ) - self.layer_id = layer_id - self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype) - self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps, dtype=dtype) - - def forward( - self, - x: ark.Tensor, - start_pos: int, - freqs_cis: ark.Tensor, - mask: Optional[ark.Tensor], - ): - attention_norm_x = self.attention_norm(x) - h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask) - with ark.PlannerContext( - warp_range=[0, 4], - config={ - "NumWarps": 4, - "Tile": [256, 128], - "NumTasks": 256, - "SramBytes": 0, - }, - ): - h = ark.add(x, h) - ff = self.feed_forward(self.ffn_norm(h)) - with ark.PlannerContext( - warp_range=[0, 4], - config={ - "NumWarps": 4, - "Tile": [256, 128], - "NumTasks": 256, - "SramBytes": 0, - }, - ): - out = ark.add(h, ff) - return out - - -class Transformer(ark.Module): - def __init__( - self, - params: ModelArgs, - dtype: ark.DataType = ark.fp16, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - self.params = params - self.vocab_size = params.vocab_size - self.n_layers = params.n_layers - - self.tok_embeddings = ParallelEmbedding( - params.vocab_size, params.dim, dtype, local_rank, world_size - ) - - self.layers = [] - for layer_id in range(self.n_layers): - self.layers.append( - TransformerBlock( - layer_id, params, dtype, local_rank, world_size - ) - ) - self.register_module(f"layers.{layer_id}", self.layers[layer_id]) - self.norm = RMSNorm(params.dim, eps=params.norm_eps, dtype=dtype) - self.output = ColumnParallelLinear( - params.dim, params.vocab_size, dtype, True, local_rank, world_size - ) - - def forward( - self, - tokens: ark.Tensor, - start_pos: int, - freqs_cis: ark.Tensor, - mask: 
Optional[ark.Tensor], - ): - h = self.tok_embeddings(tokens) - - for layer in self.layers: - h = layer(h, start_pos, freqs_cis, mask) - h = self.norm(h) - output = self.output(h) - return output diff --git a/examples/llama/plan_llama2_7b_b1_s2048.json b/examples/llama/plan_llama2_7b_b1_s2048.json deleted file mode 100644 index b0bc757dc..000000000 --- a/examples/llama/plan_llama2_7b_b1_s2048.json +++ /dev/null @@ -1,1206 +0,0 @@ -{ - "Rank": 0, - "WorldSize": 1, - "Architecture": "ROCM_942", - "NumProcessors": 304, - "NumWarpsPerProcessor": 8, - "TaskInfos": [ - { - "Id": 0, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Cast", - "Name": "cast", - "IsVirtual": false, - "ReadTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":11,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,4096], - "NumTasks": 2048 - } - }, - { - "Type": "Mul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":13,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,4096], - "NumTasks": 2048 - } - }, - { - "Type": "ReduceMean", - "Name": "reduce_mean", - "IsVirtual": false, - "ReadTensors": [ - {"Id":14,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":12,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":15,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":2}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 2048 - } - } - ] - }, - { - "Id": 3, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rsqrt", - "Name": "rsqrt", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":16,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":13,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":17,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [64,1], - "NumTasks": 32 - } - } - ] - }, - { - "Id": 4, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":12,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":11,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":18,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":14,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":19,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,4096], - "NumTasks": 2048 - } - }, - { - "Type": "Mul", - "Name": "mul_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":7,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":20,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,4096], - "NumTasks": 2048 - } - }, - { - "Type": "Cast", - "Name": "cast_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":21,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":15,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":22,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,4096], - "NumTasks": 2048 - } - } - ] - }, - { - "Id": 7, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":0,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":24,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":25,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - }, - { - "Type": "Rope", - "Name": "rope", - "IsVirtual": false, - "ReadTensors": [ - {"Id":30,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":17,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":33,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":34,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":20,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,1,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 10, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":1,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":26,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":27,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - }, - { - "Type": "Rope", - "Name": "rope_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":31,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":18,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":10,"DataType":"FP16","Shape":[1,2048,1,128],"Strides":[1,2048,1,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,1,128],"Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":35,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":36,"DataType":"FP16","Shape":[1,2048,32,128],"Strides":[1,2048,32,128],"Offsets":[0,0,0,0],"PaddedShape":[1,2048,32,128],"Buffer":{"Id":21,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 13, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":23,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":16,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":2,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":28,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":29,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":19,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 15, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":38,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":22,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":42,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":24,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":43,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 4096 - } - }, - { - "Type": "ScalarMul", - "Name": "mul_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":44,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":25,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":45,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Factor": {"FLOAT":0.0883883461356163} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 4096 - } - } - ] - }, - { - "Id": 17, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMax", - "Name": "reduce_max", - "IsVirtual": false, - "ReadTensors": [ - {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":47,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - }, - { - "Type": "Sub", - "Name": "sub", - "IsVirtual": false, - "ReadTensors": [ - {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":48,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":27,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":46,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,2048], - "NumTasks": 65536 - } - }, - { - "Type": "Exp", - "Name": "exp", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":49,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,2048], - "NumTasks": 65536 - } - }, - { - "Type": "ReduceSum", - "Name": "reduce_sum", - "IsVirtual": false, - "ReadTensors": [ - {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":51,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":3}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 65536 - } - }, - { - "Type": "Div", - "Name": "div", - "IsVirtual": false, - "ReadTensors": [ - {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":52,"DataType":"FP16","Shape":[1,32,2048,1],"Strides":[1,32,2048,1],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,1],"Buffer":{"Id":28,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":50,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,2048], - "NumTasks": 65536 - } - } - ] - }, - { - "Id": 22, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_4", - "IsVirtual": false, - "ReadTensors": [ - {"Id":53,"DataType":"FP16","Shape":[1,32,2048,2048],"Strides":[1,32,2048,2048],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,2048],"Buffer":{"Id":26,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":40,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":23,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - 
{"Id":54,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":55,"DataType":"FP16","Shape":[1,32,2048,128],"Strides":[1,32,2048,128],"Offsets":[0,0,0,0],"PaddedShape":[1,32,2048,128],"Buffer":{"Id":29,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":false} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 24, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_5", - "IsVirtual": false, - "ReadTensors": [ - {"Id":58,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":30,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":3,"DataType":"FP16","Shape":[4096,4096],"Strides":[4096,4096],"Offsets":[0,0],"PaddedShape":[4096,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":59,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - }, - { - "Type": "Add", - "Name": "add", - "IsVirtual": false, - "ReadTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":60,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":31,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":61,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - }, - { - "Type": "Cast", - "Name": "cast_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":63,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - }, - { - "Type": "Mul", - "Name": "mul_4", - "IsVirtual": false, - "ReadTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":65,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - }, - { - "Id": 28, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "ReduceMean", - "Name": "reduce_mean_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":66,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":34,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":67,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "Axis": {"INT":2}, - "KeepDim": {"BOOL":true} - }, - "Config": { - "NumWarps": 1, - "ImplType": "WarpWise", - "SramBytes": 0, - "NumTasks": 2048 - } - } - ] - }, - { - "Id": 29, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Rsqrt", - "Name": "rsqrt_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":68,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":35,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":69,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [64,1], - "NumTasks": 32 - } - } - ] - }, - { - "Id": 30, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_5", - "IsVirtual": false, - "ReadTensors": [ - {"Id":64,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":33,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, 
- {"Id":70,"DataType":"FP32","Shape":[1,2048,1],"Strides":[1,2048,1],"Offsets":[0,0,0],"PaddedShape":[1,2048,1],"Buffer":{"Id":36,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":71,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,4096], - "NumTasks": 2048 - } - }, - { - "Type": "Mul", - "Name": "mul_6", - "IsVirtual": false, - "ReadTensors": [ - {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":8,"DataType":"FP32","Shape":[1,1,4096],"Strides":[1,1,4096],"Offsets":[0,0,0],"PaddedShape":[1,1,4096],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":72,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,4096], - "NumTasks": 2048 - } - }, - { - "Type": "Cast", - "Name": "cast_3", - "IsVirtual": false, - "ReadTensors": [ - {"Id":73,"DataType":"FP32","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":37,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":74,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,4096], - "NumTasks": 2048 - } - } - ] - }, - { - "Id": 33, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_6", - "IsVirtual": false, - "ReadTensors": [ - {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":4,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":76,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - 
{"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 688 - } - }, - { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":78,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 688 - } - }, - { - "Type": "Mul", - "Name": "mul_7", - "IsVirtual": false, - "ReadTensors": [ - {"Id":77,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":39,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":79,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":40,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":80,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 688 - } - } - ] - }, - { - "Id": 36, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_7", - "IsVirtual": false, - "ReadTensors": [ - {"Id":75,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":82,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 602 - } - }, - { - "Type": "Mul", - "Name": 
"mul_8", - "IsVirtual": false, - "ReadTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":83,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":84,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 602 - } - } - ] - }, - { - "Id": 37, - "NumWarps": 4, - "SramBytes": 16480, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_7", - "IsVirtual": false, - "ReadTensors": [ - {"Id":102,"DataType":"FP16","Shape":[1,1792,4096],"Strides":[1,2048,4096],"Offsets":[0,256,0],"PaddedShape":[1,1792,4096],"Buffer":{"Id":38,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":6,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":101,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":100,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 16480, - "TileShapeMNK": [128,128,32], - "NumTasks": 172 - } - }, - { - "Type": "Mul", - "Name": "mul_8", - "IsVirtual": false, - "ReadTensors": [ - {"Id":81,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":41,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":83,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":42,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":84,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":85,"DataType":"FP16","Shape":[1,1792,11008],"Strides":[1,2048,11008],"Offsets":[0,256,0],"PaddedShape":[1,1792,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [128,128], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 38, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_8", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":85,"DataType":"FP16","Shape":[1,2048,11008],"Strides":[1,2048,11008],"Offsets":[0,0,0],"PaddedShape":[1,2048,11008],"Buffer":{"Id":43,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":5,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":86,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [256,128,32], - "NumTasks": 256 - } - }, - { - "Type": "Add", - "Name": "add_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":62,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":32,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":87,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":44,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":88,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":89,"DataType":"FP16","Shape":[1,2048,4096],"Strides":[1,2048,4096],"Offsets":[0,0,0],"PaddedShape":[1,2048,4096],"Buffer":{"Id":45,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 4, - "SramBytes": 0, - "Tile": [256,128], - "NumTasks": 256 - } - } - ] - } - ], - "ProcessorGroups": [ - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,2048],"Granularity":7} - ] - } - ] - }, - { - "ProcessorRange": [0,32], - "ResourceGroups": [ - { - "ProcessorRange": [0,32], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,32],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,2048],"Granularity":7} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":7,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":10,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":13,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,8], - 
"SramRange": [0,49344], - "TaskGroups": [ - {"TaskId":15,"TaskRange":[0,4096],"Granularity":2} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":17,"TaskRange":[0,65536],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":22,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":24,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":28,"TaskRange":[0,2048],"Granularity":7} - ] - } - ] - }, - { - "ProcessorRange": [0,32], - "ResourceGroups": [ - { - "ProcessorRange": [0,32], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":29,"TaskRange":[0,32],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":30,"TaskRange":[0,2048],"Granularity":7} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,8], - "SramRange": [0,49344], - "TaskGroups": [ - {"TaskId":33,"TaskRange":[0,688],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,8], - "SramRange": [0,49344], - "TaskGroups": [ - {"TaskId":36,"TaskRange":[0,602],"Granularity":2}, - {"TaskId":37,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,256], - "ResourceGroups": [ - { - "ProcessorRange": [0,256], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":38,"TaskRange":[0,256],"Granularity":1} - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/examples/tutorial/model_test_tutorial.py b/examples/tutorial/model_test_tutorial.py deleted file mode 100644 index c83d0d15e..000000000 --- a/examples/tutorial/model_test_tutorial.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import ark -import torch -import torch.optim as optim - - -# Set random seed for reproducibility. -torch.manual_seed(42) - - -# Let's first define a linear layer using ARK. -class ARKLinear(ark.Module): - def __init__(self, weight): - super().__init__() - self.weight = weight - - def forward(self, input): - self.saved_input = input - output = ark.matmul(input, self.weight, transpose_other=True) - return output - - def backward(self, grad_output): - grad_weight = ark.matmul( - grad_output, self.saved_input, transpose_input=True - ) - grad_input = ark.matmul(grad_output, self.weight, transpose_other=False) - self.weight.update_gradient(grad_weight) - return grad_input, grad_weight - - -# Let's use our previous module to define a double linear layer. 
-class MyARKModule(ark.Module): - def __init__(self, weight0, weight1): - super().__init__() - self.linear1 = ARKLinear(weight0) - self.linear2 = ARKLinear(weight1) - - def forward(self, x): - x = self.linear1.forward(x) - x = self.linear2.forward(x) - return x - - def backward(self, grad_output): - grad_x, grad_weight2 = self.linear2.backward(grad_output) - grad_x, grad_weight1 = self.linear1.backward(grad_x) - return grad_x, grad_weight1, grad_weight2 - - -# Define a PyTorch model. -class SimpleModel(torch.nn.Module): - def __init__(self): - super().__init__() - self.layers = torch.nn.Sequential( - torch.nn.Linear(256, 256, bias=False), # Layer 0 - torch.nn.Linear(256, 256, bias=False), # Layer 1 - torch.nn.Linear(256, 256, bias=False), # Layer 2 - torch.nn.Linear(256, 256, bias=False), # Layer 3 - torch.nn.Linear(256, 256, bias=False), # Layer 4 - torch.nn.ReLU(), # Activation - ) - - def forward(self, x): - return self.layers(x) - - -# Function to compare the gradients of two models of the same architecture and parameter order. -def compare_grad(ark_model, torch_model, atol=1e-4, rtol=1e-2): - ark_params = list(ark_model.named_parameters()) - torch_params = list(torch_model.named_parameters()) - for (ark_name, ark_param), (torch_name, torch_param) in zip( - ark_params, torch_params - ): - if (ark_param.grad is None) ^ (torch_param.grad is None): - print("Exactly one of the gradients is None") - else: - grads_equal = torch.allclose( - ark_param.grad, torch_param.grad, atol=atol, rtol=rtol - ) - if not grads_equal: - print( - f"Gradient for {ark_name} when compared to {torch_name} is different:" - ) - print(f"ARK gradient: {ark_param.grad}") - print(f"Torch gradient: {torch_param.grad}") - - -# For our ARK model we will replace the first two layers with ARK layers. -def replace_layers_with_ark(model): - weight_0 = torch.nn.Parameter( - model.layers[0].weight.to("cuda:0").requires_grad_(True) - ) - weight_1 = torch.nn.Parameter( - model.layers[1].weight.to("cuda:0").requires_grad_(True) - ) - ark_module = ark.RuntimeModule(MyARKModule(weight_0, weight_1)) - model.layers[0] = ark_module - del model.layers[1] - - # Since we replaced the PyTorch layer with an ARK layer, we need to register the PyTorch parameters - # our ARK module utilizes with the original PyTorch model so ARK can leverage PyTorch's optimizers. - model.register_parameter("weight_0", weight_0) - model.register_parameter("weight_1", weight_1) - - return model - - -# Instantiate our models. -pytorch_model = SimpleModel() -ark_model = SimpleModel() - - -# Ensure both models have the same weights. -ark_model.load_state_dict(pytorch_model.state_dict()) -ark_model = replace_layers_with_ark(ark_model) - - -# Move both models to GPU. -pytorch_model.to("cuda:0") -ark_model.to("cuda:0") - -# Now let's run the models on some random input. -input_torch = torch.randn(128, 256).to("cuda:0").requires_grad_(True) -input_ark = input_torch.clone().detach().requires_grad_(True) - - -# Define an arbitrary target. -target = torch.randn(128, 256).to("cuda:0") - -loss_fn = torch.nn.MSELoss() -optim_torch = optim.SGD(pytorch_model.parameters(), lr=0.01) -optim_ark = optim.SGD(ark_model.parameters(), lr=0.01) - -num_iters = 5 -for iter in range(num_iters): - print(f"Iteration {iter+1}/{num_iters}") - - optim_torch.zero_grad() - optim_ark.zero_grad() - - pytorch_output = pytorch_model(input_torch) - ark_output = ark_model(input_ark) - - assert torch.allclose(pytorch_output, ark_output, atol=1e-4, rtol=1e-2) - - # Compute losses. 
- torch_loss = loss_fn(pytorch_output, target) - ark_loss = loss_fn(ark_output, target) - - # See how ARK's loss compares to PyTorch's loss. - print(f"\nPyTorch loss: {torch_loss.item()}") - print(f"\nARK loss: {ark_loss.item()}\n") - assert torch.allclose(torch_loss, ark_loss, atol=1e-4, rtol=1e-2) - - # Perform a backward pass. - torch_loss.backward() - ark_loss.backward() - - optim_torch.step() - optim_ark.step() - - # Ensure gradients of both models are updated accordingly. - compare_grad(ark_model, pytorch_model) diff --git a/python/ark/__init__.py b/python/ark/__init__.py index 039d6f6f9..63480262c 100644 --- a/python/ark/__init__.py +++ b/python/ark/__init__.py @@ -30,7 +30,7 @@ def set_world_size(world_size): from .init import init from .tensor import Dims, Tensor, Parameter -from .module import Module, RuntimeModule +from .module import Module from .runtime import * from .serialize import save, load from .data_type import * diff --git a/python/ark/module.py b/python/ark/module.py index bc7d3973d..37326de9b 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -201,16 +201,3 @@ def backward(ctx, *grad_outputs): pytorch_grad = param.staged_tensor.to_torch() param.torch_param.grad = pytorch_grad return (None, *grad_inputs) - - -class RuntimeModule(torch.nn.Module): - """ - Wraps an ARK module to be used as a PyTorch autograd function. - """ - - def __init__(self, ark_module): - super().__init__() - self.ark_module = ark_module - - def forward(self, *args, **kwargs): - return _Function.apply(self.ark_module, *args, **kwargs) diff --git a/python/unittest/test_conversion.py b/python/unittest/test_conversion.py deleted file mode 100644 index 7dbd48151..000000000 --- a/python/unittest/test_conversion.py +++ /dev/null @@ -1,281 +0,0 @@ -import pytest -import numpy as np -import ark -from typing import Callable - -try: - import torch - - _no_torch = False -except ImportError: - _no_torch = True - -# ARK to Torch tests - - -def initialize_tensor(dimensions, dtype): - tensor = ark.tensor(dimensions, dtype) - tensor_host = np.random.rand(*dimensions).astype(dtype.to_numpy()) - return tensor, tensor_host - - -# Test function to validate the integrity of the PyTorch view of the ARK tensor, -# including its data and attributes such as shape and data type. 
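The view tests that follow all rest on one generic property: a view aliases its base buffer instead of copying it. A standalone PyTorch sketch of just that property (no ARK-specific assumptions), before the ARK-specific checks below:

import torch

base = torch.zeros(4, 4)
view = base.view(16)

view[0] = 7.0

# The write through the view is visible in the base tensor because both
# share one storage; no copy was made.
assert base[0, 0].item() == 7.0
assert view.data_ptr() == base.data_ptr()
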
-@pytest.mark.parametrize("num_dims,size", [(1, 5), (1, 1024), (2, 5), (2, 32)]) -@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32]) -def test_values_fixed_dims(num_dims: int, size: int, dtype: ark.DataType): - ark.init() - dimensions = [size] * num_dims - - input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype) - other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype) - output_tensor = ark.add(input_tensor, other_tensor) - - runtime = ark.Runtime() - runtime.launch() - - input_tensor.from_numpy(input_tensor_host) - other_tensor.from_numpy(other_tensor_host) - - input_view = input_tensor.to_torch() - other_view = other_tensor.to_torch() - output_view = output_tensor.to_torch() - - runtime.run() - - input_view_numpy = input_view.cpu().numpy() - other_view_numpy = other_view.cpu().numpy() - output_view_numpy = output_view.cpu().numpy() - - output_tensor_host = output_tensor.to_numpy() - - runtime.stop() - runtime.reset() - - assert np.allclose(input_tensor_host, input_view_numpy) - assert np.allclose(other_tensor_host, other_view_numpy) - assert np.allclose(output_tensor_host, output_view_numpy) - - -# Function to check if there is a difference between two arrays at a specific index -def check_diff(input_tensor_host, input_view_numpy, value, index): - mask = np.ones(input_tensor_host.shape, dtype=bool) - mask[index] = False - if not np.allclose(input_tensor_host[mask], input_view_numpy[mask]): - print("Difference found at index: ", index) - return False - if input_view_numpy[index] != value: - print(input_view_numpy[index], value) - return False - return True - - -# Test function to check if changes to the torch views are reflected in the original tensors -@pytest.mark.parametrize("dtype", [ark.fp16, ark.fp32]) -def test_ark_to_torch_aliasing(dtype: ark.DataType): - ark.init() - dimensions = [4, 4] - input_tensor, input_tensor_host = initialize_tensor(dimensions, dtype) - other_tensor, other_tensor_host = initialize_tensor(dimensions, dtype) - output_tensor = ark.mul(input_tensor, other_tensor) - runtime = ark.Runtime() - runtime.launch() - input_tensor.from_numpy(input_tensor_host) - other_tensor.from_numpy(other_tensor_host) - - input_view = input_tensor.to_torch() - other_view = other_tensor.to_torch() - output_view = output_tensor.to_torch() - # make changes to the views - input_view[1, 1] = 20 - other_view[0, 0] = 30 - runtime.run() - output_view[3, 0] = 40 - - output_tensor_host = output_tensor.to_numpy() - input_view_numpy = input_view.cpu().numpy() - other_view_numpy = other_view.cpu().numpy() - output_view_numpy = output_view.cpu().numpy() - # Check if changes to the views are reflected in the original tensors - print(input_view_numpy) - assert check_diff(input_tensor_host, input_view_numpy, 20, (1, 1)) - assert check_diff(other_tensor_host, other_view_numpy, 30, (0, 0)) - assert check_diff(output_tensor_host, output_view_numpy, 40, (3, 0)) - - runtime.stop() - runtime.reset() - - -def test_conversion_torch(): - if _no_torch: - pytest.skip("PyTorch not available") - - dimensions = [4, 4] - - ark.init() - t = ark.constant(7, dimensions) - - with ark.Runtime() as rt: - rt.launch() - - torch_tensor = t.to_torch() - - assert torch_tensor.shape == (4, 4) - assert torch_tensor.dtype == torch.float32 - assert torch_tensor.device.type == "cuda" - assert torch.all(torch_tensor == 0) - - rt.run() - - torch_tensor = t.to_torch() - assert torch.all(torch_tensor == 7) - - -# Torch to ARK tests - -ArkBinOp = Callable[[ark.Tensor, ark.Tensor], ark.Tensor] 
-TorchBinOp = Callable[[torch.Tensor, torch.Tensor], torch.Tensor] -ArkUnOp = Callable[[ark.Tensor], ark.Tensor] -TorchUnOp = Callable[[torch.Tensor], torch.Tensor] - - -# Verify the accuracy of binary operations involving ARK view tensors -@pytest.mark.parametrize( - "dtype, ark_op, torch_op, tensor_dims", - [(torch.float16, ark.add, torch.add, (2, 3))], -) -def test_bin_op(dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims): - ark.init() - input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - expected_output = torch_op(input_tensor, other_tensor).cpu().numpy() - input_ark_view = ark.placeholder( - shape=tensor_dims, - dtype=ark.DataType.from_torch(dtype), - data=input_tensor.data_ptr(), - ) - other_ark_view = ark.placeholder( - shape=tensor_dims, - dtype=ark.DataType.from_torch(dtype), - data=other_tensor.data_ptr(), - ) - output = ark_op(input_ark_view, other_ark_view) - runtime = ark.Runtime() - runtime.launch() - runtime.run() - output_host = output.to_numpy() - runtime.stop() - runtime.reset() - assert np.allclose(output_host, expected_output) - - -# Verify the accuracy of unary operations involving ARK view tensors -@pytest.mark.parametrize( - "dtype, ark_op, torch_op, tensor_dims", - [(torch.float16, ark.exp, torch.exp, (3, 3))], -) -def test_unary_op(dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims): - ark.init() - input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - expected_output = torch_op(input_tensor).cpu().numpy() - input_ark_view = ark.placeholder( - shape=tensor_dims, - dtype=ark.DataType.from_torch(dtype), - data=input_tensor, - ) - output = ark_op(input_ark_view) - runtime = ark.Runtime() - runtime.launch() - runtime.run() - output_host = output.to_numpy() - runtime.stop() - runtime.reset() - assert np.allclose(output_host, expected_output) - - -# Test function to check if changes in torch tensors are reflected in ARK views -@pytest.mark.parametrize("dtype, tensor_dims", [(torch.float16, (64, 64))]) -def test_torch_to_ark_aliasing(dtype, tensor_dims): - ark.init() - # Initialize a PyTorch tensor - input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - - input_ark_view = ark.placeholder( - tensor_dims, dtype=ark.DataType.from_torch(dtype), data=input_tensor - ) - other_ark_view = ark.placeholder( - tensor_dims, dtype=ark.DataType.from_torch(dtype), data=other_tensor - ) - - output = ark.add(input_ark_view, other_ark_view) - # Perform in place operations - input_tensor += other_tensor - other_tensor += input_tensor - expected_output = (input_tensor + other_tensor).cpu().numpy() - - runtime = ark.Runtime() - runtime.launch() - runtime.run() - output_host = output.to_numpy() - runtime.stop() - runtime.reset() - assert np.allclose(output_host, expected_output) - - -# Staged View Tests - - -@pytest.mark.parametrize( - "dtype, ark_op, torch_op, tensor_dims", - [(torch.float16, ark.add, torch.add, (2, 3))], -) -def test_bin_op_staged( - dtype, ark_op: ArkBinOp, torch_op: TorchBinOp, tensor_dims -): - ark.init() - input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - other_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - expected_output = torch_op(input_tensor, other_tensor).cpu().numpy() - input_ark_view = ark.placeholder( - shape=tensor_dims, dtype=ark.DataType.from_torch(dtype) - ) - other_ark_view = ark.placeholder( - 
shape=tensor_dims, dtype=ark.DataType.from_torch(dtype) - ) - output = ark_op(input_ark_view, other_ark_view) - runtime = ark.Runtime() - tensor_mapping = { - input_ark_view: input_tensor, - other_ark_view: other_tensor, - } - runtime.launch(tensor_mappings=tensor_mapping) - runtime.run() - output_host = output.to_numpy() - runtime.stop() - runtime.reset() - assert np.allclose(output_host, expected_output) - - -@pytest.mark.parametrize( - "dtype, ark_op, torch_op, tensor_dims", - [(torch.float16, ark.exp, torch.exp, (3, 3))], -) -def test_unary_op_staged( - dtype, ark_op: ArkUnOp, torch_op: TorchUnOp, tensor_dims -): - ark.init() - input_tensor = torch.randn(tensor_dims, dtype=dtype, device="cuda:0") - expected_output = torch_op(input_tensor).cpu().numpy() - input_ark_view = ark.placeholder( - shape=tensor_dims, dtype=ark.DataType.from_torch(dtype) - ) - output = ark_op(input_ark_view) - runtime = ark.Runtime() - tensor_mapping = {input_ark_view: input_tensor} - runtime.launch(loop_mode=False) - runtime.run(tensor_mappings=tensor_mapping) - output_host = output.to_numpy() - runtime.stop() - runtime.reset() - assert np.allclose(output_host, expected_output) diff --git a/python/unittest/test_torch_tracer.py b/python/unittest/test_torch_tracer.py deleted file mode 100644 index e8cb009e6..000000000 --- a/python/unittest/test_torch_tracer.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -from unittest_common import ark, pytest_ark - - -@pytest_ark(need_torch=True) -def test_torch_tracer_module(): - import torch - from ark.torch.tracer import tracer - - @tracer - class TestModule(torch.nn.Module): - def __init__(self): - super().__init__() - self.param = torch.nn.Parameter(torch.randn(1024, 1024)) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - return x + self.param - - model = TestModule().to("cuda:0") - x = torch.randn(1024, 1024).to("cuda:0") - y = model(x) - y2 = model.forward_ark(x) - assert torch.allclose(y, y2) diff --git a/python/unittest/unittest_common.py b/python/unittest/unittest_common.py deleted file mode 100644 index 0c385e89a..000000000 --- a/python/unittest/unittest_common.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import pytest -import ark - - -def pytest_ark(need_torch: bool = False): - """ - Decorator for ARK unit tests. 
- """ - - def decorator(test_func): - if need_torch: - try: - import torch - except ImportError: - return pytest.mark.skip(reason="torch is not installed")( - test_func - ) - - def wrapper(*args, **kwargs): - ark.init() - test_func(*args, **kwargs) - - return wrapper - - return decorator From 2d0b9f093dd1e42c2c15c3a17472a8cdb2466043 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 5 Sep 2024 07:45:24 +0000 Subject: [PATCH 079/106] revert --- examples/llama/model_test.py | 111 +++++----- python/ark/data_type.py | 2 +- python/ark/module.py | 126 +----------- python/ark/torch/tracer.py | 380 ----------------------------------- 4 files changed, 57 insertions(+), 562 deletions(-) delete mode 100644 python/ark/torch/tracer.py diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index f559a826b..737d3ec8b 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -58,37 +58,30 @@ def run_ark( ] output = module(*module_inputs) - with ark.Runtime() as rt: - plan = ark.DefaultPlanner().plan() - with open("plan.json", "w") as f: - f.write(str(plan)) - rt.launch(plan=plan) + runtime = ark.Runtime() + # Prefer num_warps_per_sm = 16 for nvidia and 8 for amd + runtime.launch(num_warps_per_sm=8) - # Load model parameters - if state_dict: - print("Loading state_dict") - module.load_state_dict(state_dict) - print("Loading state_dict done") - - # Load input data into tensors - tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)] - tensor_data = [i for i in inputs if isinstance(i, np.ndarray)] - for tensor, ndarray in zip(tensors, tensor_data): - tensor.from_numpy(ndarray) + # Load model parameters + if state_dict: + module.load_state_dict(state_dict) - start_time = time.time() + # Load input data into tensors + tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)] + tensor_data = [i for i in inputs if isinstance(i, np.ndarray)] + for tensor, ndarray in zip(tensors, tensor_data): + tensor.from_numpy(ndarray) - # Run the model - print("Run:", iterations) + start_time = time.time() - rt.run(iter=iterations) - print("Run done") + # Run the model + runtime.run(iter=iterations) - end_time = time.time() + end_time = time.time() - if isinstance(output, list) or isinstance(output, tuple): - outputs = [o.to_numpy() for o in output] - outputs = [output.to_numpy()] + if isinstance(output, list) or isinstance(output, tuple): + outputs = [o.to_numpy() for o in output] + outputs = [output.to_numpy()] return RunResults(outputs=outputs, runtime=end_time - start_time) @@ -167,9 +160,7 @@ def test_module( else: prefix = module_name_prefix + "." 
if module_name_prefix else "" # Load the state_dict from the given path - print("Loading ckpt:", ckpt_path) state_dict_pt = torch.load(ckpt_path) - print("Loading ckpt done") state_dict_pt = { k[len(prefix) :]: v for k, v in state_dict_pt.items() @@ -191,7 +182,6 @@ def test_module( rank=rank, world_size=world_size, ) - print("Run ARK done") if not test_thru_ark_only: # PyTorch module @@ -205,7 +195,6 @@ def test_module( inputs_pt, iterations=test_thru_iterations if test_thru else 1, ) - print("Run PyTorch done") if test_thru: print( @@ -441,43 +430,43 @@ def test_transformer_block( low=-1, high=1, size=(batch_size, seq_len, args.dim) ).astype(dtype) - # module = model_ark.Attention( - # args, ark.DataType.from_numpy(dtype), rank, world_size - # ) + module = model_ark.Attention( + args, ark.DataType.from_numpy(dtype), rank, world_size + ) # module_inputs = [ # ark.tensor(list(i.shape), ark.DataType.from_numpy(i.dtype)) # if isinstance(i, np.ndarray) # else i # for i in inputs # ] - # feature_tensor = ark.tensor( - # list(feature.shape), ark.DataType.from_numpy(feature.dtype) - # ) - # freqs_cis_ark_tensor = ark.tensor( - # list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype) - # ) - # output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) - - # print(ark.Model.get_model().serialize()) - - test_module( - module_class_ark=model_ark.TransformerBlock, - module_args_ark=[ - 0, - args, - ark.DataType.from_numpy(dtype), - rank, - world_size, - ], - inputs_ark=[feature, 0, freqs_cis_ark, None], - module_class_pt=model_pt.TransformerBlock, - module_args_pt=[0, args], - inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], - module_name_prefix="layers.0", - rank=rank, - world_size=world_size, - test_thru=False, + feature_tensor = ark.tensor( + list(feature.shape), ark.DataType.from_numpy(feature.dtype) + ) + freqs_cis_ark_tensor = ark.tensor( + list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype) ) + output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) + + ark.Model.get_model().create_nodes() + print(ark.Model.get_model().serialize()) + + # test_module( + # module_class_ark=model_ark.TransformerBlock, + # module_args_ark=[ + # 0, + # args, + # ark.DataType.from_numpy(dtype), + # rank, + # world_size, + # ], + # inputs_ark=[feature, 0, freqs_cis_ark, None], + # module_class_pt=model_pt.TransformerBlock, + # module_args_pt=[0, args], + # inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], + # module_name_prefix="layers.0", + # rank=rank, + # world_size=world_size, + # ) def test_transformer( @@ -581,7 +570,7 @@ def worker( # Configurations args = ModelArgs7B() batch_size = 1 - seq_len = 2048 + seq_len = 512 dtype = np.float16 world_size = ngpus @@ -589,7 +578,7 @@ def worker( args.vocab_size = 32000 # Reduce max_seq_len due to OOM from the PyTorch model - args.max_seq_len = 2048 + args.max_seq_len = 512 # Verify the configurations assert batch_size <= args.max_batch_size diff --git a/python/ark/data_type.py b/python/ark/data_type.py index bbb8a4534..21e61e0cb 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -86,7 +86,7 @@ def from_torch(torch_type: torch.dtype) -> "DataType": Raises: ValueError: If there is no defined conversion from torch data type to ark data type. 
""" - for type_name, reg in _REGISTRY_DATA_TYPE.items(): + for type_name, reg in REGISTRY_DATA_TYPE.items(): if reg["torch"] == torch_type: return DataType.from_name(type_name) raise ValueError( diff --git a/python/ark/module.py b/python/ark/module.py index 37326de9b..368f36cf7 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -3,13 +3,8 @@ import logging import numpy as np -from typing import Any, Dict, Union +from typing import Any, Dict from .tensor import Parameter -from .torch import torch, _no_torch -from .runtime import Runtime -from .model import Model -from .data_type import DataType -from .ops import placeholder from . import log __all__ = ["Module"] @@ -21,7 +16,6 @@ class Module: """ def __init__(self): - super().__init__() # The submodules of the module. self.sub_modules: dict[str, "Module"] = dict() # The parameters of the module. @@ -31,19 +25,12 @@ def __setattr__(self, __name: str, __value: Any) -> None: """ When setting an attribute, if the attribute is a Module, add it to the sub_modules. If the attribute is a Tensor and this Tensor is a - parameter, add it to the parameters. If the attribute is a - torch.nn.Parameter, convert it to an ARK Parameter before adding. + parameter, add it to the parameters. """ if isinstance(__value, Module): self.register_module(__name, __value) elif isinstance(__value, Parameter): self.register_parameter(__name, __value) - elif not _no_torch and isinstance(__value, torch.nn.Parameter): - shape, dtype = list(__value.shape), DataType.from_torch( - __value.dtype - ) - __value = Parameter(placeholder(shape, dtype, data=__value), True) - self.register_parameter(__name, __value) super().__setattr__(__name, __value) def __call__(self, *args: Any, **kwargs: Any): @@ -73,10 +60,7 @@ def params_dict(self, prefix="") -> Dict[str, Parameter]: return params_dict def load_state_dict( - self, - state_dict: Dict[str, Union[np.ndarray, torch.Tensor]], - prefix: str = "", - stream: int = 0, + self, state_dict: Dict[str, np.ndarray], prefix: str = "" ): """ Loads a model from a state_dict and copy the parameters to the device GPU. @@ -87,117 +71,19 @@ def load_state_dict( all_keys = set(state_dict.keys()) pd = self.params_dict(prefix) for name, param in pd.items(): - data = state_dict.get(name, None) - if data is None: - continue - param.copy(data, stream=stream) + param.from_numpy(state_dict[name]) all_keys.remove(name) if all_keys: log.WARN(f"{len(all_keys)} unused parameter(s) in state_dict") - def state_dict( - self, - prefix: str = "", - mode: str = "numpy", - stream: int = 0, - ) -> Dict[str, Union[np.ndarray, torch.Tensor]]: + def state_dict(self, prefix: str = "") -> Dict[str, np.ndarray]: """ Copies the parameters from the device GPU to the host and saves the model to a state_dict. Must be called after the executor is launched. """ - if mode == "numpy": - return { - k: v.to_numpy(stream=stream) - for k, v in self.params_dict(prefix).items() - } - elif mode == "torch": - return { - k: v.to_torch(stream=stream) - for k, v in self.params_dict(prefix).items() - } - raise ValueError(f"Unsupported mode: {mode}") + return {k: v.to_numpy() for k, v in self.params_dict(prefix).items()} def forward(self, *args: Any, **kwargs: Any) -> Any: ... def backward(self, *args: Any, **kwargs: Any) -> Any: ... 
- - def initialize(self): - for param in self.parameters.values(): - param.initialize() - for module in self.sub_modules.values(): - module.initialize() - - -class _Function(torch.autograd.Function): - """ - Facilitates the integration of ARK modules with PyTorch's - autograd system by defining custom forward and backward passes that - utilize the user's defined ARK module. - """ - - @staticmethod - def forward(ctx, ark_module, *args, **kwargs): - """ - Returns a PyTorch tensor that is the result - of the forward pass of the ARK module. - """ - Model.reset() - ctx.ark_module = ark_module - input_args, input_kwargs = [], {} - input_requires_grad = 0 - for arg in args: - if isinstance(arg, torch.Tensor): - shape, dtype = list(arg.shape), DataType.from_torch(arg.dtype) - input_args.append(placeholder(shape, dtype, data=arg)) - if arg.requires_grad: - input_requires_grad += 1 - else: - input_args.append(arg) - for k, v in kwargs.items(): - if isinstance(v, torch.Tensor): - shape, dtype = list(arg.shape), DataType.from_torch(arg.dtype) - input_kwargs[k] = placeholder(shape, dtype, data=v) - if v.requires_grad: - input_requires_grad += 1 - else: - input_kwargs[k] = v - ctx.num_inp_grad = input_requires_grad - output = ark_module.forward(*input_args, **input_kwargs) - rt = Runtime.get_runtime() - rt.launch() - rt.run() - rt.stop() - output = output.to_torch() - return output - - @staticmethod - def backward(ctx, *grad_outputs): - """ - Converts the gradient outputs to ARK format, computes the gradients for the input - and parameters using the ARK module backwards pass, and updates the gradients of the corresponding - PyTorch parameters. - """ - Model.reset() - # i think we should support placeholder initialization - # with just pytorch tensor - ark_grad_outputs = [] - for grad in grad_outputs: - shape, dtype = list(grad.shape), DataType.from_torch(grad.dtype) - ark_grad_outputs.append(placeholder(shape, dtype, data=grad)) - grads = ctx.ark_module.backward(*ark_grad_outputs) - grad_inputs, grad_weights = ( - grads[: ctx.num_inp_grad], - grads[ctx.num_inp_grad :], - ) - params_dict = ctx.ark_module.params_dict() - rt = Runtime.get_runtime() - rt.launch() - rt.run() - rt.stop() - grad_inputs = [grad.to_torch() for grad in grad_inputs] - for _, param in params_dict.items(): - if param.staged_tensor is not None: - pytorch_grad = param.staged_tensor.to_torch() - param.torch_param.grad = pytorch_grad - return (None, *grad_inputs) diff --git a/python/ark/torch/tracer.py b/python/ark/torch/tracer.py deleted file mode 100644 index eb73d4e48..000000000 --- a/python/ark/torch/tracer.py +++ /dev/null @@ -1,380 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -try: - import torch -except ImportError: - raise ImportError("torch is required to use this module") - -import logging -from typing import List, Dict, Optional, Callable, Any - -from ..planner import Planner, Plan -from ..tensor import Tensor -from ..runtime import Runtime -from ..model import Model -from .. 
import ops - - -__all__ = ["tracer"] - - -def handle_aten_add_scalar( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Element-wise addition with a scalar""" - t = tensors[node.args[0].name] - value = node.args[1] - return ops.add(t, value, name=node.name) - - -def handle_aten_add_tensor( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Element-wise subtraction""" - t1 = tensors[node.args[0].name] - t2 = tensors[node.args[1].name] - return ops.add(t1, t2, name=node.name) - - -def handle_aten_sub_tensor( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Element-wise subtraction""" - t1 = tensors[node.args[0].name] - t2 = tensors[node.args[1].name] - return ops.sub(t1, t2, name=node.name) - - -def handle_aten_mul_tensor( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Element-wise multiplication""" - t1 = tensors[node.args[0].name] - t2 = tensors[node.args[1].name] - return ops.mul(t1, t2, name=node.name) - - -def handle_aten_t( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Transpose""" - t = tensors[node.args[0].name] - perm = list(range(len(t.shape()))) - if len(perm) < 2: - raise ValueError(f"Expected at least 2 dimensions, got {len(perm)}") - perm[-2], perm[-1] = perm[-1], perm[-2] - return ops.transpose(t, perm=perm, name=node.name) - - -def handle_aten_mm( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Matrix multiplication""" - input = tensors[node.args[0].name] - weight = tensors[node.args[1].name] - return ops.matmul(input, weight, name=node.name) - - -def handle_aten_addmm( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Matrix multiplication followed by addition""" - bias = tensors[node.args[0].name] - input = tensors[node.args[1].name] - weight = tensors[node.args[2].name] - t = ops.matmul(input, weight) - t = ops.add(t, bias, name=node.name) - return t - - -def handle_aten_silu( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Sigmoid Linear Unit""" - t = tensors[node.args[0].name] - return ops.mul(t, ops.sigmoid(t), name=node.name) - - -def handle_aten_sum_dim_intlist( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Sum with specified dimensions""" - if len(node.args[1]) != 1: - raise NotImplementedError("Multiple dimensions are not supported") - t = tensors[node.args[0].name] - axis = node.args[1][0] - keepdims = node.args[2] - return ops.reduce_sum(t, axis=axis, keepdims=keepdims, name=node.name) - - -def handle_aten_view( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Reshape""" - t = tensors[node.args[0].name] - shape = node.args[1] - return ops.reshape(t, shape, name=node.name) - - -def handle_aten_sigmoid( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Sigmoid""" - t = tensors[node.args[0].name] - return ops.sigmoid(t, name=node.name) - - -def handle_aten_empty_like( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Create an empty tensor with the same shape""" - t = tensors[node.args[0].name] - new_t = ops.tensor(t.shape(), dtype=t.dtype()) - new_t = ops.identity(new_t, deps=[t], name=node.name) - return new_t - - -def handle_aten_fill_scalar( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Fill a tensor with a scalar value""" - t = tensors[node.args[0].name] - value = node.args[1] - return ops.copy(value, t, name=node.name) - - -def 
handle_aten_mse_loss( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Mean Squared Error loss""" - input = tensors[node.args[0].name] - target = tensors[node.args[1].name] - t = ops.sub(input, target) - t = ops.mul(t, t) - t = ops.reshape(t, [-1]) - t = ops.reduce_mean(t, axis=0) - return t - - -def handle_aten_mse_loss_backward( - node: torch.fx.node.Node, tensors: Dict[str, Tensor] -) -> Tensor: - """Backward pass for Mean Squared Error loss""" - grad_output = tensors[node.args[0].name] - input = tensors[node.args[1].name] - target = tensors[node.args[2].name] - grad_input = ops.sub(input, target) - grad_input = ops.mul(grad_input, grad_output) - grad_input = ops.mul(grad_input, 2.0 / grad_input.shape()[0]) - return grad_input - - -_REGISTRY_FUNCTION_HANDLER: Dict[str, Callable] = { - "aten::add.Scalar": handle_aten_add_scalar, - "aten::add.Tensor": handle_aten_add_tensor, - "aten::sub.Tensor": handle_aten_sub_tensor, - "aten::mul.Tensor": handle_aten_mul_tensor, - "aten::t": handle_aten_t, - "aten::mm": handle_aten_mm, - "aten::addmm": handle_aten_addmm, - "aten::silu": handle_aten_silu, - "aten::sum.dim_IntList": handle_aten_sum_dim_intlist, - "aten::view": handle_aten_view, - "aten::sigmoid": handle_aten_sigmoid, - "aten::empty_like": handle_aten_empty_like, - "aten::fill.Scalar": handle_aten_fill_scalar, - "aten::mse_loss": handle_aten_mse_loss, - "aten::mse_loss_backward": handle_aten_mse_loss_backward, -} - - -class Tracer: - def __init__(self): - self.tensors: Dict[str, Tensor] = {} - self.params: Optional[List[torch.nn.Parameter]] = None - self.params_idx: int = 0 - self.inputs_fw: List[Tensor] = [] - self.inputs_bw: List[Tensor] = [] - self.outputs_fw: List[Tensor] = [] - self.outputs_bw: List[Tensor] = [] - self.plan_fw: Optional[Plan] = None - self.plan_bw: Optional[Plan] = None - self.device: Optional[torch.device] = None - self.failed: bool = False - self.launched_fw: bool = False - self.launched_bw: bool = False - - def __call__(self, target: Callable) -> Callable: - is_module = issubclass(target, torch.nn.Module) - is_function = callable(target) and not isinstance(target, type) - if not is_module and not is_function: - raise ValueError("Tracer can only be applied to a subclass of `torch.nn.Module` or a function") - if is_function: - return torch._dynamo.optimize(self.autograd_trace_)(target) - - target.forward_torch = target.forward - - def forward_wrapper(instance: torch.nn.Module, *args, **kwargs) -> Any: - if self.plan_fw is None: - return instance.forward_torch(*args, **kwargs) - rt = Runtime.get_runtime() - if not self.launched_fw: - rt.launch( - plan=self.plan_fw, - device_id=self.device.index, - loop_mode=False, - ) - self.launched_fw = True - self.launched_bw = False - - ph_map = {ph: data for ph, data in zip(self.inputs_fw, args)} - rt.run(tensor_mappings=ph_map) - # TODO: how to get the output tensor(s)? 
- return self.outputs_fw[0] - - def backward_wrapper(instance: torch.nn.Module, *args, **kwargs): - if self.plan_bw is None: - return instance.forward_torch(*args, **kwargs) - rt = Runtime.get_runtime() - if not self.launched_bw: - rt.launch( - plan=self.plan_bw, - device_id=self.device.index, - loop_mode=False, - ) - self.launched_bw = True - self.launched_fw = False - - ph_map = {ph: data for ph, data in zip(self.inputs_bw, args)} - rt.run(tensor_mappings=ph_map) - for i, param in enumerate(self.params): - param.grad = self.outputs_bw[i].to_torch() - - def call_wrapper(instance: torch.nn.Module, *args, **kwargs) -> Any: - if self.params is None: - params = [] - for _, param in instance.named_parameters(remove_duplicate=False): - params.append(param) - for _, param in instance.named_buffers(remove_duplicate=False): - params.append(param) - self.params = params - - @torch._dynamo.optimize(self.autograd_trace_) - def call(*args, **kwargs): - return instance.forward_torch(*args, **kwargs) - - return call(*args, **kwargs) - - target.forward_ark = forward_wrapper - target.backward_ark = backward_wrapper - target.__call__ = call_wrapper - return target - - def autograd_trace_( - self, gm: torch.nn.Module, _: List[torch.Tensor] - ) -> Callable: - def fw_compiler(gm: torch.fx.GraphModule, _): - logging.info("==== FW Starts ====") - return self.autograd_trace_impl_(gm, _, True) - - def bw_compiler(gm: torch.fx.GraphModule, _): - logging.info("==== BW Starts ====") - return self.autograd_trace_impl_(gm, _, False) - - return torch._dynamo.backends.common.aot_autograd( - fw_compiler=fw_compiler, bw_compiler=bw_compiler - )(gm, _) - - def autograd_trace_impl_( - self, gm: torch.fx.GraphModule, _: List[torch.Tensor], is_fw: bool - ) -> Callable: - - def run(args) -> Any: - Model.reset() - if not self.failed: - for node in gm.graph.nodes: - logging.info(node.format_node(), node.meta) - if not self.handle_node_(node, is_fw): - print(f"Failed to handle node {node.format_node()}") - self.failed = True - break - if not self.failed: - Model.set_device_id(self.device.index) - if is_fw: - self.plan_fw = Planner(self.device.index).plan() - else: - self.plan_bw = Planner(self.device.index).plan() - return torch.fx.Interpreter(gm).boxed_run(args) - - run._boxed_call = True - return run - - def handle_node_(self, node: torch.fx.node.Node, is_fw: bool) -> bool: - if node.op == "placeholder": - t = self.tensors.get(node.name, None) - if t is not None: - return True - meta = node.meta["tensor_meta"] - if len(self.params) > self.params_idx: - # placeholder for parameter - param = self.params[self.params_idx] - self.params_idx += 1 - if param.dtype != meta.dtype: - raise ValueError( - f"Expected dtype {meta.dtype}, got {param.dtype}" - ) - if self.device is None: - if param.device.type != "cuda": - raise ValueError( - f"Expected device cuda, got {param.device.type}" - ) - self.device = param.device - elif self.device != param.device: - raise ValueError( - "All parameters must be on the same device. 
" - f"Expected {self.device}, got {param.device}" - ) - data = param.data_ptr() - else: - # no more parameter -- remainings are inputs - data = 0 - t = ops.placeholder( - shape=meta.shape, - dtype=ops.DataType.from_torch(meta.dtype), - name=node.name, - data=data, - ) - self.tensors[node.name] = t - if data == 0: - if is_fw: - self.inputs_fw.append(t) - else: - self.inputs_bw.append(t) - elif node.op == "output": - outputs_list = self.outputs_fw if is_fw else self.outputs_bw - if outputs_list: - raise ValueError("Multiple output nodes are unexpected") - for out in node.args[0]: - if isinstance(out, torch.fx.node.Node): - if out.name not in self.tensors: - raise ValueError(f"Output tensor {out.name} not found") - outputs_list.append(self.tensors[out.name]) - else: - outputs_list.append(out) - elif node.op == "call_function": - target_name = node.target.name() - if target_name not in _REGISTRY_FUNCTION_HANDLER: - logging.warning( - f"Unsupported function {target_name}. Usage: {node.format_node()}" - ) - return False - t = _REGISTRY_FUNCTION_HANDLER[target_name](node, self.tensors) - self.tensors[node.name] = t - else: - raise ValueError(f"Unexpected node {node.format_node()}") - return True - - -def tracer(target: Callable): - return Tracer()(target) From ec4e08163adf79cb0013563baca25ce7d6f4a8c8 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 5 Sep 2024 07:45:51 +0000 Subject: [PATCH 080/106] lint --- python/unittest/test_runtime.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index 4b40f0645..829514aa6 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -64,4 +64,3 @@ def test_runtime_reuse_plans(): output_tensor_host, input_tensor_host + other_tensor_host ) runtime.reset() - From 55e73f23529962f9587e247346f3b4b8dc8fc066 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 5 Sep 2024 08:29:40 +0000 Subject: [PATCH 081/106] Improve coverage --- python/ark/ops.py | 115 +++-------------- python/unittest/test.py | 1 + python/unittest/test_ops.py | 218 ++++++++++++++++++++++++++++++++ python/unittest/test_runtime.py | 8 ++ 4 files changed, 244 insertions(+), 98 deletions(-) create mode 100644 python/unittest/test_ops.py diff --git a/python/ark/ops.py b/python/ark/ops.py index 514adcb3c..8ccf65a36 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -36,6 +36,7 @@ "all_reduce", "embedding", "cast", + "copy", "constant", "ones", "zeros", @@ -99,10 +100,6 @@ def add( name: str = "add", ) -> Union[Tensor, float]: """ - Performs an element-wise addition operator between the `input` - tensor and the `other` tensor. 
- Usage: - tensor_add = ark.add(tensor1, tensor2) """ if isinstance(input, Tensor) and isinstance(other, Tensor): a = input._tensor @@ -130,7 +127,8 @@ def cast( output: Tensor = NullTensor, name: str = "cast", ) -> Tensor: - """Type casting.""" + """ + """ if output is not NullTensor: output = output._tensor return Tensor( @@ -144,7 +142,8 @@ def constant( dtype: DataType = fp32, name: str = "constant", ) -> Tensor: - """Constant.""" + """ + """ return Tensor( Model.get_model().constant(value, Dims(shape), dtype.ctype(), name) ) @@ -153,12 +152,13 @@ def constant( def copy( input: Union[Tensor, float], output: Tensor = NullTensor, name: str = "copy" ) -> Tensor: - """Data caopy.""" + """ + """ if output is not NullTensor: output = output._tensor if isinstance(input, Tensor): - intput = intput._tensor - return Tensor(Model.get_model().copy(intput, output, name)) + input = input._tensor + return Tensor(Model.get_model().copy(input, output, name)) def div( @@ -168,10 +168,6 @@ def div( name: str = "div", ) -> Tensor: """ - Performs an element-wise division operator between the - `input` tensor and the `other` tensor. - Usage: - tensor_mul = ark.div(tensor1, tensor2) """ if output is not NullTensor: output = output._tensor @@ -186,7 +182,8 @@ def embedding( output: Tensor = NullTensor, name: str = "embedding", ) -> Tensor: - """Embedding layer.""" + """ + """ if output is not NullTensor: output = output._tensor return Tensor( @@ -198,9 +195,6 @@ def exp( input: Tensor, output: Tensor = NullTensor, name: str = "exp" ) -> Tensor: """ - Calculates the exponential of the `input` tensor, element-wise. - Usage: - tensor_exp = ark.exp(tensor) """ if output is not NullTensor: output = output._tensor @@ -211,12 +205,6 @@ def gelu( input: Tensor, output: Tensor = NullTensor, name: str = "gelu" ) -> Tensor: """ - Applies the Gaussian Error Linear Unit (GELU) activation - function to the `input` tensor, element-wise. GELU is a smooth - approximation of the rectifier function and is widely used in - deep learning models. - Usage: - tensor_gelu = ark.gelu(tensor) """ if output is not NullTensor: output = output._tensor @@ -227,9 +215,6 @@ def identity( input: Tensor, deps: List[Tensor] = [], name: str = "identity" ) -> Tensor: """ - Returns an identical tensor of `input` with execution dependencies `deps`. - Usage: - tensor_identity = ark.identity(tensor, deps=[tensor1, tensor2]) """ dep_tensors = [] for dep in deps: @@ -248,13 +233,6 @@ def matmul( name: str = "matmul", ) -> Tensor: """ - Performs matrix multiplication between the `input` tensor and - `other` tensor, storing the result in `output`. Optional - parameters allow controlling the behavior of the multiplication, - such as transposing the input tensors and applying a ReLU - activation. - Usage: - tensor_matmul = ark.matmul(tensor1, tensor2) """ if output is not NullTensor: output = output._tensor @@ -277,10 +255,6 @@ def mul( name: str = "mul", ) -> Tensor: """ - Performs an element-wise multiplication operator between the - `input` tensor and the `other` tensor. - Usage: - tensor_mul = ark.mul(tensor1, tensor2) """ if output is not NullTensor: output = output._tensor @@ -291,7 +265,6 @@ def mul( def noop(input: Tensor, name: str = "noop"): """ - No operation. Returns nothing. """ Model.get_model().noop(input._tensor, name) @@ -304,10 +277,6 @@ def reduce_max( name: str = "reduce_max", ) -> Tensor: """ - Performs reduction along the `axis` of the `input` tensor and - stores the result in `output`. 
- Usage: - tensor_reduce_max = ark.reduce_max(tensor, axis=1) """ if output is not NullTensor: output = output._tensor @@ -326,10 +295,6 @@ def reduce_mean( name: str = "reduce_mean", ) -> Tensor: """ - Performs reduction along the `axis` of the `input` tensor and - stores the result in `output`. - Usage: - tensor_reduce_mean = ark.reduce_mean(tensor, axis=1) """ if output is not NullTensor: output = output._tensor @@ -348,12 +313,6 @@ def reduce_sum( name: str = "reduce_sum", ) -> Tensor: """ - Performs reduction along the `axis` of the `input` tensor and - stores the result in `output`. - Usage: - # tensors shape is [64, 128] - tensor_reduce_sum = ark.reduce_sum(tensor, axis=1) - # tensor_reduce_sum is a tensor with shape [64, 1] """ if output is not NullTensor: output = output._tensor @@ -368,10 +327,6 @@ def relu( input: Tensor, output: Tensor = NullTensor, name: str = "relu" ) -> Tensor: """ - Applies the ReLU activation function to the `input` tensor, - element-wise. - Usage: - tensor_relu = ark.relu(tensor) """ if output is not NullTensor: output = output._tensor @@ -418,9 +373,6 @@ def rope( name: str = "rope", ) -> Tensor: """ - Calculates the square root of the `input` tensor, element-wise. - Usage: - tensor_rsqrt = ark.rsqrt(tensor) """ if output is not NullTensor: output = output._tensor @@ -433,9 +385,6 @@ def rsqrt( input: Tensor, output: Tensor = NullTensor, name: str = "rsqrt" ) -> Tensor: """ - Calculates the square root of the `input` tensor, element-wise. - Usage: - tensor_rsqrt = ark.rsqrt(tensor) """ if output is not NullTensor: output = output._tensor @@ -446,13 +395,6 @@ def sharding( input: Tensor, axis: int, dim_per_shard: int, name: str = "sharding" ) -> List[Tensor]: """ - Shard `input` along `axis` into `dim_per_shard`-dimensional shards. - Usage: - # tensors shape is [64, 128] - tensor_sharding = ark.sharding(tensor, axis=1, dim_per_shard=64) - # tensor_sharding is a list of 2 tensors, each of which has shape [64, 64] - # The first tensor's buffer is the same as the first 64 columns of tensor - # The second tensor's buffer is the same as the last 64 columns of tensor """ _tensor_list = Model.get_model().sharding( input._tensor, axis, dim_per_shard, name @@ -464,10 +406,6 @@ def sigmoid( input: Tensor, output: Tensor = NullTensor, name: str = "sigmoid" ) -> Tensor: """ - Applies the Sigmoid activation function to the `input` tensor, - element-wise. - Usage: - tensor_sigmoid = ark.sigmoid(tensor) """ if output is not NullTensor: output = output._tensor @@ -478,9 +416,6 @@ def sqrt( input: Tensor, output: Tensor = NullTensor, name: str = "sqrt" ) -> Tensor: """ - Calculates the square root of the `input` tensor, element-wise. - Usage: - tensor_sqrt = ark.sqrt(tensor) """ if output is not NullTensor: output = output._tensor @@ -494,10 +429,6 @@ def sub( name: str = "sub", ) -> Tensor: """ - Performs an element-wise addition operator between the `input` - tensor and the `other` tensor. - Usage: - tensor_add = ark.sub(tensor1, tensor2) """ if output is not NullTensor: output = output._tensor @@ -516,10 +447,6 @@ def tensor( name: str = "", ) -> Tensor: """ - Construct a tensor with given shape and data type. - Usage: - tensor = ark.tensor([1, 2, 3, 4], dtype=ark.fp32) - tensor = ark.tensor([1, 2], dtype=ark.fp16) """ return Tensor( _tensor(shape, dtype, strides, offsets, padded_shape, rank, name) @@ -533,13 +460,6 @@ def transpose( name: str = "transpose", ) -> Tensor: """ - Transposes the `input` tensor according to the given `perm` permutation. 
- For example, transpose(input, [0, 1 ,3, 2]) will swap the last two - dimensions of the input tensor. Currently, only 4D tensors are supported. - Usage: - # tensors shape is [1, 64, 128, 32] - tensor_transpose = ark.transpose(tensor, perm=[0, 1, 3, 2]) - # tensor_transpose is a tensor with shape [1, 64, 32, 128] """ if output is not NullTensor: output = output._tensor @@ -565,14 +485,16 @@ def mean( output: Tensor = NullTensor, name: str = "mean", ) -> Tensor: - """Alias of reduce_mean.""" + """ + """ return reduce_mean(input, axis, keepdims, output, name) def ones( shape: Iterable[int], dtype: DataType = fp32, name: str = "ones" ) -> Tensor: - """Ones.""" + """ + """ return Tensor( Model.get_model().constant(1, Dims(shape), dtype.ctype(), name) ) @@ -587,7 +509,6 @@ def parameter( name: str = "", ) -> Parameter: """ - Construct a parameter with given shape and data type. """ return Parameter( _tensor(shape, dtype, strides, offsets, padded_shape, name) @@ -598,9 +519,6 @@ def softmax( input: Tensor, output: Tensor = NullTensor, name: str = "softmax" ) -> Tensor: """ - Applies softmax to the `input` tensor on the last dimension. - Usage: - tensor_softmax = ark.softmax(tensor) """ max = reduce_max(input, axis=-1) output = sub(input, max, output=output) @@ -626,7 +544,8 @@ def layernorm( def zeros( shape: Iterable[int], dtype: DataType = fp32, name: str = "zeros" ) -> Tensor: - """Zeros.""" + """ + """ return Tensor( Model.get_model().constant(0, Dims(shape), dtype.ctype(), name) ) diff --git a/python/unittest/test.py b/python/unittest/test.py index fe2114c71..693adb2d1 100644 --- a/python/unittest/test.py +++ b/python/unittest/test.py @@ -4,4 +4,5 @@ from test_data_type import * from test_error import * from test_model import * +from test_ops import * from test_runtime import * diff --git a/python/unittest/test_ops.py b/python/unittest/test_ops.py new file mode 100644 index 000000000..88ea3be6e --- /dev/null +++ b/python/unittest/test_ops.py @@ -0,0 +1,218 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +from common import ark, pytest_ark + + +@pytest_ark() +def test_ops_add(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.tensor([64, 64], ark.fp16) + c = ark.add(a, b) + assert c.shape() == [64, 64] + + d = ark.add(a, 1.0) + assert d.shape() == [64, 64] + + e = ark.add(1.0, a) + assert e.shape() == [64, 64] + + f = ark.add(1.0, 1.0) + assert f == 2.0 + + +@pytest_ark() +def test_ops_cast(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.cast(a, ark.fp32) + assert b.shape() == [64, 64] + assert b.dtype() == ark.fp32 + + +@pytest_ark() +def test_ops_constant(): + a = ark.constant(1.0, [64, 64]) + assert a.shape() == [64, 64] + + +@pytest_ark() +def test_ops_copy(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.copy(a) + assert b.shape() == [64, 64] + + +@pytest_ark() +def test_ops_div(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.tensor([64, 64], ark.fp16) + c = ark.div(a, b) + assert c.shape() == [64, 64] + + d = ark.div(a, 1.0) + assert d.shape() == [64, 64] + + +@pytest_ark() +def test_ops_embedding(): + a = ark.tensor([64, 64], ark.int32) + b = ark.tensor([100, 4096], ark.fp16) + c = ark.embedding(a, b) + assert c.shape() == [1, 64, 64, 4096] + + +@pytest_ark() +def test_ops_exp(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.exp(a) + assert b.shape() == [64, 64] + + +@pytest_ark() +def test_ops_gelu(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.gelu(a) + assert b.shape() == [64, 64] + + +@pytest_ark() +def test_ops_identity(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.identity(a) + assert b.shape() == [64, 64] + + +@pytest_ark() +def test_ops_matmul(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.tensor([64, 64], ark.fp16) + c = ark.matmul(a, b) + assert c.shape() == [64, 64] + + +@pytest_ark() +def test_ops_mul(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.tensor([64, 64], ark.fp16) + c = ark.mul(a, b) + assert c.shape() == [64, 64] + + d = ark.mul(a, 1.0) + assert d.shape() == [64, 64] + + +@pytest_ark() +def test_ops_reduce_max(): + a = ark.tensor([64, 32], ark.fp16) + b = ark.reduce_max(a, axis=0, keepdims=True) + assert b.shape() == [1, 32] + + c = ark.reduce_max(a, axis=1, keepdims=True) + assert c.shape() == [64, 1] + + d = ark.reduce_max(a, axis=0, keepdims=False) + assert d.shape() == [32] + + e = ark.reduce_max(a, axis=1, keepdims=False) + assert e.shape() == [64] + + +@pytest_ark() +def test_ops_reduce_mean(): + a = ark.tensor([64, 32], ark.fp16) + b = ark.reduce_mean(a, axis=0, keepdims=True) + assert b.shape() == [1, 32] + + c = ark.reduce_mean(a, axis=1, keepdims=True) + assert c.shape() == [64, 1] + + d = ark.reduce_mean(a, axis=0, keepdims=False) + assert d.shape() == [32] + + e = ark.reduce_mean(a, axis=1, keepdims=False) + assert e.shape() == [64] + + +@pytest_ark() +def test_ops_reduce_sum(): + a = ark.tensor([64, 32], ark.fp16) + b = ark.reduce_sum(a, axis=0, keepdims=True) + assert b.shape() == [1, 32] + + c = ark.reduce_sum(a, axis=1, keepdims=True) + assert c.shape() == [64, 1] + + d = ark.reduce_sum(a, axis=0, keepdims=False) + assert d.shape() == [32] + + e = ark.reduce_sum(a, axis=1, keepdims=False) + assert e.shape() == [64] + + +@pytest_ark() +def test_ops_relu(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.relu(a) + assert b.shape() == [64, 64] + + +@pytest_ark() +def test_ops_reshape(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.reshape(a, [64, 64, 1]) + assert b.shape() == [64, 64, 1] + + +@pytest_ark() +def test_ops_rope(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.tensor([64, 64], ark.fp16) + c = ark.rope(a, b) + 
assert c.shape() == [64, 64] + + +@pytest_ark() +def test_ops_rsqrt(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.rsqrt(a) + assert b.shape() == [64, 64] + + +@pytest_ark() +def test_ops_sharding(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.sharding(a, axis=0, dim_per_shard=2) + assert len(b) == 32 + assert b[0].shape() == [2, 64] + + +@pytest_ark() +def test_ops_sigmoid(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.sigmoid(a) + assert b.shape() == [64, 64] + + +@pytest_ark() +def test_ops_sqrt(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.sqrt(a) + assert b.shape() == [64, 64] + + +@pytest_ark() +def test_ops_sub(): + a = ark.tensor([64, 64], ark.fp16) + b = ark.tensor([64, 64], ark.fp16) + c = ark.sub(a, b) + assert c.shape() == [64, 64] + + d = ark.sub(a, 1.0) + assert d.shape() == [64, 64] + + +@pytest_ark() +def test_ops_transpose(): + a = ark.tensor([64, 32], ark.fp16) + b = ark.transpose(a, [1, 0]) + assert b.shape() == [32, 64] diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index 10a72c082..269253e13 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -2,3 +2,11 @@ # Licensed under the MIT license. from common import ark, pytest_ark + + +@pytest_ark() +def test_runtime_empty(): + with ark.Runtime.get_runtime() as rt: + rt.launch() + rt.run() + rt.stop() From 95f29b8b0c0ac76ff49192922418b4298c8d778b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 5 Sep 2024 08:32:46 +0000 Subject: [PATCH 082/106] lint --- python/ark/ops.py | 87 ++++++++++++++++------------------------------- 1 file changed, 29 insertions(+), 58 deletions(-) diff --git a/python/ark/ops.py b/python/ark/ops.py index 8ccf65a36..fa7879e07 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -99,8 +99,7 @@ def add( output: Tensor = NullTensor, name: str = "add", ) -> Union[Tensor, float]: - """ - """ + """ """ if isinstance(input, Tensor) and isinstance(other, Tensor): a = input._tensor b = other._tensor @@ -127,8 +126,7 @@ def cast( output: Tensor = NullTensor, name: str = "cast", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor( @@ -142,8 +140,7 @@ def constant( dtype: DataType = fp32, name: str = "constant", ) -> Tensor: - """ - """ + """ """ return Tensor( Model.get_model().constant(value, Dims(shape), dtype.ctype(), name) ) @@ -152,8 +149,7 @@ def constant( def copy( input: Union[Tensor, float], output: Tensor = NullTensor, name: str = "copy" ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor if isinstance(input, Tensor): @@ -167,8 +163,7 @@ def div( output: Tensor = NullTensor, name: str = "div", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): @@ -182,8 +177,7 @@ def embedding( output: Tensor = NullTensor, name: str = "embedding", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor( @@ -194,8 +188,7 @@ def embedding( def exp( input: Tensor, output: Tensor = NullTensor, name: str = "exp" ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().exp(input._tensor, output, name)) @@ -204,8 +197,7 @@ def exp( def gelu( input: Tensor, output: Tensor = NullTensor, name: str = "gelu" ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().gelu(input._tensor, output, name)) @@ -214,8 +206,7 @@ def 
gelu( def identity( input: Tensor, deps: List[Tensor] = [], name: str = "identity" ) -> Tensor: - """ - """ + """ """ dep_tensors = [] for dep in deps: if not isinstance(dep, Tensor): @@ -232,8 +223,7 @@ def matmul( transpose_other: bool = False, name: str = "matmul", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor( @@ -254,8 +244,7 @@ def mul( output: Tensor = NullTensor, name: str = "mul", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): @@ -264,8 +253,7 @@ def mul( def noop(input: Tensor, name: str = "noop"): - """ - """ + """ """ Model.get_model().noop(input._tensor, name) @@ -276,8 +264,7 @@ def reduce_max( output: Tensor = NullTensor, name: str = "reduce_max", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor( @@ -294,8 +281,7 @@ def reduce_mean( output: Tensor = NullTensor, name: str = "reduce_mean", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor( @@ -312,8 +298,7 @@ def reduce_sum( output: Tensor = NullTensor, name: str = "reduce_sum", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor( @@ -326,8 +311,7 @@ def reduce_sum( def relu( input: Tensor, output: Tensor = NullTensor, name: str = "relu" ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().relu(input._tensor, output, name)) @@ -372,8 +356,7 @@ def rope( output: Tensor = NullTensor, name: str = "rope", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor( @@ -384,8 +367,7 @@ def rope( def rsqrt( input: Tensor, output: Tensor = NullTensor, name: str = "rsqrt" ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().rsqrt(input._tensor, output, name)) @@ -394,8 +376,7 @@ def rsqrt( def sharding( input: Tensor, axis: int, dim_per_shard: int, name: str = "sharding" ) -> List[Tensor]: - """ - """ + """ """ _tensor_list = Model.get_model().sharding( input._tensor, axis, dim_per_shard, name ) @@ -405,8 +386,7 @@ def sharding( def sigmoid( input: Tensor, output: Tensor = NullTensor, name: str = "sigmoid" ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().sigmoid(input._tensor, output, name)) @@ -415,8 +395,7 @@ def sigmoid( def sqrt( input: Tensor, output: Tensor = NullTensor, name: str = "sqrt" ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor return Tensor(Model.get_model().sqrt(input._tensor, output, name)) @@ -428,8 +407,7 @@ def sub( output: Tensor = NullTensor, name: str = "sub", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor if isinstance(other, Tensor): @@ -446,8 +424,7 @@ def tensor( rank: int = -1, name: str = "", ) -> Tensor: - """ - """ + """ """ return Tensor( _tensor(shape, dtype, strides, offsets, padded_shape, rank, name) ) @@ -459,8 +436,7 @@ def transpose( output: Tensor = NullTensor, name: str = "transpose", ) -> Tensor: - """ - """ + """ """ if output is not NullTensor: output = output._tensor if not is_list_or_tuple(perm): @@ -485,16 +461,14 @@ def mean( output: Tensor = NullTensor, name: str = "mean", ) -> Tensor: - """ - """ + """ """ return reduce_mean(input, axis, keepdims, output, name) def ones( shape: 
Iterable[int], dtype: DataType = fp32, name: str = "ones" ) -> Tensor: - """ - """ + """ """ return Tensor( Model.get_model().constant(1, Dims(shape), dtype.ctype(), name) ) @@ -508,8 +482,7 @@ def parameter( padded_shape: Iterable[int] = [], name: str = "", ) -> Parameter: - """ - """ + """ """ return Parameter( _tensor(shape, dtype, strides, offsets, padded_shape, name) ) @@ -518,8 +491,7 @@ def parameter( def softmax( input: Tensor, output: Tensor = NullTensor, name: str = "softmax" ) -> Tensor: - """ - """ + """ """ max = reduce_max(input, axis=-1) output = sub(input, max, output=output) output = exp(output, output=output) @@ -544,8 +516,7 @@ def layernorm( def zeros( shape: Iterable[int], dtype: DataType = fp32, name: str = "zeros" ) -> Tensor: - """ - """ + """ """ return Tensor( Model.get_model().constant(0, Dims(shape), dtype.ctype(), name) ) From 006af245c45b43bc8b6c17afa8d328c204d49d14 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 6 Sep 2024 05:22:54 +0000 Subject: [PATCH 083/106] updates --- .vscode/c_cpp_properties.json | 1 + ark/api/executor.cpp | 16 +- ark/api/executor_test.cpp | 29 ++ examples/tutorial/default_plan.json | 271 --------------- examples/tutorial/model.json | 140 -------- examples/tutorial/plan.json | 358 -------------------- examples/tutorial/plan_1_larger_tile.json | 271 --------------- examples/tutorial/plan_2_split_k.json | 358 -------------------- examples/tutorial/plan_3_overwrite.json | 358 -------------------- examples/tutorial/plan_tutorial.py | 395 ---------------------- examples/tutorial/planner_tutorial.py | 6 +- examples/tutorial/quickstart_tutorial.py | 6 - python/ark/executor.py | 26 ++ python/ark/init.py | 6 +- python/ark/model.py | 7 +- python/ark/module.py | 1 - python/ark/runtime.py | 112 +++--- python/ark/serialize.py | 8 +- python/ark/tensor.py | 141 ++++---- python/unittest/test_runtime.py | 5 +- python/unittest/test_tensor.py | 2 +- 21 files changed, 200 insertions(+), 2317 deletions(-) delete mode 100644 examples/tutorial/default_plan.json delete mode 100644 examples/tutorial/model.json delete mode 100644 examples/tutorial/plan.json delete mode 100644 examples/tutorial/plan_1_larger_tile.json delete mode 100644 examples/tutorial/plan_2_split_k.json delete mode 100644 examples/tutorial/plan_3_overwrite.json delete mode 100644 examples/tutorial/plan_tutorial.py create mode 100644 python/ark/executor.py diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json index b4d0e7494..ac86a796e 100644 --- a/.vscode/c_cpp_properties.json +++ b/.vscode/c_cpp_properties.json @@ -4,6 +4,7 @@ "name": "Linux", "includePath": [ "${workspaceFolder}/**", + "${workspaceFolder}/third_party/mscclpp/include", "/usr/local/cuda/include", "/opt/rocm/include" ], diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp index d7e0b1a43..af1789dc1 100644 --- a/ark/api/executor.cpp +++ b/ark/api/executor.cpp @@ -858,7 +858,7 @@ class Executor::Impl { private: std::shared_ptr get_buffer_info( - const Tensor &tensor) const; + const Tensor &tensor, bool fail_on_null) const; std::map> plan_resources_; std::shared_ptr foreground_plan_resource_; @@ -1039,11 +1039,11 @@ void Executor::Impl::barrier() { } std::shared_ptr Executor::Impl::get_buffer_info( - const Tensor &tensor) const { + const Tensor &tensor, bool fail_on_null) const { size_t buffer_id = tensor.ref()->buffer()->id(); auto &buf_reg = BufferRegistry::get_instance(); auto info = buf_reg.get(buffer_id); - if (!info || !(info->data)) { + if (fail_on_null && (!info || !(info->data))) { 
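+        // Reaching here with fail_on_null set means the caller requires an
+        // allocated buffer; tensor_address passes false instead and maps the
+        // missing buffer to a nullptr result.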
        ERR(InvalidUsageError,
            "Tensor has no allocated memory. "
            "This is likely caused by accessing a tensor that is optimized "
@@ -1054,12 +1054,16 @@ std::shared_ptr Executor::Impl::get_buffer_info(
 }
 
 void *Executor::Impl::tensor_address(const Tensor &tensor) const {
-    return get_buffer_info(tensor)->data;
+    auto info = get_buffer_info(tensor, false);
+    if (!info || !(info->data)) {
+        return nullptr;
+    }
+    return info->data;
 }
 
 void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
                                  Stream stream, bool is_d2d) const {
-    auto info = get_buffer_info(tensor);
+    auto info = get_buffer_info(tensor, true);
     size_t device_id = info->device_id;
     GLOG(gpuSetDevice(device_id));
     std::shared_ptr copy_stream;
@@ -1112,7 +1116,7 @@ void Executor::Impl::tensor_read(const Tensor &tensor, void *data, size_t bytes,
 void Executor::Impl::tensor_write(const Tensor &tensor, const void *data,
                                   size_t bytes, Stream stream,
                                   bool is_d2d) const {
-    auto info = get_buffer_info(tensor);
+    auto info = get_buffer_info(tensor, true);
     size_t device_id = info->device_id;
     GLOG(gpuSetDevice(device_id));
     std::shared_ptr copy_stream;
diff --git a/ark/api/executor_test.cpp b/ark/api/executor_test.cpp
index cf3495780..22c7d7c47 100644
--- a/ark/api/executor_test.cpp
+++ b/ark/api/executor_test.cpp
@@ -3,6 +3,7 @@
 
 #include "ark/executor.hpp"
 
+#include "ark/planner.hpp"
 #include "gpu/gpu.hpp"
 #include "model/model_json.hpp"
 #include "unittest/unittest_utils.h"
@@ -54,6 +55,34 @@ ark::unittest::State test_executor() {
         // Stop & destroy automatically.
     }
 
+    // Raw executor test
+    ark::Model m;
+    auto tensor = m.tensor({1024}, ark::FP32);
+    m.noop(tensor);
+
+    ark::Planner planner(m, 0);
+    auto plan = planner.plan();
+    {
+        std::vector<float> array(1024);
+
+        ark::Executor exe;
+        UNITTEST_EQ(exe.tensor_address(tensor), nullptr);
+        UNITTEST_THROW(
+            exe.tensor_read(tensor, array.data(), array.size() * sizeof(float)),
+            ark::InvalidUsageError);
+        UNITTEST_THROW(exe.tensor_write(tensor, array.data(),
+                                        array.size() * sizeof(float)),
+                       ark::InvalidUsageError);
+        UNITTEST_THROW(exe.launch(), ark::InvalidUsageError);
+
+        exe.compile(plan, 0);
+        UNITTEST_NE(exe.tensor_address(tensor), nullptr);
+
+        exe.launch();
+        exe.run(1);
+        exe.wait();
+    }
+
     UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);
     return ark::unittest::SUCCESS;
 }
diff --git a/examples/tutorial/default_plan.json b/examples/tutorial/default_plan.json
deleted file mode 100644
index bb774a5b8..000000000
--- a/examples/tutorial/default_plan.json
+++ /dev/null
@@ -1,271 +0,0 @@
-{
-    "Rank": 0,
-    "WorldSize": 1,
-    "Architecture": "ROCM_942",
-    "NumProcessors": 304,
-    "NumWarpsPerProcessor": 4,
-    "TaskInfos": [
-        {
-            "Id": 0,
-            "NumWarps": 4,
-            "SramBytes": 24672,
-            "Ops": [
-                {
-                    "Type": "Matmul",
-                    "Name": "matmul",
-                    "IsVirtual": false,
-                    "ReadTensors": [
-                        {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}},
-                        {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-                    ],
-                    "WriteTensors": [
-                        {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}
-                    ],
-                    "ResultTensors": [
{"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 1, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 88064 - } - } - ] - }, - { - "Id": 2, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 88064 - } - } - ] - }, - { - "Id": 3, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - 
"TileShapeMNK": [128,256,32], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 4, - "NumWarps": 1, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": {}, - "Config": { - "NumWarps": 1, - "SramBytes": 0, - "Tile": [1,64], - "NumTasks": 88064 - } - } - ] - }, - { - "Id": 5, - "NumWarps": 4, - "SramBytes": 24672, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}}, - {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "WriteTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "ResultTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 4, - "SramBytes": 24672, - "TileShapeMNK": [128,256,32], - "NumTasks": 64 - } - } - ] - } - ], - "ProcessorGroups": [ - { - "ProcessorRange": [0,172], - "ResourceGroups": [ - { - "ProcessorRange": [0,172], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,88064],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,88064],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,172], - "ResourceGroups": [ - { - "ProcessorRange": [0,172], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,304], - "ResourceGroups": [ - { - "ProcessorRange": [0,304], - "WarpRange": [0,1], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,88064],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,64], - "ResourceGroups": [ - { - 
"ProcessorRange": [0,64], - "WarpRange": [0,4], - "SramRange": [0,24672], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - } - ] -} \ No newline at end of file diff --git a/examples/tutorial/model.json b/examples/tutorial/model.json deleted file mode 100644 index c2b88bbd0..000000000 --- a/examples/tutorial/model.json +++ /dev/null @@ -1,140 +0,0 @@ -{ - "Rank": 0, - "WorldSize": 1, - "Nodes": [ - { - "Id": 0, - "ProducerNodeIds": [], - "ConsumerNodeIds": [1,2], - "Op": { - "Type": "Matmul", - "Name": "matmul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":1,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "WriteTensors": [ - {"Id":4,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "ResultTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - } - } - }, - { - "Id": 1, - "ProducerNodeIds": [0], - "ConsumerNodeIds": [2], - "Op": { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "WriteTensors": [ - {"Id":6,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "ResultTensors": [ - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "Args": {} - } - }, - { - "Id": 2, - "ProducerNodeIds": [0,1], - "ConsumerNodeIds": [4], - "Op": { - "Type": "Mul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":7,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "WriteTensors": [ - {"Id":8,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "ResultTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "Args": {} - } - }, - { - "Id": 3, - "ProducerNodeIds": [], - "ConsumerNodeIds": [4], - "Op": { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - 
{"Id":3,"DataType":"FP16","Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096],"Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "WriteTensors": [ - {"Id":10,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "ResultTensors": [ - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - } - } - }, - { - "Id": 4, - "ProducerNodeIds": [2,3], - "ConsumerNodeIds": [5], - "Op": { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":9,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":11,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "WriteTensors": [ - {"Id":12,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "ResultTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "Args": {} - } - }, - { - "Id": 5, - "ProducerNodeIds": [4], - "ConsumerNodeIds": [], - "Op": { - "Type": "Matmul", - "Name": "matmul_2", - "IsVirtual": false, - "ReadTensors": [ - {"Id":13,"DataType":"FP16","Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008],"Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[]}}, - {"Id":2,"DataType":"FP16","Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008],"Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "WriteTensors": [ - {"Id":14,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "ResultTensors": [ - {"Id":15,"DataType":"FP16","Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096],"Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[]}} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - } - } - } - ] -} diff --git a/examples/tutorial/plan.json b/examples/tutorial/plan.json deleted file mode 100644 index 335c27549..000000000 --- a/examples/tutorial/plan.json +++ /dev/null @@ -1,358 +0,0 @@ -{ - "Rank": 0, - "WorldSize": 1, - "Architecture": "CUDA_80", - "NumProcessors": 108, - "NumWarpsPerProcessor": 8, - "TaskInfos": [ - { - "Id": 0, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} - ], - "WriteTensors": [ - 
{"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 1, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 2, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 3, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} - ], - "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - 
{"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 4, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 5, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} - ], - "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 64 - } - } - ] - }, - { - "Id": 6, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} - ], - "WriteTensors": [ - {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - 
{"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 64 - } - } - ] - }, - { - "Id": 7, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Add", - "Name": "add_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "WriteTensors": [ - {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 64 - } - } - ] - } - ], - "ProcessorGroups": [ - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,64], - "ResourceGroups": [ - { - "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [64,108], - "ResourceGroups": [ - { - "ProcessorRange": [64,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":7,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - } - ] -} diff --git a/examples/tutorial/plan_1_larger_tile.json b/examples/tutorial/plan_1_larger_tile.json deleted file mode 100644 index 04d2e9d60..000000000 --- a/examples/tutorial/plan_1_larger_tile.json +++ /dev/null @@ -1,271 +0,0 @@ -{ - "Rank": 0, - "WorldSize": 
1, - "Architecture": "CUDA_80", - "NumProcessors": 108, - "NumWarpsPerProcessor": 8, - "TaskInfos": [ - { - "Id": 0, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} - ], - "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 1, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 2, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 3, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - 
{"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} - ], - "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 4, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 5, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":2,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,11008],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,11008]} - ], - "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 64 - } - } - ] - } - ], - "ProcessorGroups": [ - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - 
{"TaskId":0,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,64], - "ResourceGroups": [ - { - "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - } - ] -} diff --git a/examples/tutorial/plan_2_split_k.json b/examples/tutorial/plan_2_split_k.json deleted file mode 100644 index 837944171..000000000 --- a/examples/tutorial/plan_2_split_k.json +++ /dev/null @@ -1,358 +0,0 @@ -{ - "Rank": 0, - "WorldSize": 1, - "Architecture": "CUDA_80", - "NumProcessors": 108, - "NumWarpsPerProcessor": 8, - "TaskInfos": [ - { - "Id": 0, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} - ], - "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 1, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - 
"SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 2, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":5,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 3, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} - ], - "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 4, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":6,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 5, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": 
"Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} - ], - "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 64 - } - } - ] - }, - { - "Id": 6, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} - ], - "WriteTensors": [ - {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 64 - } - } - ] - }, - { - "Id": 7, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Add", - "Name": "add_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "WriteTensors": [ - {"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 64 - } - } - ] - } - ], - "ProcessorGroups": [ - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": 
[0,147456], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,64], - "ResourceGroups": [ - { - "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [64,108], - "ResourceGroups": [ - { - "ProcessorRange": [64,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":7,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - } - ] -} diff --git a/examples/tutorial/plan_3_overwrite.json b/examples/tutorial/plan_3_overwrite.json deleted file mode 100644 index 335c27549..000000000 --- a/examples/tutorial/plan_3_overwrite.json +++ /dev/null @@ -1,358 +0,0 @@ -{ - "Rank": 0, - "WorldSize": 1, - "Architecture": "CUDA_80", - "NumProcessors": 108, - "NumWarpsPerProcessor": 8, - "TaskInfos": [ - { - "Id": 0, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":1,"DataType":"FP16","Buffer":{"Id":1,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} - ], - "WriteTensors": [ - {"Id":4,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 1, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Sigmoid", - "Name": "sigmoid", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - 
"WriteTensors": [ - {"Id":6,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 2, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul", - "IsVirtual": false, - "ReadTensors": [ - {"Id":5,"DataType":"FP16","Buffer":{"Id":4,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":7,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - {"Id":8,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 3, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":0,"DataType":"FP16","Buffer":{"Id":0,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":3,"DataType":"FP16","Buffer":{"Id":3,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[11008,4096],"Strides":[11008,4096],"Offsets":[0,0],"PaddedShape":[11008,4096]} - ], - "WriteTensors": [ - {"Id":10,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 4, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Mul", - "Name": "mul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":9,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]}, - {"Id":11,"DataType":"FP16","Buffer":{"Id":7,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "WriteTensors": [ - 
{"Id":12,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "ResultTensors": [ - {"Id":13,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,11008],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,11008]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 172 - } - } - ] - }, - { - "Id": 5, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":16,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,8320],"Strides":[1,512,11008],"Offsets":[0,0,0],"PaddedShape":[1,512,8320]}, - {"Id":17,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,8320],"Strides":[4096,11008],"Offsets":[0,0],"PaddedShape":[4096,8320]} - ], - "WriteTensors": [ - {"Id":14,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 64 - } - } - ] - }, - { - "Id": 6, - "NumWarps": 8, - "SramBytes": 147456, - "Ops": [ - { - "Type": "Matmul", - "Name": "matmul_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":18,"DataType":"FP16","Buffer":{"Id":8,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,2688],"Strides":[1,512,11008],"Offsets":[0,0,8320],"PaddedShape":[1,512,2688]}, - {"Id":19,"DataType":"FP16","Buffer":{"Id":2,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[4096,2688],"Strides":[4096,11008],"Offsets":[0,8320],"PaddedShape":[4096,2688]} - ], - "WriteTensors": [ - {"Id":20,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": { - "TransposeInput": {"BOOL":false}, - "TransposeOther": {"BOOL":true} - }, - "Config": { - "NumWarps": 8, - "SramBytes": 147456, - "TileShapeMNK": [128,256,64], - "NumTasks": 64 - } - } - ] - }, - { - "Id": 7, - "NumWarps": 8, - "SramBytes": 0, - "Ops": [ - { - "Type": "Add", - "Name": "add_1", - "IsVirtual": false, - "ReadTensors": [ - {"Id":22,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]}, - {"Id":21,"DataType":"FP16","Buffer":{"Id":10,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "WriteTensors": [ - 
{"Id":23,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "ResultTensors": [ - {"Id":15,"DataType":"FP16","Buffer":{"Id":9,"Rank":-1,"SendTags":[],"RecvTags":[],"IsExternal":false},"Shape":[1,512,4096],"Strides":[1,512,4096],"Offsets":[0,0,0],"PaddedShape":[1,512,4096]} - ], - "Args": {}, - "Config": { - "NumWarps": 8, - "SramBytes": 0, - "Tile": [128,256], - "NumTasks": 64 - } - } - ] - } - ], - "ProcessorGroups": [ - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":0,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":1,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":2,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":3,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,108], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":4,"TaskRange":[0,172],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,64], - "ResourceGroups": [ - { - "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":5,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [64,108], - "ResourceGroups": [ - { - "ProcessorRange": [64,108], - "WarpRange": [0,8], - "SramRange": [0,147456], - "TaskGroups": [ - {"TaskId":6,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - }, - { - "ProcessorRange": [0,108], - "ResourceGroups": [ - { - "ProcessorRange": [0,64], - "WarpRange": [0,8], - "SramRange": [0,0], - "TaskGroups": [ - {"TaskId":7,"TaskRange":[0,64],"Granularity":1} - ] - } - ] - } - ] -} diff --git a/examples/tutorial/plan_tutorial.py b/examples/tutorial/plan_tutorial.py deleted file mode 100644 index a2c5e3e57..000000000 --- a/examples/tutorial/plan_tutorial.py +++ /dev/null @@ -1,395 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. - -import argparse -import ark -import time -import json -import numpy as np -from dataclasses import dataclass -from typing import Optional -from pathlib import Path - - -@dataclass -class ModelArgs: - dim: int = 4096 - n_layers: int = 32 - n_heads: int = 32 - n_kv_heads: Optional[int] = None - vocab_size: int = -1 # defined later by tokenizer - multiple_of: int = ( - 256 # make SwiGLU hidden layer size multiple of large power of 2 - ) - ffn_dim_multiplier: Optional[float] = None - norm_eps: float = 1e-5 - max_batch_size: int = 32 - max_seq_len: int = 2048 - - -class ColumnParallelLinear(ark.Module): - """Linear layer with column parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its second dimension as A = [A_1, ..., A_p]. - Here the weight = A^T, so we need to partition the weight matrix along - its first dimension. 
- - """ - - def __init__( - self, - in_dim: int, - out_dim: int, - dtype: np.dtype, - gather_output: bool = True, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - self.in_dim = in_dim - self.out_dim = out_dim - self.dtype = dtype - self.local_rank = local_rank - self.world_size = world_size - self.gather_output = gather_output - - self.weight = ark.parameter( - [out_dim // world_size, in_dim], ark.DataType.from_numpy(dtype) - ) - self.data = None - - def forward(self, x): - if self.world_size == 1 or self.gather_output == False: - return ark.matmul(x, self.weight, transpose_other=True) - # We need to concat the output_tensor_shards along the last dimension - output_tensor = ark.tensor( - [x.shape()[0], x.shape()[1], self.out_dim], - ark.DataType.from_numpy(self.dtype), - ) - output_tensor_shards = ark.sharding( - output_tensor, - axis=2, - dim_per_shard=self.out_dim // self.world_size, - ) - local_result = ark.identity( - output_tensor_shards[self.local_rank], deps=output_tensor_shards - ) - # (batch_size, seq_len, out_dim // world_size) - local_result = ark.matmul( - x, self.weight, local_result, transpose_other=True - ) - gather_input = ark.identity(output_tensor, deps=[local_result]) - # return gather_input - gather_reshape = ark.reshape( - gather_input, [x.shape()[0] * x.shape()[1], self.out_dim] - ) - gather_out = ark.local_all_gather( - gather_reshape, self.local_rank, self.world_size, 1 - ) - return ark.reshape( - gather_out, [x.shape()[0], x.shape()[1], self.out_dim] - ) - - def initialize(self): - if self.data is None: - data = np.random.uniform( - low=-0.1, high=0.1, size=self.weight.shape() - ).astype(self.dtype) - self.data = data - self.weight.from_numpy(self.data) - - -class RowParallelLinear(ark.Module): - """Linear layer with row parallelism. - - The linear layer is defined as Y = XA + b. A is parallelized along - its first dimension and X along its second dimension as: - - - - | A_1 | - | . | - A = | . | X = [X_1, ..., X_p] - | . | - | A_p | - - - - - Here the weight = A^T, so we need to partition the weight matrix along - its second dimension. 
- """ - - def __init__( - self, - in_dim: int, - out_dim: int, - dtype: ark.DataType = ark.fp16, - input_is_parallel: bool = False, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - self.in_dim = in_dim - self.out_dim = out_dim - self.dtype = dtype - self.local_rank = local_rank - self.world_size = world_size - self.input_is_parallel = input_is_parallel - - self.weight = ark.parameter( - [out_dim, in_dim // world_size], ark.DataType.from_numpy(self.dtype) - ) - self.data = None - - def forward(self, x): - if self.world_size == 1: - return ark.matmul(x, self.weight, transpose_other=True) - x_ndims = len(x.shape()) - if self.input_is_parallel: - input_parallel = x - else: - x_shards = ark.sharding( - x, x_ndims - 1, self.in_dim // self.world_size - ) - input_parallel = x_shards[self.local_rank] - local_result = ark.matmul( - input_parallel, self.weight, transpose_other=True - ) - reduced_result = ark.local_all_reduce( - local_result, self.local_rank, self.world_size - ) - return reduced_result - - def initialize(self): - if self.data is None: - data = np.random.uniform( - low=-0.1, high=0.1, size=self.weight.shape() - ).astype(self.dtype) - self.data = data - self.weight.from_numpy(self.data) - - -class Silu(ark.Module): - """ - Silu activation function, silu(x) = x * sigmoid(x) - """ - - def __init__(self): - super().__init__() - - def forward(self, x: ark.Tensor): - x1 = ark.sigmoid(x) - return ark.mul(x, x1) - - -class FeedForward(ark.Module): - def __init__( - self, - dim: int, - hidden_dim: int, - multiple_of: int, - ffn_dim_multiplier: Optional[float], - dtype: np.dtype, - local_rank: int = 0, - world_size: int = 1, - ): - super().__init__() - hidden_dim = int(2 * hidden_dim / 3) - # custom dim factor multiplier - if ffn_dim_multiplier is not None: - hidden_dim = int(ffn_dim_multiplier * hidden_dim) - hidden_dim = multiple_of * ( - (hidden_dim + multiple_of - 1) // multiple_of - ) - - self.w1 = ColumnParallelLinear( - dim, hidden_dim, dtype, False, local_rank, world_size - ) - self.w2 = RowParallelLinear( - hidden_dim, dim, dtype, True, local_rank, world_size - ) - self.w3 = ColumnParallelLinear( - dim, hidden_dim, dtype, False, local_rank, world_size - ) - - def forward(self, x): - # self.w2(F.silu(self.w1(x)) * self.w3(x)) - x1 = self.w1(x) - x1 = Silu()(x1) - x2 = self.w3(x) - x3 = ark.mul(x1, x2) - x4 = self.w2(x3) - return x4 - - def initialize(self): - self.w1.initialize() - self.w2.initialize() - self.w3.initialize() - - -class Input(ark.Module): - def __init__( - self, batch_size: int, seq_len: int, dim: int, dtype: np.dtype - ): - super().__init__() - self.tensor = ark.tensor( - (batch_size, seq_len, dim), ark.DataType.from_numpy(dtype) - ) - self.data = None - - def forward(self): - return self.tensor - - def initialize(self): - if self.data is None: - self.data = np.random.uniform( - low=-0.1, high=0.1, size=self.tensor.shape() - ).astype(self.tensor.dtype().to_numpy()) - self.tensor.from_numpy(self.data) - - -def compare_results(result, ground_truth): - eps = np.finfo(result.dtype).eps - result = result.flatten() - ground_truth = ground_truth.flatten() - - max_value_idx = np.argmax(ground_truth) - min_value_idx = np.argmin(ground_truth) - - abs_diff = np.abs(result - ground_truth) - max_abs_diff_idx = np.argmax(abs_diff) - max_abs_diff = abs_diff[max_abs_diff_idx] - - abs_pt = np.abs(ground_truth) - rel_diff = abs_diff / (abs_pt + eps) - max_rel_diff_idx = np.argmax(rel_diff) - max_rel_diff = rel_diff[max_rel_diff_idx] - - # max rel_diff where abs_pt 
-    max_rel_diff_3_idx = np.argmax(rel_diff * (abs_pt > 1e-3))
-    max_rel_diff_3 = rel_diff[max_rel_diff_3_idx]
-
-    mean_square_error = np.mean(np.square(result - ground_truth))
-
-    # Test info as string
-
-    print(
-        f"Comparing ground truth vs results\n"
-        f"  max_value: {ground_truth[max_value_idx]} vs {result[max_value_idx]} at index {max_value_idx}\n"
-        f"  min_value: {ground_truth[min_value_idx]} vs {result[min_value_idx]} at index {min_value_idx}\n"
-        f"  max_abs_diff: {max_abs_diff:.4e} ({ground_truth[max_abs_diff_idx]} vs {result[max_abs_diff_idx]} at index {max_abs_diff_idx})\n"
-        f"  max_rel_diff: {max_rel_diff:.4e} ({ground_truth[max_rel_diff_idx]} vs {result[max_rel_diff_idx]} at index {max_rel_diff_idx})\n"
-        f"  max_rel_diff_3: {max_rel_diff_3:.4e} ({ground_truth[max_rel_diff_3_idx]} vs {result[max_rel_diff_3_idx]} at index {max_rel_diff_3_idx})\n"
-        f"  mean_square_error: {mean_square_error:.4e}\n"
-    )
-
-
-def config_rule_larger_tile(op: str, arch: str) -> str:
-    j = json.loads(op)
-    op_type = j["Type"]
-    if op_type == "Sigmoid" or op_type == "Mul":
-        pshape = j["ResultTensors"][0]["PaddedShape"]
-        if len(pshape) < 2 or pshape[-2] % 128 != 0 or pshape[-1] % 256 != 0:
-            return ""
-        num_tasks = pshape[-2] // 128 * pshape[-1] // 256
-        cfg = {
-            "NumWarps": 8,
-            "SramBytes": 0,
-            "Tile": [128, 256],
-            "NumTasks": num_tasks,
-        }
-        return json.dumps(cfg)
-    return ""
-
-
-def main(plan_path: str):
-    args = ModelArgs()
-    batch_size = 1
-    seq_len = 512
-    dtype = np.float16
-    seed = int(time.time())
-
-    print(f"seed: {seed}")
-    np.random.seed(seed)
-    ark.srand(seed)
-
-    InputModule = Input(batch_size, seq_len, args.dim, dtype)
-    input_tensor = InputModule()
-
-    # Declare model
-    FeedForwardModule = FeedForward(
-        dim=args.dim,
-        hidden_dim=4 * args.dim,
-        multiple_of=args.multiple_of,
-        ffn_dim_multiplier=args.ffn_dim_multiplier,
-        dtype=dtype,
-    )
-    output_tensor = FeedForwardModule(input_tensor)
-
-    # Write model.json
-    with open("model.json", "w") as f:
-        f.write(ark.Model.get_model().compress().serialize())
-
-    # Calculate default result
-    ground_truth = None
-    with ark.Runtime.get_runtime() as rt:
-        planner = ark.Planner()
-
-        # If this rule is installed, default planner will perform the same as
-        # `plan_1_larger_tile.json` on A100.
-        # planner.install_config_rule(config_rule_larger_tile)
-
-        plan = planner.plan()
-        with open("default_plan.json", "w") as f:
-            f.write(str(plan))
-        rt.launch(plan=plan)
-
-        # Initialize
-        InputModule.initialize()
-        FeedForwardModule.initialize()
-
-        # Calculate output
-        rt.run()
-        ground_truth = output_tensor.to_numpy()
-
-        # Measure throughput
-        iter = 100
-        ts = time.time()
-        rt.run(iter)
-        elapsed_ms = (time.time() - ts) * 1e3
-        print(
-            f"DefaultPlan elapsed time: total {elapsed_ms:.6f} ms, {elapsed_ms/iter:.6f} ms/iter"
-        )
-
-    # Run `plan_path` file if exists
-    if not Path(plan_path).is_file():
-        print(f"File {plan_path} does not exist. Exiting...")
Exiting...") - return - with ark.Runtime.get_runtime() as rt: - rt.launch(plan=ark.Plan.from_file(plan_path)) - - # Initialize - InputModule.initialize() - FeedForwardModule.initialize() - - # Calculate output - rt.run() - result = output_tensor.to_numpy() - - # Measure throughput - iter = 100 - ts = time.time() - rt.run(iter) - elapsed_ms = (time.time() - ts) * 1e3 - print( - f"Plan elapsed time: total {elapsed_ms:.6f} ms, {elapsed_ms/iter:.6f} ms/iter" - ) - - # Compare results - compare_results(result, ground_truth) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--plan_path", type=str, default="plan.json") - - args = parser.parse_args() - main(args.plan_path) diff --git a/examples/tutorial/planner_tutorial.py b/examples/tutorial/planner_tutorial.py index 6153aaf8e..8702f8929 100644 --- a/examples/tutorial/planner_tutorial.py +++ b/examples/tutorial/planner_tutorial.py @@ -54,14 +54,14 @@ def eval(tensor: ark.Tensor): return tensor.to_torch() -def perf(): +def perf(num_iter: int = 1000): with ark.Runtime() as rt: rt.launch() start = time.time() - rt.run(iter=1000) + rt.run(iter=num_iter) end = time.time() - return (end - start) / 1000 + return (end - start) / num_iter if __name__ == "__main__": diff --git a/examples/tutorial/quickstart_tutorial.py b/examples/tutorial/quickstart_tutorial.py index ebd3f8530..1fce51452 100644 --- a/examples/tutorial/quickstart_tutorial.py +++ b/examples/tutorial/quickstart_tutorial.py @@ -41,12 +41,6 @@ def quickstart_tutorial(): output_tensor_host, input_tensor_host + other_tensor_host ) - # Stop the ARK runtime (undo Runtime.launch()) - runtime.stop() - - # Reset the ARK runtime (free all resources) - runtime.reset() - print("Quickstart tutorial is successful!") diff --git a/python/ark/executor.py b/python/ark/executor.py new file mode 100644 index 000000000..14f0817a8 --- /dev/null +++ b/python/ark/executor.py @@ -0,0 +1,26 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from .core import CoreExecutor + + +__all__ = ["Executor"] + + +class ExecutorState: + executor: CoreExecutor = None + + +class Executor: + @staticmethod + def get() -> CoreExecutor: + if ExecutorState.executor is None: + ExecutorState.executor = CoreExecutor() + return ExecutorState.executor + + @staticmethod + def reset() -> None: + if ExecutorState.executor is None: + return + ExecutorState.executor.destroy() + ExecutorState.executor = None diff --git a/python/ark/init.py b/python/ark/init.py index bc10f1c93..07eb557b3 100644 --- a/python/ark/init.py +++ b/python/ark/init.py @@ -3,15 +3,13 @@ from . import core from .model import Model -from .runtime import RuntimeState +from .executor import Executor __all__ = ["init"] def init(): """Initializes ARK.""" + Executor.reset() Model.reset() - if RuntimeState.runtime is not None: - del RuntimeState.runtime - RuntimeState.runtime = None core.init() diff --git a/python/ark/model.py b/python/ark/model.py index a3073c24b..e103d4083 100644 --- a/python/ark/model.py +++ b/python/ark/model.py @@ -2,6 +2,7 @@ # Licensed under the MIT license. from typing import NewType +from . import log from .core import CoreModel @@ -39,7 +40,7 @@ def get_device_id(): """ Get the device id. """ - return _ModelState.device_id + return ModelState.device_id @staticmethod def set_rank(rank: int): @@ -61,8 +62,8 @@ def set_device_id(device_id: int): Set the device id. 
""" if device_id < 0: - raise ValueError("device_id must be non-negative") - _ModelState.device_id = device_id + raise log.InvalidUsageError("device_id must be non-negative") + ModelState.device_id = device_id @staticmethod def reset(): diff --git a/python/ark/module.py b/python/ark/module.py index 368f36cf7..9e06bcacf 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import logging import numpy as np from typing import Any, Dict from .tensor import Parameter diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 7111e3958..2ca735a2f 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -1,11 +1,11 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -import logging from enum import Enum +from . import log from .torch import torch -from .core import CoreExecutor +from .executor import Executor from .planner import Planner, Plan from .model import Model from typing import Dict @@ -14,24 +14,14 @@ __all__ = ["Runtime"] -class RuntimeState: - """ - The RuntimeState class is used to store the state of the model. - """ - - runtime = None - - class Runtime: """ Convenience class for running a model. """ - _loop_mode: bool = True - - class State(Enum): + class StateCode(Enum): """ - Runtime states. + Runtime state code. """ Init = 0 @@ -39,41 +29,34 @@ class State(Enum): Running = 2 def __init__(self): - self.executor: CoreExecutor = CoreExecutor() - self.state: Runtime.State = Runtime.State.Init - self.loop_mode = True - RuntimeState.runtime = self + self.loop_mode: bool = True + self.state: Runtime.StateCode = Runtime.StateCode.Init - @staticmethod - def get_runtime() -> "Runtime": - """ - Get the runtime. - If the runtime does not exist, create a new runtime. - """ - if RuntimeState.runtime is None: - RuntimeState.runtime = Runtime() - return RuntimeState.runtime - - def __enter__(self): + def __enter__(self) -> "Runtime": return self def __exit__(self, exc_type, exc_val, exc_tb): - self.reset() + if self.launched(): + self.stop() + + def __del__(self): + if self.launched(): + self.stop() def launched(self) -> bool: """ Check if the runtime is launched. """ return ( - self.state == Runtime.State.LaunchedNotRunning - or self.state == Runtime.State.Running + self.state == Runtime.StateCode.LaunchedNotRunning + or self.state == Runtime.StateCode.Running ) def running(self) -> bool: """ Check if the runtime is running. 
""" - return self.state == Runtime.State.Running + return self.state == Runtime.StateCode.Running def launch( self, @@ -91,8 +74,7 @@ def launch( if device_id == -1: device_id = Model.get_device_id() elif device_id < 0: - logging.error(f"Invalid device_id: {device_id}") - raise ValueError(f"Invalid device_id: {device_id}") + raise log.InvalidUsageError(f"Invalid device_id: {device_id}") plan = Planner(device_id).plan() if plan is None else plan plan_str = str(plan) if self.launched(): @@ -101,40 +83,37 @@ def launch( for ark_tensor in list(tensor_mappings.keys()): torch_tensor = tensor_mappings[ark_tensor] if not isinstance(torch_tensor, torch.Tensor): - raise ValueError("Must bind PyTorch tensor") + raise log.InvalidUsageError("Must bind PyTorch tensor") internal_ark_tensor = ark_tensor._tensor tensor_mappings[internal_ark_tensor] = torch_tensor.data_ptr() del tensor_mappings[ark_tensor] # Recompile if the previous launch was not compiled with the same info # or if this is the first launch - if ( - plan_str != self.executor.plan() - or device_id != self.executor.device_id() - ): - self.executor.compile(plan_str, device_id) - self.executor.launch(tensor_mappings, stream, loop_mode) - self.state = Runtime.State.LaunchedNotRunning - Runtime._loop_mode = loop_mode + exe = Executor.get() + if plan_str != exe.plan() or device_id != exe.device_id(): + exe.compile(plan_str, device_id) + exe.launch(tensor_mappings, stream, loop_mode) + self.state = Runtime.StateCode.LaunchedNotRunning + self.loop_mode = loop_mode def run(self, iter=1, non_blocking=False, tensor_mappings={}): """ Run the ARK program for iter iterations and wait for the kernel to finish. """ - if Runtime._loop_mode and tensor_mappings: - raise ValueError( + if self.loop_mode and tensor_mappings: + raise log.InvalidUsageError( "`loop_mode` argument when calling `runtime.launch` " "must be set to false in order to pass non-empty " "tensor mappings in `runtime.run`." ) - if self.state != Runtime.State.LaunchedNotRunning: - logging.error(f"ARK runtime is not launched") - raise RuntimeError(f"ARK runtime is not launched") - self.state = Runtime.State.Running + if self.state != Runtime.StateCode.LaunchedNotRunning: + raise log.InvalidUsageError(f"ARK runtime is not launched") + self.state = Runtime.StateCode.Running ph_map = {} for ark_tensor in list(tensor_mappings.keys()): t = tensor_mappings[ark_tensor] ph_map[ark_tensor._tensor] = t.data_ptr() - self.executor.run(iter, ph_map) + Executor.get().run(iter, ph_map) if not non_blocking: self.wait() @@ -142,20 +121,19 @@ def barrier(self): """ Barrier for all ranks. """ - if self.state != Runtime.State.LaunchedNotRunning: - logging.error("ARK runtime is not launched") - raise RuntimeError("ARK runtime is not launched") - self.executor.barrier() + if self.state != Runtime.StateCode.LaunchedNotRunning: + raise log.InvalidUsageError("ARK runtime is not launched") + Executor.get().barrier() def wait(self): """ Wait for the kernel to finish. """ - if self.state != Runtime.State.Running: - logging.warning(f"ARK runtime is not running, skip waiting") + if self.state != Runtime.StateCode.Running: + log.WARN(f"ARK runtime is not running, skip waiting") return - self.executor.wait() - self.state = Runtime.State.LaunchedNotRunning + Executor.get().wait() + self.state = Runtime.StateCode.LaunchedNotRunning def stop(self) -> float: """ @@ -163,18 +141,8 @@ def stop(self) -> float: Once this is called, we need to call `launch()` again to run the model again. 
""" if not self.launched(): - logging.warning(f"ARK runtime is never launched, skip stopping") + log.WARN(f"ARK runtime is never launched, skip stopping") return - elapsed = self.executor.stop() - self.state = Runtime.State.LaunchedNotRunning + elapsed = Executor.get().stop() + self.state = Runtime.StateCode.LaunchedNotRunning return elapsed - - def reset(self): - """ - Reset the runtime. - """ - if self.launched(): - self.stop() - self.executor.destroy() - self.executor = _Executor() - self.state = Runtime.State.Init diff --git a/python/ark/serialize.py b/python/ark/serialize.py index 93473202e..584111825 100644 --- a/python/ark/serialize.py +++ b/python/ark/serialize.py @@ -2,7 +2,7 @@ # Licensed under the MIT license. import pickle -import logging +from . import log def save(state_dict, state_dict_file_path: str): @@ -10,9 +10,7 @@ def save(state_dict, state_dict_file_path: str): Save the state_dict of a module to a file """ if not isinstance(state_dict, dict): - logging.warn( - "Warning: Invalid state_dict saved to", state_dict_file_path - ) + log.WARN(f"Invalid state_dict saved to {state_dict_file_path}") with open(state_dict_file_path, "wb") as f: pickle.dump(state_dict, f) @@ -24,5 +22,5 @@ def load(state_dict_file_path: str): with open(state_dict_file_path, "rb") as f: state_dict = pickle.load(f) if not isinstance(state_dict, dict): - logging.warn("Warning: Invalid state_dict file") + log.WARN("Invalid state_dict file") return state_dict diff --git a/python/ark/tensor.py b/python/ark/tensor.py index c6250b953..f876f8918 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -4,10 +4,11 @@ import numpy as np from typing import Callable, Iterable, List, Union, Type +from . import log from .core import CoreDims, CoreTensor, NullTensor from .torch import torch, _no_torch from .data_type import DataType, fp32 -from .runtime import Runtime +from .executor import Executor from .model import Model __all__ = ["Dims", "Tensor", "Parameter", "NullTensor"] @@ -92,12 +93,7 @@ def data_ptr(self) -> int: """ Returns the underlying data pointer. """ - rt = Runtime.get_runtime() - if not rt.launched(): - raise RuntimeError( - "`Tensor.data_ptr()` is usable only after you call `Runtime.launch()`." - ) - return rt.executor.tensor_address(self._tensor) + return Executor.get().tensor_address(self._tensor) def is_external(self) -> bool: """ @@ -105,6 +101,22 @@ def is_external(self) -> bool: """ return self._tensor.is_external() + def _raise_if_no_data(self): + if self.data_ptr() != 0: + return + if self.is_external(): + raise log.InvalidUsageError( + "Tried to access data of an external tensor that does not " + "have data set. This is likely because this tensor is a " + "placeholder and you have not set the data." + ) + raise log.InvalidUsageError( + "Tried to access data of a tensor that is not allocated yet. " + "This is likely due to either you have not called " + "`Runtime.launch()` for the model or the tensor is unused " + "in the model." + ) + def to_numpy( self, ndarray: np.ndarray = None, stream: int = 0 ) -> np.ndarray: @@ -113,74 +125,66 @@ def to_numpy( a new numpy array will be created. If the tensor is not allocated, an empty numpy array without the data buffer will be returned. """ + self._raise_if_no_data() np_type = self.dtype().to_numpy() if np_type is None: - raise ValueError( + raise log.InvalidUsageError( f"Tensor data type {self.dtype().__name__} is not supported by numpy." 
+    def _raise_if_no_data(self):
+        if self.data_ptr() != 0:
+            return
+        if self.is_external():
+            raise log.InvalidUsageError(
+                "Tried to access data of an external tensor that does not "
+                "have data set. This is likely because this tensor is a "
+                "placeholder and you have not set the data."
+            )
+        raise log.InvalidUsageError(
+            "Tried to access data of a tensor that is not allocated yet. "
+            "This is likely due to either you have not called "
+            "`Runtime.launch()` for the model or the tensor is unused "
+            "in the model."
+        )
+
     def to_numpy(
         self, ndarray: np.ndarray = None, stream: int = 0
     ) -> np.ndarray:
         """
         Copies the tensor to a host numpy array. If `ndarray` is not provided,
         a new numpy array will be created. If the tensor is not allocated,
         an empty numpy array without the data buffer will be returned.
         """
+        self._raise_if_no_data()
         np_type = self.dtype().to_numpy()
         if np_type is None:
-            raise ValueError(
+            raise log.InvalidUsageError(
                 f"Tensor data type {self.dtype().__name__} is not supported by numpy."
             )
-        rt = Runtime.get_runtime()
-        if not rt.launched():
-            raise RuntimeError(
-                "Tensor is not allocated yet. `Tensor.to_numpy()` is "
-                "usable only after you call `Runtime.launch()`."
-            )
-        elif ndarray is None:
+        if ndarray is None:
             ndarray = np.zeros(self.shape(), dtype=np_type)
         elif not ndarray.flags["C_CONTIGUOUS"]:
-            raise ValueError("ndarray is not contiguous in memory")
+            raise log.InvalidUsageError("ndarray is not contiguous in memory")
         elif ndarray.shape != self.shape():
-            raise ValueError("ndarray shape does not match the tensor")
+            raise log.InvalidUsageError(
+                "ndarray shape does not match the tensor"
+            )
         elif ndarray.dtype != np_type:
-            raise ValueError("ndarray dtype does not match the tensor")
+            raise log.InvalidUsageError(
+                "ndarray dtype does not match the tensor"
+            )
         elif ndarray.nbytes != self.nelems() * self.dtype().element_size():
-            raise ValueError("ndarray size does not match the tensor")
-        rt.executor.tensor_read(self._tensor, ndarray, stream)
+            raise log.InvalidUsageError(
+                "ndarray size does not match the tensor"
+            )
+        Executor.get().tensor_read(self._tensor, ndarray, stream)
         return ndarray
 
     def from_numpy(self, ndarray: np.ndarray, stream: int = 0) -> "Tensor":
         """
         Copies the tensor from a host numpy array to the device.
         """
-        rt = Runtime.get_runtime()
-        if not rt.launched():
-            raise RuntimeError(
-                "Tensor is not allocated yet. `Tensor.from_numpy()` is "
-                "usable only after you call `Runtime.launch()`."
-            )
+        self._raise_if_no_data()
         ndarray = ndarray.astype(self.dtype().to_numpy())
         if not ndarray.flags["C_CONTIGUOUS"]:
             ndarray = np.ascontiguousarray(ndarray)
         if ndarray.nbytes != self.nelems() * self.dtype().element_size():
-            raise ValueError("ndarray size does not match the tensor")
-        rt.executor.tensor_write(self._tensor, ndarray, stream)
+            raise log.InvalidUsageError(
+                "ndarray size does not match the tensor"
+            )
+        Executor.get().tensor_write(self._tensor, ndarray, stream)
         return self
 
     def to_dlpack(self):
         """
         Returns a DLPack tensor that shares the same memory with the device tensor.
         """
-        rt = Runtime.get_runtime()
-        if not rt.launched():
-            raise RuntimeError(
-                "Tensor is not allocated yet. `Tensor.to_dlpack()` is "
-                "usable only after you call `Runtime.launch()`."
-            )
-        return rt.executor.tensor_to_dlpack(self._tensor)
+        self._raise_if_no_data()
+        return Executor.get().tensor_to_dlpack(self._tensor)
 
     @staticmethod
     def from_dlpack(ext_tensor) -> "Tensor":
         """
         Copies the tensor from a DLPack tensor to the device.
         """
-        # return Tensor(_Tensor(ext_tensor))
-        raise NotImplementedError("from_dlpack is not implemented yet")
+        raise log.UnsupportedError("from_dlpack is not implemented yet")
 
     def to_torch(self) -> torch.Tensor:
         """
         Returns a torch tensor that shares the same memory with the device tensor.
         """
         if _no_torch:
-            raise ImportError("torch is not available")
+            raise log.SystemError("torch is not available")
         dl_capsule = self.to_dlpack()
         torch_view = torch.utils.dlpack.from_dlpack(dl_capsule)
         # Keep dl_capsule alive not to free the memory
@@ -193,11 +197,11 @@ def from_torch(tensor: torch.Tensor) -> "Tensor":
         Returns an ARK tensor that shares the same memory with the torch
         tensor.
""" if _no_torch: - raise ImportError("torch is not available") + raise log.SystemError("torch is not available") elif not tensor.is_contiguous(): - raise ValueError("Torch tensor must be contiguous.") + raise log.InvalidUsageError("Torch tensor must be contiguous.") elif tensor.device.type == "cpu": - raise ValueError("Torch tensor must be on a device.") + raise log.InvalidUsageError("Torch tensor must be on a device.") # TODO: support strides and offsets ark_tensor = Tensor( _cpp_tensor( @@ -217,19 +221,16 @@ def copy( Copies data into this tensor. The data type may differ, but the size must match. """ - rt = Runtime.get_runtime() - if not rt.launched(): - raise RuntimeError( - "Tensor is not allocated yet. `Tensor.copy()` is " - "usable only after you call `Runtime.launch()`." - ) + self._raise_if_no_data() tensor_bytes = self.nelems() * self.dtype().element_size() if isinstance(data, torch.Tensor): if not data.is_contiguous(): data = data.contiguous() if data.numel() * data.element_size() != tensor_bytes: - raise ValueError("data size does not match the tensor") - rt.executor.tensor_write( + raise log.InvalidUsageError( + "data size does not match the tensor" + ) + Executor.get().tensor_write( self._tensor, data.data_ptr(), tensor_bytes, @@ -243,10 +244,14 @@ def copy( if not data.flags["C_CONTIGUOUS"]: data = np.ascontiguousarray(data) if data.nbytes != tensor_bytes: - raise ValueError("data size does not match the tensor") - rt.executor.tensor_write(self._tensor, data, stream) + raise log.InvalidUsageError( + "data size does not match the tensor" + ) + Executor.get().tensor_write(self._tensor, data, stream) else: - raise ValueError("data must be a numpy array or a torch tensor") + raise log.InvalidUsageError( + "data must be a numpy array or a torch tensor" + ) return self def initialize(self) -> "Tensor": @@ -284,13 +289,13 @@ def __init__( _tensor, requires_grad=tensor.requires_grad, ) - elif isinstance(tensor, _Tensor): + elif isinstance(tensor, CoreTensor): _tensor = tensor self.torch_param = None self.staged_tensor = None Tensor.__init__(self, _tensor, requires_grad=False) else: - raise TypeError( + raise log.InvalidUsageError( "tensor must be an ARK tensor or a torch.nn.Parameter" ) @@ -299,15 +304,19 @@ def update_gradient(self, ark_tensor: Tensor): Stages an ARK tensor to be used for updating the gradient of its associated parameter. 
""" if _no_torch: - raise ImportError("torch is not available") + raise log.SystemError("torch is not available") if self.torch_param is None: - raise ValueError( + raise log.InvalidUsageError( "there is no PyTorch parameter associated with this ARK parameter" ) if not self.torch_param.requires_grad: - raise ValueError("parameter does not require gradient updates") + raise log.InvalidUsageError( + "parameter does not require gradient updates" + ) if ark_tensor is None or not isinstance(ark_tensor, Tensor): - raise ValueError("cannot use non-ARK tensor to update ARK gradient") + raise log.InvalidUsageError( + "cannot use non-ARK tensor to update ARK gradient" + ) self.staged_tensor = ark_tensor @@ -326,13 +335,21 @@ def _cpp_tensor( name: str = "", ) -> Tensor: if not _is_list_or_tuple(shape): - raise ValueError("shape should be a list or tuple of integers") + raise log.InvalidUsageError( + "shape should be a list or tuple of integers" + ) if not _is_list_or_tuple(strides): - raise ValueError("strides should be a list or tuple of integers") + raise log.InvalidUsageError( + "strides should be a list or tuple of integers" + ) if not _is_list_or_tuple(offsets): - raise ValueError("offsets should be a list or tuple of integers") + raise log.InvalidUsageError( + "offsets should be a list or tuple of integers" + ) if not _is_list_or_tuple(padded_shape): - raise ValueError("padded_shape should be a list or tuple of integers") + raise log.InvalidUsageError( + "padded_shape should be a list or tuple of integers" + ) # only support tensors with up to 4 dimensions if ( len(shape) > 4 diff --git a/python/unittest/test_runtime.py b/python/unittest/test_runtime.py index a04f193cf..969f6140e 100644 --- a/python/unittest/test_runtime.py +++ b/python/unittest/test_runtime.py @@ -7,11 +7,12 @@ @pytest_ark() def test_runtime_empty(): - with ark.Runtime.get_runtime() as rt: + with ark.Runtime() as rt: rt.launch() rt.run() rt.stop() + @pytest_ark() def test_runtime_init(): M, N = 64, 64 @@ -42,7 +43,6 @@ def test_runtime_init(): np.testing.assert_allclose( final_output_host, output_tensor_host + new_tensor_host ) - runtime.reset() @pytest_ark() @@ -70,4 +70,3 @@ def test_runtime_reuse_plans(): np.testing.assert_allclose( output_tensor_host, input_tensor_host + other_tensor_host ) - runtime.reset() diff --git a/python/unittest/test_tensor.py b/python/unittest/test_tensor.py index 213264e3b..799c1f60f 100644 --- a/python/unittest/test_tensor.py +++ b/python/unittest/test_tensor.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. -from unittest_common import ark, pytest_ark +from common import ark, pytest_ark @pytest_ark(need_torch=True) From 19d56dd8be064630774b5756abfd2973e9c7218f Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 6 Sep 2024 05:29:06 +0000 Subject: [PATCH 084/106] updates --- python/ark/runtime.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/ark/runtime.py b/python/ark/runtime.py index 2ca735a2f..af1eb995e 100644 --- a/python/ark/runtime.py +++ b/python/ark/runtime.py @@ -4,6 +4,7 @@ from enum import Enum from . 
+from .tensor import Tensor
 from .torch import torch
 from .executor import Executor
 from .planner import Planner, Plan
 from .model import Model
@@ -96,7 +97,12 @@ def launch(
         self.state = Runtime.StateCode.LaunchedNotRunning
         self.loop_mode = loop_mode
 
-    def run(self, iter=1, non_blocking=False, tensor_mappings={}):
+    def run(
+        self,
+        iter: int = 1,
+        non_blocking: bool = False,
+        tensor_mappings: Dict[Tensor, torch.Tensor] = {},
+    ):
         """
         Run the ARK program for iter iterations and wait for the kernel to finish.
         """
From d891914c1e1dd3d46b93b81c84b6d61e062967b5 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 17 Sep 2024 21:12:05 +0000
Subject: [PATCH 085/106] codegen fix wip

---
 ark/codegen.cpp | 128 +++++++++++++++++++++++++++++++-----------------
 1 file changed, 84 insertions(+), 44 deletions(-)

diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index 23045b0c7..7ab2f5635 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -60,7 +60,7 @@ class CodeGenerator::Impl {
     ~Impl() = default;
 
    private:
-    std::string def_op(const Json &op_json, size_t task_id, size_t op_idx);
+    std::pair<std::string, size_t> def_op(const Json &op_json);
 
     std::string def_task(const Json &task_json);
 
@@ -80,6 +80,8 @@ class CodeGenerator::Impl {
    protected:
    friend class CodeGenerator;
 
+    std::set<size_t> op_hashes_;
+    std::set<size_t> task_hashes_;
     std::map<size_t, size_t> buffer_id_to_offset_;
     std::set<size_t> extra_buffer_ids_;
     std::string name_;
@@ -183,7 +185,10 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
     const std::string &template_path =
         ark_root + "/include/kernels/kernel_template.in";
     if (!is_file(template_path)) {
-        ERR(InternalError, "kernel template file not found: ", template_path);
+        ERR(InvalidUsageError,
+            "kernel template file not found: ", template_path,
+            ". Please make sure the ARK_ROOT environment variable is set "
+            "correctly.");
     }
 
     // Generate the global arguments
@@ -224,92 +229,126 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
     code_ = replace(template_code, replacements);
 }
 
-std::string CodeGenerator::Impl::def_op(const Json &op_json, size_t task_id,
-                                        size_t op_idx) {
+std::pair<std::string, size_t> CodeGenerator::Impl::def_op(
+    const Json &op_json) {
     auto op = ModelOp::deserialize(op_json);
     auto impl_name = op->impl_name(op_json["Config"]);
     auto impl_args = op->impl_args(op_json["Config"]);
-    std::stringstream ss;
-    ss << "__forceinline__ __device__ void t" << task_id << "_o" << op_idx
-       << "(";
+    std::stringstream ss_desc;
     size_t arg_idx = 0;
     for (auto &arg : impl_args) {
         if (arg.type_name() == "TENSOR") {
             auto tns = arg.value<ModelTensorRef>();
-            ss << tns->data_type()->type_str() << "*";
+            ss_desc << tns->data_type()->type_str() << "*";
         } else if (arg.type_name() == "OFFSET") {
-            ss << "uint64_t";
+            ss_desc << "uint64_t";
         } else {
-            ss << arg.type_str();
+            ss_desc << arg.type_str();
         }
-        ss << " _" << arg_idx++ << ", ";
+        ss_desc << " _" << arg_idx++ << ", ";
    }
-    ss << "int _idx, int _spw) {\n  " << impl_name << "(";
+    ss_desc << "int _idx, int _spw) {\n  " << impl_name << "(";
     for (size_t i = 0; i < impl_args.size(); ++i) {
-        ss << "_" << i << ", ";
+        ss_desc << "_" << i << ", ";
     }
-    ss << "_idx, _spw);\n}\n";
-    return ss.str();
+    ss_desc << "_idx, _spw);\n}\n";
+    auto desc_str = ss_desc.str();
+    size_t op_hash = std::hash<std::string>{}(desc_str);
+    std::stringstream ss;
+    ss << "__forceinline__ __device__ void __op_" << std::hex << op_hash
+       << std::dec << "(";
+    ss << desc_str;
+    return {ss.str(), op_hash};
 }
 
 std::string CodeGenerator::Impl::def_task(const Json &task_json) {
     std::stringstream ss;
-    size_t op_idx = 0;
+    std::stringstream ss_hash_concat;
+    std::vector<size_t> op_hash_list;
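+    // Collect per-op definition hashes: ops already emitted are skipped, and
+    // the concatenated hashes identify this task's op sequence.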
     for (auto &op_json : task_json["Ops"]) {
-        ss << this->def_op(op_json, task_json["Id"], op_idx++);
+        auto [def_str, hash] = this->def_op(op_json);
+        if (op_hashes_.find(hash) == op_hashes_.end()) {
+            ss << def_str;
+            op_hashes_.insert(hash);
+        }
+        ss_hash_concat << std::hex << hash;
+        op_hash_list.push_back(hash);
     }
-    ss << "__device__ void t" << task_json["Id"]
-       << "(char *_buf, int _idx, int _spw, @GLOBAL_ARGS@) {\n";
+    size_t task_hash = std::hash<std::string>{}(ss_hash_concat.str());
+    std::stringstream ss_desc;
     auto &buf_reg = BufferRegistry::get_instance();
-    op_idx = 0;
+    size_t op_idx = 0;
+    std::map<std::string, size_t> ptr_str_to_index;
+    std::vector<std::string> ptr_str_list;
     for (auto &op_json : task_json["Ops"]) {
         auto op = ModelOp::deserialize(op_json);
         auto impl_args = op->impl_args(op_json["Config"]);
-        ss << "  t" << task_json["Id"] << "_o" << op_idx++ << "(";
-        for (size_t i = 0; i < impl_args.size(); ++i) {
-            auto &arg = impl_args[i];
+        ss_desc << "  __op_" << std::hex << op_hash_list[op_idx++] << std::dec
+                << "(";
+        for (auto &arg : impl_args) {
             if (arg.type_name() == "TENSOR") {
                 auto tns = arg.value<ModelTensorRef>();
                 size_t buffer_id = tns->buffer()->id();
                 auto it = buffer_id_to_offset_.find(buffer_id);
                 auto buf_info = buf_reg.get(buffer_id);
+                std::string ptr_str;
                 if ((buf_info && buf_info->is_external) ||
                     (it == buffer_id_to_offset_.end())) {
-                    ss << "(" << tns->data_type()->type_str() << "*)_ext_buf_"
-                       << buffer_id;
+                    ptr_str = "_ext_buf_" + std::to_string(buffer_id);
                 } else {
                     size_t buffer_offset;
                     buffer_offset = it->second;
                     size_t offset = buffer_offset + ModelOffset(tns).value();
-                    ss << "(" << tns->data_type()->type_str() << "*)&_buf["
-                       << offset << "]";
+                    ptr_str = "&_buf[" + std::to_string(offset) + "]";
                 }
+                size_t ptr_idx;
+                if (ptr_str_to_index.find(ptr_str) == ptr_str_to_index.end()) {
+                    ptr_idx = ptr_str_to_index.size();
+                    ptr_str_to_index[ptr_str] = ptr_idx;
+                    ptr_str_list.push_back(ptr_str);
+                } else {
+                    ptr_idx = ptr_str_to_index[ptr_str];
+                }
+                ss_desc << "(" << tns->data_type()->type_str() << "*)_"
+                        << ptr_idx;
             } else if (arg.type_name() == "OFFSET") {
                 auto moff = arg.value<ModelOffset>();
                 size_t buffer_id = moff.buffer_id();
                 auto buf_info = buf_reg.get(buffer_id);
                 if (buf_info && buf_info->is_external) {
-                    size_t offset = moff.value();
-                    ss << "(uint64_t)((char*)_ext_buf_" << buffer_id << " + "
-                       << offset << ")";
-                } else {
-                    size_t buffer_offset;
-                    auto it = buffer_id_to_offset_.find(buffer_id);
-                    if (it == buffer_id_to_offset_.end()) {
-                        ERR(InternalError, "buffer ID not found: ", buffer_id);
-                    }
-                    buffer_offset = it->second;
-                    size_t offset = buffer_offset + moff.value();
-                    ss << offset;
+                    ERR(InternalError, "cannot offset external buffer");
+                }
+                size_t buffer_offset;
+                auto it = buffer_id_to_offset_.find(buffer_id);
+                if (it == buffer_id_to_offset_.end()) {
+                    ERR(InternalError, "buffer ID not found: ", buffer_id);
                 }
+                buffer_offset = it->second;
+                size_t offset = buffer_offset + moff.value();
+                ss_desc << offset;
             } else {
-                ss << arg.serialize().begin().value();
+                ss_desc << arg.serialize().begin().value();
             }
-            ss << ", ";
+            ss_desc << ", ";
         }
-        ss << "_idx, _spw);\n";
+        ss_desc << "_idx, _spw);\n";
     }
-    ss << "}\n";
+    if (task_hashes_.find(task_hash) == task_hashes_.end()) {
+        ss << "__device__ void __task_" << std::hex << task_hash << std::dec
+           << "(";
+        for (size_t i = 0; i < ptr_str_list.size(); ++i) {
+            ss << "void *_" << i << ", ";
+        }
+        ss << "int _idx, int _spw) {\n" << ss_desc.str() << "}\n";
+        task_hashes_.insert(task_hash);
+    }
+    ss << "__forceinline__ __device__ void __t" << task_json["Id"]
+       << "(char *_buf, int _idx, int _spw, @GLOBAL_ARGS@) {\n";
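+    // Tasks whose op sequences hash identically share one __task_ body; the
+    // per-task wrapper emitted here only rebinds the buffer pointer arguments.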
+    ss << "  __task_" << std::hex << task_hash << std::dec << "(";
+    for (auto &ptr_str : ptr_str_list) {
+        ss << ptr_str << ", ";
+    }
+    ss << "_idx, _spw);\n}\n";
     return ss.str();
 }
 
@@ -332,7 +371,8 @@ std::string CodeGenerator::Impl::task_seq(
     ss << "task_seq<" << proc_b << ", " << proc_e << ", " << proc_s << ", "
        << proc_cur << ", " << task_b << ", " << task_e << ", " << task_s << ", "
        << task_gran << ", " << num_slots << ", " << slot_num_warps << ", "
-       << slot_sram_bytes << ", t" << task_id << ">(_buf, @FUNCTION_ARGS@);\n";
+       << slot_sram_bytes << ", __t" << task_id
+       << ">(_buf, @FUNCTION_ARGS@);\n";
     return ss.str();
 }
 
From 706a99509a560bfee260300fb69adc4231ff4393 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Sun, 22 Sep 2024 07:02:56 +0000
Subject: [PATCH 086/106] updates

---
 ark/api/planner.cpp                |  34 +++-
 ark/include/kernels/gemm_ck.h      |   7 +-
 ark/include/kernels/gemm_cutlass.h |  11 +-
 ark/include/kernels/matmul.h       |  12 +-
 ark/model/model_op.hpp             |   2 +-
 ark/ops/ops_arithmetic_test.cpp    |  16 ++
 ark/ops/ops_broadcast.cpp          |   6 +-
 ark/ops/ops_embedding.cpp          |   6 +-
 ark/ops/ops_matmul.cpp             |  56 ++----
 ark/ops/ops_reshape.cpp            |  45 +++--
 ark/ops/ops_reshape_test.cpp       |  38 ++++
 ark/ops/ops_test_common.cpp        |  15 +-
 ark/ops/ops_transpose.cpp          |  18 +-
 ark/ops/ops_transpose_test.cpp     | 119 +++++++++++
 docs/plan_file.md                  |  33 ++--
 examples/llama/model.py            | 306 +++++++++++++++++++++++++----
 examples/llama/model_test.py       |  20 +-
 examples/llama/test.py             |  56 ++++++
 python/ark/data_type.py            |   7 +-
 python/ark/module.py               |   3 +
 python/ark/ops.py                  |   4 +-
 python/ark/tensor.py               |   2 +-
 22 files changed, 661 insertions(+), 155 deletions(-)
 create mode 100644 examples/llama/test.py

diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index d36f33cbe..e1dce34ac 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -11,6 +11,7 @@
 #include "model/model_json.hpp"
 #include "model/model_node.hpp"
 #include "model/model_op.hpp"
+#include "model/model_tensor.hpp"
 #include "range.hpp"
 
 namespace ark {
@@ -166,11 +167,40 @@ std::string Planner::Impl::plan(bool pretty) const {
             config = op->default_config(gpu_info.arch);
         }
         check_config_field(op, config, "NumWarps");
-        check_config_field(op, config, "NumTasks");
         check_config_field(op, config, "SramBytes");
         size_t num_warps = config["NumWarps"];
-        size_t num_tasks = config["NumTasks"];
         size_t sram_bytes = config["SramBytes"];
+        size_t num_tasks;
+
+        if (!config.contains("NumTasks")) {
+            std::stringstream ss;
+            ss << "Result shape is not divisible by the tile. Op: "
+               << op->serialize().dump();
+            auto not_divided_error = ss.str();
+
+            auto &result_tensors = op->result_tensors();
+            if (result_tensors.empty() || !config.contains("Tile")) {
+                num_tasks = 0;
+            } else {
+                const std::vector<DimType> tile_vec = config["Tile"];
+                auto tile = Dims(tile_vec);
+                auto &result_shape = result_tensors[0]->padded_shape();
+                if (result_shape.ndims() < tile.ndims()) {
+                    ERR(PlanError, not_divided_error);
+                }
+                auto tile4 = tile.dims4();
+                auto result_shape4 = result_shape.dims4();
+                num_tasks = 1;
+                for (int i = 0; i < tile4.ndims(); i++) {
+                    if (result_shape4[i] % tile4[i] != 0) {
+                        ERR(PlanError, not_divided_error);
+                    }
+                    num_tasks *= result_shape4[i] / tile4[i];
+                }
+            }
+        } else {
+            num_tasks = config["NumTasks"];
+        }
         size_t granularity = config.value("Granularity", 1);
         auto ctx_id = get_context(node, "Id");
diff --git a/ark/include/kernels/gemm_ck.h b/ark/include/kernels/gemm_ck.h
index 4054f2d37..478419691 100644
--- a/ark/include/kernels/gemm_ck.h
+++ b/ark/include/kernels/gemm_ck.h
@@ -376,7 +376,7 @@ template <typename DataTypeA, typename DataTypeB, typename DataTypeC,
           int LeadingDimA, int LeadingDimB, int LeadingDimC,
           int ProblemSizeM, int ProblemSizeN, int ProblemSizeK,
-          int TileSizeM, int TileSizeN, int TileSizeK, typename UnitOp>
+          int TileSizeM, int TileSizeN, typename UnitOp>
 struct CkGemm {
     static_assert(LeadingDimA >= 0, "");
     static_assert(LeadingDimB >= 0, "");
@@ -386,7 +386,6 @@ struct CkGemm {
     static_assert(ProblemSizeK >= 0, "");
     static_assert(TileSizeM >= 0, "");
     static_assert(TileSizeN >= 0, "");
-    static_assert(TileSizeK >= 0, "");
 
     using AccumulateType = fp32;
 
@@ -514,13 +513,13 @@ template <typename DataTypeA, typename DataTypeB, typename DataTypeC,
           int LeadingDimA, int LeadingDimB, int LeadingDimC,
           int ProblemSizeM, int ProblemSizeN, int ProblemSizeK,
-          int TileSizeM, int TileSizeN, int TileSizeK, typename UnitOp>
+          int TileSizeM, int TileSizeN, typename UnitOp>
 DEVICE void gemm_ck(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
                     int smem_per_warp) {
     using CkGemm = CkGemm<DataTypeA, DataTypeB, DataTypeC, LeadingDimA,
                           LeadingDimB, LeadingDimC, ProblemSizeM, ProblemSizeN,
-                          ProblemSizeK, TileSizeM, TileSizeN, TileSizeK,
-                          UnitOp>;
+                          ProblemSizeK, TileSizeM, TileSizeN, UnitOp>;
     CkGemm gemm;
     gemm.Run(C, A, B, uop_idx, smem_per_warp);
 }
diff --git a/ark/include/kernels/gemm_cutlass.h b/ark/include/kernels/gemm_cutlass.h
index c5e8c7579..ae13e4c5b 100644
--- a/ark/include/kernels/gemm_cutlass.h
+++ b/ark/include/kernels/gemm_cutlass.h
@@ -200,7 +200,7 @@ template <typename DataTypeA, typename DataTypeB, typename DataTypeC,
           int LeadingDimA, int LeadingDimB, int LeadingDimC,
           int ProblemSizeM, int ProblemSizeN, int ProblemSizeK,
-          int TileSizeM, int TileSizeN, int TileSizeK, typename UnitOp>
+          int TileSizeM, int TileSizeN, typename UnitOp>
 DEVICE void gemm_cuda(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
                       int smem_per_warp) {
 #if (ARK_TARGET_CUDA_ARCH == 60)
@@ -223,6 +223,7 @@ DEVICE void gemm_cuda(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
                           cutlass::layout::RowMajor>::type;
     using LayoutC = cutlass::layout::RowMajor;
 
+    static constexpr int TileSizeK = std::is_same_v<DataTypeA, float> ? 32 : 64;
     using GemmKernel = typename ark::GemmConfiguration<
         UnitOp, cutlass::arch::OpClassTensorOp, ArchTag, DataTypeA, LayoutA,
         DataTypeB, LayoutB, DataTypeC, LayoutC,
@@ -404,7 +405,7 @@ template <typename DataTypeA, typename DataTypeB, typename DataTypeC,
           int LeadingDimA, int LeadingDimB, int LeadingDimC,
           int ProblemSizeM, int ProblemSizeN, int ProblemSizeK,
-          int TileSizeM, int TileSizeN, int TileSizeK, typename UnitOp>
+          int TileSizeM, int TileSizeN, typename UnitOp>
 DEVICE void gemm_cutlass(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
                          int smem_per_warp) {
     using CutDataTypeA = typename cutlass::platform::conditional<
@@ -433,13 +434,13 @@ DEVICE void gemm_cutlass(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
     ARK_TARGET_CUDA_ARCH == 80)
     gemm_cuda<CutDataTypeA, CutDataTypeB, CutDataTypeC, LeadingDimA,
              LeadingDimB, LeadingDimC, ProblemSizeM, ProblemSizeN,
-              ProblemSizeK, TileSizeM, TileSizeN, TileSizeK, UnitOp>(
-        pC, pA, pB, uop_idx, smem_per_warp);
+              ProblemSizeK, TileSizeM, TileSizeN, UnitOp>(pC, pA, pB, uop_idx,
+                                                          smem_per_warp);
 #elif (ARK_TARGET_CUDA_ARCH == 90)
     gemm_cuda_90<CutDataTypeA, CutDataTypeB, CutDataTypeC, LeadingDimA,
                  LeadingDimB, LeadingDimC, ProblemSizeM, ProblemSizeN,
-                 ProblemSizeK, TileSizeM, TileSizeN, TileSizeK,
-                 UnitOp>(pC, pA, pB, uop_idx, smem_per_warp);
+                 ProblemSizeK, TileSizeM, TileSizeN,
+                 UnitOp>(pC, pA, pB, uop_idx, smem_per_warp);
 #else
     static_assert(false, "Unsupported CUDA arch.");
 #endif
diff --git a/ark/include/kernels/matmul.h b/ark/include/kernels/matmul.h
index 3b97a3907..fd6c33d0f 100644
--- a/ark/include/kernels/matmul.h
+++ b/ark/include/kernels/matmul.h
@@ -21,7 +21,7 @@ namespace ark {
 /// @tparam OutDims (ark::Vec) Output tensor leading dimensions.
 /// @tparam NCA (ark::Vec) A 2D vector with N and C dimensions of matrix A.
 /// @tparam NCB (ark::Vec) A 2D vector with N and C dimensions of matrix B.
-/// @tparam TileShape (ark::Vec) The tile shape of matmul computation (m, n, k).
+/// @tparam TileShape (ark::Vec) The output tile shape.
 /// @tparam ProblemSize (ark::Vec) The problem size of matmul computation
 ///         (m, n, k).
 /// @tparam LeadingDims (ark::Vec) The leading dimensions of matrix inputs
@@ -44,7 +44,8 @@ DEVICE void matmul(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
                   "NCA should be two dimensional.");
     static_assert(NCB::D2 == 1 && NCB::D3 == 1,
                   "NCB should be two dimensional.");
-    static_assert(TileShape::D3 == 1, "TileShape should be three dimensional.");
+    static_assert(TileShape::D2 == 1 && TileShape::D3 == 1,
+                  "TileShape should be two dimensional.");
     static_assert(ProblemSize::D3 == 1,
                   "ProblemSize should be three dimensional.");
 
@@ -65,7 +66,6 @@ DEVICE void matmul(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
     constexpr int ProblemSizeK = ProblemSize::D2;
     constexpr int TileSizeM = TileShape::D0;
     constexpr int TileSizeN = TileShape::D1;
-    constexpr int TileSizeK = TileShape::D2;
 
     constexpr DimType SizeA = math::mul::value;
     constexpr DimType SizeB = math::mul::value;
@@ -103,13 +103,13 @@ DEVICE void matmul(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
 #if defined(ARK_TARGET_CUDA_ARCH)
     gemm_cutlass<DataTypeA, DataTypeB, DataTypeC, LeadingDimA, LeadingDimB,
                  LeadingDimC, ProblemSizeM, ProblemSizeN,
-                 ProblemSizeK, TileSizeM, TileSizeN, TileSizeK, UnitOp>(
+                 ProblemSizeK, TileSizeM, TileSizeN, UnitOp>(
         pC, pA, pB, uop_idx, smem_per_warp);
 #elif defined(ARK_TARGET_ROCM_ARCH)
     gemm_ck<DataTypeA, DataTypeB, DataTypeC, LeadingDimA, LeadingDimB,
             LeadingDimC, ProblemSizeM, ProblemSizeN,
-            ProblemSizeK, TileSizeM, TileSizeN, TileSizeK, UnitOp>(
-        pC, pA, pB, uop_idx, smem_per_warp);
+            ProblemSizeK, TileSizeM, TileSizeN, UnitOp>(pC, pA, pB, uop_idx,
+                                                        smem_per_warp);
 #endif
     UnitOp::sync_threads();
 }
diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp
index f7323d6c0..ab261eb20 100644
--- a/ark/model/model_op.hpp
+++ b/ark/model/model_op.hpp
@@ -57,7 +57,7 @@ class ModelOp {
 
     virtual Json default_config(
         [[maybe_unused]] const ArchRef arch = ARCH_ANY) const {
-        return {{"NumTasks", 0}, {"NumWarps", 0}, {"SramBytes", 0}};
+        return {{"NumWarps", 0}, {"SramBytes", 0}};
     }
 
     void set_name(const std::string &name) { name_ = name; }
diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp
index 772da3276..6a878c667 100644
--- a/ark/ops/ops_arithmetic_test.cpp
+++ b/ark/ops/ops_arithmetic_test.cpp
@@ -216,6 +216,21 @@ ark::unittest::State test_add_broadcast() {
     return ark::unittest::SUCCESS;
 }
 
+ark::unittest::State test_add_offset() {
+    {
+        ark::Model m;
+        ark::Tensor t0 = m.tensor({2, 64}, ark::FP16, {4, 128}, {2, 64});
+        ark::Tensor t1 = m.tensor({2, 64}, ark::FP16);
+        ark::Tensor out = m.add(t0, t1);
+
+        auto result = ark::op_test("add_offset", m, {t0, t1}, {out},
+                                   baseline_add);
+        UNITTEST_LOG(result);
+        UNITTEST_EQ(result.max_diff[0], 0.0f);
+    }
+    return ark::unittest::SUCCESS;
+}
+
 ark::unittest::State test_add_invalid() {
     {
         ark::Model m;
@@ -421,6 +436,7 @@ int main() {
     UNITTEST(test_add_bf16);
     UNITTEST(test_add_overwrite);
     UNITTEST(test_add_broadcast);
+    UNITTEST(test_add_offset);
     UNITTEST(test_add_invalid);
     UNITTEST(test_sub_fp32);
     UNITTEST(test_sub_invalid);
diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp
index e5559fc32..2fd02b801 100644
--- a/ark/ops/ops_broadcast.cpp
+++ b/ark/ops/ops_broadcast.cpp
@@ -94,7 +94,7 @@ ModelOpBroadcast2::ModelOpBroadcast2(const std::string &type_name,
 std::string ModelOpBroadcast2::impl_name(const Json &config) const {
     check_fields_config(config, {"NumWarps", "Tile"});
     int num_warps = config["NumWarps"];
-    auto &tile_shape = config["Tile"];
+    Dims unit_out_dims(config.at("Tile").get<std::vector<DimType>>());
 
     return function_name_string(
         pascal_to_snake(type()->type_name()),
@@ -104,8 +104,8 @@ std::string ModelOpBroadcast2::impl_name(const Json &config) const {
          vec_string(read_tensors_[1]->shape().dims4()),
         vec_string(write_tensors_[0]->strides().dims4()),
         vec_string(write_tensors_[0]->shape().dims4()),
-         vec_string({1, 1, tile_shape[0], tile_shape[1]}),
-         std::to_string(num_warps), std::to_string(0)});
+         vec_string(unit_out_dims.dims4()), std::to_string(num_warps),
+         std::to_string(0)});
 }
 
 std::vector<ModelOpArg> ModelOpBroadcast2::impl_args([
diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp
index 2e2626d4c..2d6b63720 100644
--- a/ark/ops/ops_embedding.cpp
+++ b/ark/ops/ops_embedding.cpp
@@ -21,9 +21,9 @@ ModelOpEmbedding::ModelOpEmbedding(ModelTensorRef input, ModelTensorRef weight,
     if (output) {
         check_match_data_type(weight, output);
     } else {
-        Dims input_shape = input->shape().dims4();
-        Dims output_shape(input_shape[1], input_shape[2], input_shape[3],
-                          weight->shape()[-1]);
+        auto shape_vec = input->shape().vector();
+        shape_vec.push_back(weight->shape()[-1]);
+        Dims output_shape(shape_vec);
         output = std::make_shared<ModelTensor>(
             weight->data_type(), std::make_shared<ModelBuffer>(), output_shape);
     }
diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp
index dca349f44..ef5f516b0 100644
--- a/ark/ops/ops_matmul.cpp
+++ b/ark/ops/ops_matmul.cpp
@@ -98,7 +98,7 @@ ModelOpMatmul::ModelOpMatmul(ModelTensorRef input, ModelTensorRef other,
 }
 
 std::string ModelOpMatmul::impl_name(const Json &config) const {
-    check_fields_config(config, {"NumWarps", "SramBytes", "TileShapeMNK"});
+    check_fields_config(config, {"NumWarps", "SramBytes", "Tile"});
     check_fields_args(args_, {"TransposeInput", "TransposeOther"});
 
     bool trans_input = args_.at("TransposeInput").value<bool>();
@@ -132,14 +132,14 @@ std::string ModelOpMatmul::impl_name(const Json &config) const {
 
     int num_warps = config["NumWarps"];
     int smem_bytes = config["SramBytes"];
-    Dims tile_shape_mnk = config["TileShapeMNK"].get<std::vector<DimType>>();
-    if (tile_shape_mnk.ndims() != 3) {
-        ERR(PlanError, "TileShapeMNK should have 3 elements");
+    Dims tile_shape = config["Tile"].get<std::vector<DimType>>();
+    if (tile_shape.ndims() != 2) {
+        ERR(PlanError, "Tile should have 2 elements");
     }
-    for (int i = 0; i < 3; ++i) {
-        if (padded_problem_size[i] % tile_shape_mnk[i] != 0) {
-            ERR(PlanError, "output padded shape MNK ", padded_problem_size,
-                " should be divisible by tile shape MNK ", tile_shape_mnk);
+    for (int i = 0; i < 2; ++i) {
+        if (padded_output_shape[i - 2] % tile_shape[i] != 0) {
+            ERR(PlanError, "output padded shape ", padded_output_shape,
+                " should be divisible by tile shape ", tile_shape);
         }
     }
 
@@ -161,7 +161,7 @@ std::string ModelOpMatmul::impl_name(const Json &config) const {
          vec_string(output->strides().dims4()),
         vec_string(input_dim_nc),
         vec_string(other_dim_nc),
-         vec_string(tile_shape_mnk),
+         vec_string(tile_shape),
         vec_string(padded_problem_size),
         vec_string(strides_acdb),
         std::to_string(inner_stride_a),
@@ -191,29 +191,17 @@ static const Json get_default_config(const ArchRef arch,
     DimType tm = (mnk[0] > mnk[1]) ? 256 : 128;
     DimType tn = (mnk[0] > mnk[1]) ? 128 : 256;
     if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP32.ref()) {
-        return {{"NumWarps", 8},
-                {"SramBytes", 147456},
-                {"TileShapeMNK", {tm, tn, 32}}};
+        return {{"NumWarps", 8}, {"SramBytes", 147456}, {"Tile", {tm, tn}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP16.ref()) {
-        return {{"NumWarps", 8},
-                {"SramBytes", 147456},
-                {"TileShapeMNK", {tm, tn, 64}}};
+        return {{"NumWarps", 8}, {"SramBytes", 147456}, {"Tile", {tm, tn}}};
     } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == BF16.ref()) {
-        return {{"NumWarps", 8},
-                {"SramBytes", 147456},
-                {"TileShapeMNK", {tm, tn, 64}}};
+        return {{"NumWarps", 8}, {"SramBytes", 147456}, {"Tile", {tm, tn}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP32.ref()) {
-        return {{"NumWarps", 4},
-                {"SramBytes", 24672},
-                {"TileShapeMNK", {tm, tn, 16}}};
+        return {{"NumWarps", 4}, {"SramBytes", 24672}, {"Tile", {tm, tn}}};
    } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP16.ref()) {
-        return {{"NumWarps", 4},
-                {"SramBytes", 24672},
-                {"TileShapeMNK", {tm, tn, 32}}};
+        return {{"NumWarps", 4}, {"SramBytes", 24672}, {"Tile", {tm, tn}}};
     } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) {
-        return {{"NumWarps", 4},
-                {"SramBytes", 24624},
-                {"TileShapeMNK", {tm, tn, 32}}};
+        return {{"NumWarps", 4}, {"SramBytes", 24624}, {"Tile", {tm, tn}}};
     }
     ERR(InternalError, "Unexpected error");
     return {};
@@ -227,18 +215,12 @@ Json ModelOpMatmul::default_config(const ArchRef arch) const {
                  args_.at("TransposeInput").value<bool>(),
                  args_.at("TransposeOther").value<bool>());
     Json config = get_default_config(arch, result->data_type(), mnk);
-    size_t tile_x = config.at("TileShapeMNK")[0];
-    size_t tile_y = config.at("TileShapeMNK")[1];
+    size_t tile_x = config.at("Tile")[0];
+    size_t tile_y = config.at("Tile")[1];
     if (mnk[0] % tile_x != 0 || mnk[1] % tile_y != 0) {
-        ERR(PlanError, "output padded shape MNK ", mnk,
-            " should be divisible by tile shape MNK ",
-            config.at("TileShapeMNK"));
+        ERR(PlanError, "output padded shape ", Dims{mnk[0], mnk[1]},
+            " should be divisible by tile shape ", config.at("Tile"));
     }
-    Dims result_shape = result->shape().dims4();
-    size_t num_tasks = result_shape[0] * result_shape[1];
-    num_tasks *= mnk[0] / tile_x;
-    num_tasks *= mnk[1] / tile_y;
-    config["NumTasks"] = num_tasks;
     return config;
 }
 
diff --git a/ark/ops/ops_reshape.cpp b/ark/ops/ops_reshape.cpp
index aac22b71a..f8f5e942c 100644
--- a/ark/ops/ops_reshape.cpp
+++ b/ark/ops/ops_reshape.cpp
@@ -11,22 +11,28 @@
 
 namespace ark {
 
-// Reshape `input` to `shape`. This interface does not support -1 as a dimension
-// of `shape`, because Dims does not allow -1 as a valid dimension.
+// Reshape `input` to `inferred_shape`. This interface does not support -1 as a
+// dimension of `inferred_shape`.
Op: " + << op->serialize().dump(); + auto not_divided_error = ss.str(); + + auto &result_tensors = op->result_tensors(); + if (result_tensors.empty() || !config.contains("Tile")) { + num_tasks = 0; + } else { + const std::vector tile_vec = config["Tile"]; + auto tile = Dims(tile_vec); + auto &result_shape = result_tensors[0]->padded_shape(); + if (result_shape.ndims() < tile.ndims()) { + ERR(PlanError, not_divided_error); + } + auto tile4 = tile.dims4(); + auto result_shape4 = result_shape.dims4(); + num_tasks = 1; + for (int i = 0; i < tile4.ndims(); i++) { + if (result_shape4[i] % tile4[i] != 0) { + ERR(PlanError, not_divided_error); + } + num_tasks *= result_shape4[i] / tile4[i]; + } + } + } else { + num_tasks = config["NumTasks"]; + } size_t granularity = config.value("Granularity", 1); auto ctx_id = get_context(node, "Id"); diff --git a/ark/include/kernels/gemm_ck.h b/ark/include/kernels/gemm_ck.h index 4054f2d37..478419691 100644 --- a/ark/include/kernels/gemm_ck.h +++ b/ark/include/kernels/gemm_ck.h @@ -376,7 +376,7 @@ template + typename UnitOp> struct CkGemm { static_assert(LeadingDimA >= 0, ""); static_assert(LeadingDimB >= 0, ""); @@ -386,7 +386,6 @@ struct CkGemm { static_assert(ProblemSizeK >= 0, ""); static_assert(TileSizeM >= 0, ""); static_assert(TileSizeN >= 0, ""); - static_assert(TileSizeK >= 0, ""); using AccumulateType = fp32; @@ -514,13 +513,13 @@ template + typename UnitOp> DEVICE void gemm_ck(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, int smem_per_warp) { using CkGemm = CkGemm; + ProblemSizeK, TileSizeM, TileSizeN, UnitOp>; CkGemm gemm; gemm.Run(C, A, B, uop_idx, smem_per_warp); } diff --git a/ark/include/kernels/gemm_cutlass.h b/ark/include/kernels/gemm_cutlass.h index c5e8c7579..ae13e4c5b 100644 --- a/ark/include/kernels/gemm_cutlass.h +++ b/ark/include/kernels/gemm_cutlass.h @@ -200,7 +200,7 @@ template + typename UnitOp> DEVICE void gemm_cuda(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, int smem_per_warp) { #if (ARK_TARGET_CUDA_ARCH == 60) @@ -223,6 +223,7 @@ DEVICE void gemm_cuda(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, cutlass::layout::RowMajor>::type; using LayoutC = cutlass::layout::RowMajor; + static constexpr int TileSizeK = std::is_same_v ? 32 : 64; using GemmKernel = typename ark::GemmConfiguration< UnitOp, cutlass::arch::OpClassTensorOp, ArchTag, DataTypeA, LayoutA, DataTypeB, LayoutB, DataTypeC, LayoutC, @@ -404,7 +405,7 @@ template + typename UnitOp> DEVICE void gemm_cutlass(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, int smem_per_warp) { using CutDataTypeA = typename cutlass::platform::conditional< @@ -433,13 +434,13 @@ DEVICE void gemm_cutlass(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, ARK_TARGET_CUDA_ARCH == 80) gemm_cuda( - pC, pA, pB, uop_idx, smem_per_warp); + ProblemSizeK, TileSizeM, TileSizeN, UnitOp>(pC, pA, pB, uop_idx, + smem_per_warp); #elif (ARK_TARGET_CUDA_ARCH == 90) gemm_cuda_90(pC, pA, pB, uop_idx, smem_per_warp); + UnitOp>(pC, pA, pB, uop_idx, smem_per_warp); #else static_assert(false, "Unsupported CUDA arch."); #endif diff --git a/ark/include/kernels/matmul.h b/ark/include/kernels/matmul.h index 3b97a3907..fd6c33d0f 100644 --- a/ark/include/kernels/matmul.h +++ b/ark/include/kernels/matmul.h @@ -21,7 +21,7 @@ namespace ark { /// @tparam OutDims (ark::Vec) Output tensor leading dimensions. /// @tparam NCA (ark::Vec) A 2D vector with N and C dimensions of matrix A. /// @tparam NCB (ark::Vec) A 2D vector with N and C dimensions of matrix B. 
-/// @tparam TileShape (ark::Vec) The tile shape of matmul computation (m, n, k). +/// @tparam TileShape (ark::Vec) The output tile shape. /// @tparam ProblemSize (ark::Vec) The problem size of matmul computation /// (m, n, k). /// @tparam LeadingDims (ark::Vec) The leading dimensions of matrix inputs @@ -44,7 +44,8 @@ DEVICE void matmul(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, "NCA should be two dimensional."); static_assert(NCB::D2 == 1 && NCB::D3 == 1, "NCB should be two dimensional."); - static_assert(TileShape::D3 == 1, "TileShape should be three dimensional."); + static_assert(TileShape::D2 == 1 && TileShape::D3 == 1, + "TileShape should be two dimensional."); static_assert(ProblemSize::D3 == 1, "ProblemSize should be three dimensional."); @@ -65,7 +66,6 @@ DEVICE void matmul(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, constexpr int ProblemSizeK = ProblemSize::D2; constexpr int TileSizeM = TileShape::D0; constexpr int TileSizeN = TileShape::D1; - constexpr int TileSizeK = TileShape::D2; constexpr DimType SizeA = math::mul::value; constexpr DimType SizeB = math::mul::value; @@ -103,13 +103,13 @@ DEVICE void matmul(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx, #if defined(ARK_TARGET_CUDA_ARCH) gemm_cutlass( + ProblemSizeK, TileSizeM, TileSizeN, UnitOp>( pC, pA, pB, uop_idx, smem_per_warp); #elif defined(ARK_TARGET_ROCM_ARCH) gemm_ck( - pC, pA, pB, uop_idx, smem_per_warp); + ProblemSizeK, TileSizeM, TileSizeN, UnitOp>(pC, pA, pB, uop_idx, + smem_per_warp); #endif UnitOp::sync_threads(); } diff --git a/ark/model/model_op.hpp b/ark/model/model_op.hpp index f7323d6c0..ab261eb20 100644 --- a/ark/model/model_op.hpp +++ b/ark/model/model_op.hpp @@ -57,7 +57,7 @@ class ModelOp { virtual Json default_config( [[maybe_unused]] const ArchRef arch = ARCH_ANY) const { - return {{"NumTasks", 0}, {"NumWarps", 0}, {"SramBytes", 0}}; + return {{"NumWarps", 0}, {"SramBytes", 0}}; } void set_name(const std::string &name) { name_ = name; } diff --git a/ark/ops/ops_arithmetic_test.cpp b/ark/ops/ops_arithmetic_test.cpp index 772da3276..6a878c667 100644 --- a/ark/ops/ops_arithmetic_test.cpp +++ b/ark/ops/ops_arithmetic_test.cpp @@ -216,6 +216,21 @@ ark::unittest::State test_add_broadcast() { return ark::unittest::SUCCESS; } +ark::unittest::State test_add_offset() { + { + ark::Model m; + ark::Tensor t0 = m.tensor({2, 64}, ark::FP16, {4, 128}, {2, 64}); + ark::Tensor t1 = m.tensor({2, 64}, ark::FP16); + ark::Tensor out = m.add(t0, t1); + + auto result = ark::op_test("add_offset", m, {t0, t1}, {out}, + baseline_add); + UNITTEST_LOG(result); + UNITTEST_EQ(result.max_diff[0], 0.0f); + } + return ark::unittest::SUCCESS; +} + ark::unittest::State test_add_invalid() { { ark::Model m; @@ -421,6 +436,7 @@ int main() { UNITTEST(test_add_bf16); UNITTEST(test_add_overwrite); UNITTEST(test_add_broadcast); + UNITTEST(test_add_offset); UNITTEST(test_add_invalid); UNITTEST(test_sub_fp32); UNITTEST(test_sub_invalid); diff --git a/ark/ops/ops_broadcast.cpp b/ark/ops/ops_broadcast.cpp index e5559fc32..2fd02b801 100644 --- a/ark/ops/ops_broadcast.cpp +++ b/ark/ops/ops_broadcast.cpp @@ -94,7 +94,7 @@ ModelOpBroadcast2::ModelOpBroadcast2(const std::string &type_name, std::string ModelOpBroadcast2::impl_name(const Json &config) const { check_fields_config(config, {"NumWarps", "Tile"}); int num_warps = config["NumWarps"]; - auto &tile_shape = config["Tile"]; + Dims unit_out_dims(config.at("Tile").get>()); return function_name_string( pascal_to_snake(type()->type_name()), @@ -104,8 +104,8 @@ 
std::string ModelOpBroadcast2::impl_name(const Json &config) const { vec_string(read_tensors_[1]->shape().dims4()), vec_string(write_tensors_[0]->strides().dims4()), vec_string(write_tensors_[0]->shape().dims4()), - vec_string({1, 1, tile_shape[0], tile_shape[1]}), - std::to_string(num_warps), std::to_string(0)}); + vec_string(unit_out_dims.dims4()), std::to_string(num_warps), + std::to_string(0)}); } std::vector ModelOpBroadcast2::impl_args([ diff --git a/ark/ops/ops_embedding.cpp b/ark/ops/ops_embedding.cpp index 2e2626d4c..2d6b63720 100644 --- a/ark/ops/ops_embedding.cpp +++ b/ark/ops/ops_embedding.cpp @@ -21,9 +21,9 @@ ModelOpEmbedding::ModelOpEmbedding(ModelTensorRef input, ModelTensorRef weight, if (output) { check_match_data_type(weight, output); } else { - Dims input_shape = input->shape().dims4(); - Dims output_shape(input_shape[1], input_shape[2], input_shape[3], - weight->shape()[-1]); + auto shape_vec = input->shape().vector(); + shape_vec.push_back(weight->shape()[-1]); + Dims output_shape(shape_vec); output = std::make_shared( weight->data_type(), std::make_shared(), output_shape); } diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp index dca349f44..ef5f516b0 100644 --- a/ark/ops/ops_matmul.cpp +++ b/ark/ops/ops_matmul.cpp @@ -98,7 +98,7 @@ ModelOpMatmul::ModelOpMatmul(ModelTensorRef input, ModelTensorRef other, } std::string ModelOpMatmul::impl_name(const Json &config) const { - check_fields_config(config, {"NumWarps", "SramBytes", "TileShapeMNK"}); + check_fields_config(config, {"NumWarps", "SramBytes", "Tile"}); check_fields_args(args_, {"TransposeInput", "TransposeOther"}); bool trans_input = args_.at("TransposeInput").value(); @@ -132,14 +132,14 @@ std::string ModelOpMatmul::impl_name(const Json &config) const { int num_warps = config["NumWarps"]; int smem_bytes = config["SramBytes"]; - Dims tile_shape_mnk = config["TileShapeMNK"].get>(); - if (tile_shape_mnk.ndims() != 3) { - ERR(PlanError, "TileShapeMNK should have 3 elements"); + Dims tile_shape = config["Tile"].get>(); + if (tile_shape.ndims() != 2) { + ERR(PlanError, "Tile should have 2 elements"); } - for (int i = 0; i < 3; ++i) { - if (padded_problem_size[i] % tile_shape_mnk[i] != 0) { - ERR(PlanError, "output padded shape MNK ", padded_problem_size, - " should be divisible by tile shape MNK ", tile_shape_mnk); + for (int i = 0; i < 2; ++i) { + if (padded_output_shape[i - 2] % tile_shape[i] != 0) { + ERR(PlanError, "output padded shape ", padded_output_shape, + " should be divisible by tile shape ", tile_shape); } } @@ -161,7 +161,7 @@ std::string ModelOpMatmul::impl_name(const Json &config) const { vec_string(output->strides().dims4()), vec_string(input_dim_nc), vec_string(other_dim_nc), - vec_string(tile_shape_mnk), + vec_string(tile_shape), vec_string(padded_problem_size), vec_string(strides_acdb), std::to_string(inner_stride_a), @@ -191,29 +191,17 @@ static const Json get_default_config(const ArchRef arch, DimType tm = (mnk[0] > mnk[1]) ? 256 : 128; DimType tn = (mnk[0] > mnk[1]) ? 
128 : 256; if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP32.ref()) { - return {{"NumWarps", 8}, - {"SramBytes", 147456}, - {"TileShapeMNK", {tm, tn, 32}}}; + return {{"NumWarps", 8}, {"SramBytes", 147456}, {"Tile", {tm, tn}}}; } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == FP16.ref()) { - return {{"NumWarps", 8}, - {"SramBytes", 147456}, - {"TileShapeMNK", {tm, tn, 64}}}; + return {{"NumWarps", 8}, {"SramBytes", 147456}, {"Tile", {tm, tn}}}; } else if (arch->belongs_to(ARCH_CUDA_80) && data_type == BF16.ref()) { - return {{"NumWarps", 8}, - {"SramBytes", 147456}, - {"TileShapeMNK", {tm, tn, 64}}}; + return {{"NumWarps", 8}, {"SramBytes", 147456}, {"Tile", {tm, tn}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP32.ref()) { - return {{"NumWarps", 4}, - {"SramBytes", 24672}, - {"TileShapeMNK", {tm, tn, 16}}}; + return {{"NumWarps", 4}, {"SramBytes", 24672}, {"Tile", {tm, tn}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == FP16.ref()) { - return {{"NumWarps", 4}, - {"SramBytes", 24672}, - {"TileShapeMNK", {tm, tn, 32}}}; + return {{"NumWarps", 4}, {"SramBytes", 24672}, {"Tile", {tm, tn}}}; } else if (arch->belongs_to(ARCH_ROCM_942) && data_type == BF16.ref()) { - return {{"NumWarps", 4}, - {"SramBytes", 24624}, - {"TileShapeMNK", {tm, tn, 32}}}; + return {{"NumWarps", 4}, {"SramBytes", 24624}, {"Tile", {tm, tn}}}; } ERR(InternalError, "Unexpected error"); return {}; @@ -227,18 +215,12 @@ Json ModelOpMatmul::default_config(const ArchRef arch) const { args_.at("TransposeInput").value(), args_.at("TransposeOther").value()); Json config = get_default_config(arch, result->data_type(), mnk); - size_t tile_x = config.at("TileShapeMNK")[0]; - size_t tile_y = config.at("TileShapeMNK")[1]; + size_t tile_x = config.at("Tile")[0]; + size_t tile_y = config.at("Tile")[1]; if (mnk[0] % tile_x != 0 || mnk[1] % tile_y != 0) { - ERR(PlanError, "output padded shape MNK ", mnk, - " should be divisible by tile shape MNK ", - config.at("TileShapeMNK")); + ERR(PlanError, "output padded shape ", Dims{mnk[0], mnk[1]}, + " should be divisible by tile shape ", config.at("Tile")); } - Dims result_shape = result->shape().dims4(); - size_t num_tasks = result_shape[0] * result_shape[1]; - num_tasks *= mnk[0] / tile_x; - num_tasks *= mnk[1] / tile_y; - config["NumTasks"] = num_tasks; return config; } diff --git a/ark/ops/ops_reshape.cpp b/ark/ops/ops_reshape.cpp index aac22b71a..f8f5e942c 100644 --- a/ark/ops/ops_reshape.cpp +++ b/ark/ops/ops_reshape.cpp @@ -11,22 +11,28 @@ namespace ark { -// Reshape `input` to `shape`. This interface does not support -1 as a dimension -// of `shape`, because Dims does not allow -1 as a valid dimension. +// Reshape `input` to `inferred_shape`. This interface does not support -1 as a +// dimension of `inferred_shape`. 
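[Editor's note] The element-count rule that `reshape_helper` enforces below can be sketched as follows. This is a simplified model assuming ONNX-style `allowzero` semantics (a `0` in the new shape copies the corresponding input dimension unless `allowzero` is set); `infer_new_shape` is a hypothetical name, not an ARK API, and `-1` is deliberately unsupported, as the comment above notes.

```python
from math import prod

def infer_new_shape(orig_shape, new_shape, allowzero=False):
    # An empty target shape means a scalar: the input must hold one element.
    if not new_shape:
        if prod(orig_shape) != 1:
            raise ValueError(
                f"number of elements mismatch: reshape from "
                f"{orig_shape} to {new_shape}")
        return [1]
    out = []
    for i, d in enumerate(new_shape):
        if d == 0 and not allowzero:
            out.append(orig_shape[i])  # copy the corresponding input dim
        else:
            out.append(d)
    if prod(orig_shape) != prod(out):
        raise ValueError(
            f"number of elements mismatch: reshape from "
            f"{orig_shape} to {new_shape}")
    return out

assert infer_new_shape([1024, 1, 128], [1024, 128]) == [1024, 128]
```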
static void reshape_helper(ModelTensorRef input, const Dims &inferred_shape, bool allowzero, Dims &new_shape, Dims &new_strides, Dims &new_offs) { const auto &orig_shape = input->shape(); const auto &orig_strides = input->strides(); const auto &orig_offsets = input->offsets(); + + std::stringstream ss; + ss << "reshape failed as the number of elements mismatch: reshape from " + << orig_shape << " to " << inferred_shape + << " (allowzero = " << allowzero << ")"; + auto nelems_mismatch_error = ss.str(); + // Calculate the new shape std::vector new_shape_vec; if (inferred_shape.ndims() == 0) { // Convert to a scalar new_shape_vec.emplace_back(1); if (orig_shape.nelems() != 1) { - ERR(ModelError, "number of elements mismatch: reshape from ", - orig_shape, " to ", inferred_shape); + ERR(ModelError, nelems_mismatch_error); } } else { DimType total_size = 1; @@ -46,13 +52,12 @@ static void reshape_helper(ModelTensorRef input, const Dims &inferred_shape, } } if (orig_shape.nelems() != total_size) { - ERR(ModelError, "number of elements mismatch: reshape from ", - orig_shape, " to ", inferred_shape); + ERR(ModelError, nelems_mismatch_error); } } new_shape = new_shape_vec; - std::stringstream ss; + ss = std::stringstream(); ss << "reshape failed as the strides of the input tensor is incompatible " "with the new shape. A workaround is copying the input tensor to a " "new tensor, so that the data becomes sequential in memory. "; @@ -104,12 +109,26 @@ static void reshape_helper(ModelTensorRef input, const Dims &inferred_shape, } else { if (orig_strides[orig_idx] != orig_shape[orig_idx] || orig_offsets[orig_idx] != 0) { - ERR(ModelError, incompatible_strides_error); - } - orig_idx--; - if (orig_idx >= 0) { - orig_shape_stack *= orig_shape[orig_idx]; - orig_strides_stack *= orig_strides[orig_idx]; + if (orig_shape[orig_idx] != 1 || reverse_strides.empty()) { + ERR(ModelError, incompatible_strides_error); + } + *reverse_strides.rbegin() *= orig_strides[orig_idx]; + DimType new_off = orig_offsets[orig_idx]; + for (auto i = orig_idx + 1; i < orig_strides.ndims(); i++) { + new_off *= orig_strides[i]; + } + *reverse_offsets.rbegin() = new_off; + orig_idx--; + if (orig_idx >= 0) { + orig_shape_stack = orig_shape[orig_idx]; + orig_strides_stack = orig_strides[orig_idx]; + } + } else { + orig_idx--; + if (orig_idx >= 0) { + orig_shape_stack *= orig_shape[orig_idx]; + orig_strides_stack *= orig_strides[orig_idx]; + } } } } diff --git a/ark/ops/ops_reshape_test.cpp b/ark/ops/ops_reshape_test.cpp index 7bb8aa4be..550476199 100644 --- a/ark/ops/ops_reshape_test.cpp +++ b/ark/ops/ops_reshape_test.cpp @@ -207,6 +207,38 @@ ark::unittest::State test_reshape_padded() { test_reshape_checker(model, tns0, tns1, "test_reshape_padded"); } + { + ark::Model model; + ark::Tensor tns0 = + model.tensor({1024, 1, 128}, ark::FP32, {1024, 64, 128}, {0, 8, 0}); + ark::Tensor tns1 = model.reshape(tns0, {1024, 128}); + + UNITTEST_EQ(tns1.shape(), ark::Dims(1024, 128)); + UNITTEST_EQ(tns1.strides(), ark::Dims(1024, 8192)); + UNITTEST_EQ(tns1.offsets(), ark::Dims(0, 1024)); + + // For preventing optimize-out + model.noop(tns0); + model.noop(tns1); + + test_reshape_checker(model, tns0, tns1, "test_reshape_padded"); + } + { + ark::Model model; + ark::Tensor tns0 = + model.tensor({1024, 2, 128}, ark::FP32, {1024, 64, 128}, {0, 8, 0}); + ark::Tensor tns1 = model.reshape(tns0, {1024, 256}); + + UNITTEST_EQ(tns1.shape(), ark::Dims(1024, 256)); + UNITTEST_EQ(tns1.strides(), ark::Dims(1024, 8192)); + UNITTEST_EQ(tns1.offsets(), ark::Dims(0, 
1024)); + + // For preventing optimize-out + model.noop(tns0); + model.noop(tns1); + + test_reshape_checker(model, tns0, tns1, "test_reshape_padded"); + } return ark::unittest::SUCCESS; } @@ -268,6 +300,12 @@ ark::unittest::State test_reshape_invalid() { ark::Tensor tns = model.tensor({64, 256}, ark::FP32, {64, 512}); UNITTEST_THROW(model.reshape(tns, {16384}), ark::ModelError); } + { + ark::Model model; + ark::Tensor tns = + model.tensor({1024, 1}, ark::FP32, {1024, 64}, {0, 8}); + UNITTEST_THROW(model.reshape(tns, {1024}), ark::ModelError); + } return ark::unittest::SUCCESS; } diff --git a/ark/ops/ops_test_common.cpp b/ark/ops/ops_test_common.cpp index 42f7e670e..bfbe79a70 100644 --- a/ark/ops/ops_test_common.cpp +++ b/ark/ops/ops_test_common.cpp @@ -9,6 +9,7 @@ #include "ark/model.hpp" #include "ark/planner.hpp" #include "ark/random.hpp" +#include "cpu_timer.h" #include "env.h" #include "gpu/gpu_logging.hpp" #include "logging.hpp" @@ -194,17 +195,21 @@ OpsTestResult op_test( // use a magic number here. int iter = 1000; exe.launch(); + double start = cpu_timer(); exe.run(iter); - float msec = exe.stop(); + exe.stop(); + double msec = (cpu_timer() - start) * 1000; result.iter = iter; result.msec_per_iter = msec / iter; } else { // Rough measure. int warmup_iter = 3; - float target_msec = 5000; + double target_msec = 5000; exe.launch(); + double start = cpu_timer(); exe.run(warmup_iter); - float warmup_msec = exe.stop(); + exe.stop(); + double warmup_msec = (cpu_timer() - start) * 1000; if (warmup_msec > target_msec) { // Warm-up was long enough. @@ -213,8 +218,10 @@ OpsTestResult op_test( } else { int iter = int(target_msec / warmup_msec) * warmup_iter; exe.launch(); + start = cpu_timer(); exe.run(iter); - float msec = exe.stop(); + exe.stop(); + double msec = (cpu_timer() - start) * 1000; result.iter = iter; result.msec_per_iter = msec / iter; } diff --git a/ark/ops/ops_transpose.cpp b/ark/ops/ops_transpose.cpp index d0f7581cc..b7a67c8c0 100644 --- a/ark/ops/ops_transpose.cpp +++ b/ark/ops/ops_transpose.cpp @@ -85,10 +85,20 @@ std::string ModelOpTranspose::impl_name(const Json &config) const { auto permutation = args_.at("Permutation").value(); auto perm_str = permutation_str(permutation); int num_warps = config["NumWarps"]; - auto &tile_shape = config["Tile"]; - Dims unit_out_dims{tile_shape[0], tile_shape[1]}; - if (tile_shape[0] < 0) unit_out_dims[0] = write_tensors_[0]->strides()[-2]; - if (tile_shape[1] < 0) unit_out_dims[1] = write_tensors_[0]->strides()[-1]; + Dims unit_out_dims{config["Tile"].get>()}; + auto result_tensor_shape = result_tensors_[0]->shape(); + if (unit_out_dims.ndims() > result_tensor_shape.ndims()) { + ERR(ModelError, + "The number of dimensions of Tile should be less than or equal to " + "the number of dimensions of the result tensor. Given Tile: ", + unit_out_dims, ", output tensor shape: ", result_tensor_shape); + } + int ndims = unit_out_dims.ndims(); + for (int i = 0; i < ndims; ++i) { + if (unit_out_dims[i] < 0) { + unit_out_dims[i] = result_tensor_shape[i - ndims]; + } + } return function_name_string( "transpose" + perm_str, diff --git a/ark/ops/ops_transpose_test.cpp b/ark/ops/ops_transpose_test.cpp index 999d2c6e9..139e1ee66 100644 --- a/ark/ops/ops_transpose_test.cpp +++ b/ark/ops/ops_transpose_test.cpp @@ -2,9 +2,13 @@ // Licensed under the MIT license. 
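[Editor's note] The timing changes in `ops_test_common.cpp` above replace the executor-reported elapsed time with wall-clock measurement around `exe.run()`: a short warm-up run is timed first, and the iteration count is then scaled so the measured run fills a target duration. A minimal sketch of that pattern in Python, with illustrative names (`run` stands in for launching and running the kernel):

```python
import time

def measure_msec_per_iter(run, warmup_iter=3, target_msec=5000.0):
    # Time the warm-up run with a wall clock.
    start = time.perf_counter()
    run(warmup_iter)
    warmup_msec = (time.perf_counter() - start) * 1000
    if warmup_msec > target_msec:
        # The warm-up alone ran long enough to be a stable measurement.
        return warmup_iter, warmup_msec / warmup_iter
    # Scale the iteration count so the timed run lasts about target_msec.
    iters = int(target_msec / warmup_msec) * warmup_iter
    start = time.perf_counter()
    run(iters)
    msec = (time.perf_counter() - start) * 1000
    return iters, msec / iters
```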
#include "ark/model.hpp" +#include "ark/planner.hpp" +#include "model/model_json.hpp" #include "ops_test_common.hpp" #include "unittest/unittest_utils.h" +#define SYNC_TEST 0 + template void baseline_transpose_0132(std::vector &outputs, const std::vector &output_shapes, @@ -53,6 +57,41 @@ void baseline_transpose_0231(std::vector &outputs, } }; +template +void baseline_transpose_0213(std::vector &outputs, + const std::vector &output_shapes, + const std::vector &inputs, + const std::vector &input_shapes, int) { + T *out = static_cast(outputs[0]); + T *in = static_cast(inputs[0]); + ark::Dims osh = output_shapes[0].dims4(); + ark::Dims ish = input_shapes[0].dims4(); + for (ark::DimType n = 0; n < ish[0]; ++n) { + for (ark::DimType c = 0; c < ish[1]; ++c) { + for (ark::DimType h = 0; h < ish[2]; ++h) { + for (ark::DimType w = 0; w < ish[3]; ++w) { + // out[n][h][c][w] = in[n][c][h][w] + out[w + c * osh[3] + h * osh[2] * osh[3] + + n * osh[1] * osh[2] * osh[3]] = + in[w + h * ish[3] + c * ish[3] * ish[2] + + n * ish[3] * ish[2] * ish[1]]; + } + } + } + } +}; + +template +void baseline_transpose_sync_test(std::vector &outputs, + const std::vector &, + const std::vector &inputs, + const std::vector &input_shapes, + int) { + T *out = static_cast(outputs[0]); + T *in = static_cast(inputs[0]); + ::memcpy(out, in, sizeof(T) * input_shapes[0].nelems()); +}; + ark::unittest::State test_transpose_0132_fp32() { ark::Model m; ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::FP32); @@ -125,6 +164,80 @@ ark::unittest::State test_transpose_0231_bf16() { return ark::unittest::SUCCESS; } +ark::unittest::State test_transpose_0213_fp32() { + ark::Model m; + ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::FP32); + ark::Tensor out = m.transpose(t, {0, 2, 1, 3}); + + auto result = ark::op_test("transpose_0213_fp32", m, {t}, {out}, + baseline_transpose_0213); + UNITTEST_LOG(result); + UNITTEST_EQ(result.max_diff[0], 0.0f); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_transpose_0213_fp16() { + ark::Model m; + ark::PlannerContext ctx(m); + ctx.warp_range(0, 4); + ctx.sram_range(0, 0); + ctx.sync(false); + ctx.config(ark::Json({{"NumWarps", 4}, {"SramBytes", 0}, {"Tile", {8, 64}}}) + .dump()); + + ark::Tensor t = m.tensor({5, 256, 32, 128}, ark::FP16); + ark::Tensor out = m.transpose(t, {0, 2, 1, 3}); + + auto result = ark::op_test("transpose_0213_fp16", m, {t}, {out}, + baseline_transpose_0213); + UNITTEST_LOG(result); + UNITTEST_EQ(result.max_diff[0], 0.0f); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_transpose_0213_bf16() { + ark::Model m; + ark::Tensor t = m.tensor({5, 3, 32, 128}, ark::BF16); + ark::Tensor out = m.transpose(t, {0, 2, 1, 3}); + + auto result = ark::op_test("transpose_0213_bf16", m, {t}, {out}, + baseline_transpose_0213); + UNITTEST_LOG(result); + UNITTEST_EQ(result.max_diff[0], 0.0f); + return ark::unittest::SUCCESS; +} + +ark::unittest::State test_transpose_sync_test() { + ark::Model m; + ark::PlannerContext shared_ctx(m); + shared_ctx.warp_range(0, 4); + shared_ctx.sram_range(0, 0); + shared_ctx.sync(false); + + ark::Tensor in, t, out; + in = m.tensor({1, 16, 2, 64}, ark::FP16); + { + ark::PlannerContext ctx(m); + ctx.config( + ark::Json({{"NumWarps", 4}, {"SramBytes", 0}, {"Tile", {8, 64}}}) + .dump()); + t = m.transpose(in, {0, 2, 1, 3}); + } + { + ark::PlannerContext ctx(m); + ctx.config( + ark::Json({{"NumWarps", 4}, {"SramBytes", 0}, {"Tile", {8, 1, 64}}}) + .dump()); + out = m.transpose(t, {0, 2, 1, 3}); + } + + auto result = 
ark::op_test("transpose_sync_test", m, {in}, {out},
+                                baseline_transpose_sync_test);
+    UNITTEST_LOG(result);
+    UNITTEST_EQ(result.max_diff[0], 0.0f);
+    return ark::unittest::SUCCESS;
+}
+
 ark::unittest::State test_transpose_invalid() {
     {
         ark::Model m;
@@ -157,6 +270,12 @@ int main() {
     UNITTEST(test_transpose_0231_fp32);
     UNITTEST(test_transpose_0231_fp16);
     UNITTEST(test_transpose_0231_bf16);
+    UNITTEST(test_transpose_0213_fp32);
+    UNITTEST(test_transpose_0213_fp16);
+    UNITTEST(test_transpose_0213_bf16);
+#if (SYNC_TEST)
+    UNITTEST(test_transpose_sync_test);
+#endif
     UNITTEST(test_transpose_invalid);
     return ark::unittest::SUCCESS;
 }
diff --git a/docs/plan_file.md b/docs/plan_file.md
index c06ccc35d..2f93b51a0 100644
--- a/docs/plan_file.md
+++ b/docs/plan_file.md
@@ -75,47 +75,36 @@ Structure of an `Op` object in a plan file is the same as [the one in the model
 
 ### Config Details
 
-The followings explain a few fields that many configs commonly consist of.
+The following explains a few fields that appear in many configs.
 
-- `NumWarps`: number of concurrent warps needed to calculate a single output tile.
-- `SramBytes`: bytes of SRAM needed to calculate a single output tile.
-- `NumTasks`: total number of output tiles need to compute.
+- `Tile` (Optional): up-to-4-dimensional shape of a single tile. A tile refers to the elements that each task computes for the first result tensor. The shape of the first result tensor should be divisible by the tile shape. `Tile` may not be needed depending on the operator type.
+- `NumWarps`: number of concurrent warps needed to calculate a single tile.
+- `SramBytes`: bytes of SRAM needed to calculate a single tile.
+- `NumTasks` (Optional): total number of tiles to compute. If `NumTasks` is not provided, it is calculated as the number of elements in the first result tensor divided by the number of elements in a single `Tile`. If neither `NumTasks` nor `Tile` is provided, no computation is conducted (regarded as `NumTasks == 0`).
 
 The following describes the `Config` structure of different operator types.
 
-- `Matmul`
-  - `NumWarps`
-  - `SramBytes`
-  - `NumTasks`
-  - `TileShapeMNK`: tile shape of matrix multiplication in the [M,N,K] format.
-  - `TilePadMNK`: this field is not well defined and will be updated in the future. Currently, it should be the same as `TileShapeMNK`.
-
 - `ReduceSum`, `ReduceMax`, `ReduceMean`
+  - `Tile` (Optional)
   - `NumWarps`
   - `SramBytes`
-  - `NumTasks`
+  - `NumTasks` (Optional)
   - `ImplType`: type of reduction implementation, either `WarpWise` or `ElementWise`.
 - `Send`, `SendDone`, `Recv`
-  - `NumWarps`: should be always 1.
+  - `NumWarps`
   - `SramBytes`: should always be 0.
   - `NumTasks`: should always be 1.
-- `Embedding`
-  - `NumWarps`
-  - `SramBytes`
-  - `NumTasks`
-
 - `Noop`
   - `NumWarps`: should always be 1.
   - `SramBytes`: should always be 0.
-  - `NumTasks`: should be always 0.
-
 - `Default`: all other operators that are not listed above follow this structure.
+  - `Tile` (Optional)
   - `NumWarps`
   - `SramBytes`
-  - `NumTasks`
-  - `Tile`: 2-dimensional shape of a single output tile.
+  - `NumTasks` (Optional)
 
 ## ProcessorGroup
 
@@ -134,6 +123,6 @@ A `ResourceGroup` object describes computing tasks that use the entire or a subs
 
 ## TaskGroup
 
-A `TaskGroup` object describes computing tasks. Each task can be typically considered as computing a single output tile of an operator. The `TaskId` field declares the type of task, of which details are found from `TaskInfos`.
The `TaskRange` field declares tasks to run, which should be within the range `[0, NumTasks)` where `NumTasks` is found from `Config` of operators in the `TaskInfo`. If there are multiple operators in a `TaskInfo`, all operators should have the same `NumTasks`.
+A `TaskGroup` object describes computing tasks. Each task can typically be considered as computing a single result tile of an operator. The `TaskId` field declares the type of task, whose details are found in `TaskInfos`. The `TaskRange` field declares the tasks to run, which should be within the range `[0, NumTasks)` where `NumTasks` is found in the `Config` of the operators in the `TaskInfo`. If there are multiple operators in a `TaskInfo`, all operators should have the same `NumTasks`.
 
 Tasks in the `TaskRange` are distributed across processors in the resource group. If `Granularity` is 1, the distribution is round-robin. Otherwise, the distribution assigns `Granularity` consecutive tasks to each processor (as long as there are enough tasks), and then assigns the following tasks to the next processor. `Granularity` should always be a positive integer.
diff --git a/examples/llama/model.py b/examples/llama/model.py
index 925615bf3..a023bb688 100644
--- a/examples/llama/model.py
+++ b/examples/llama/model.py
@@ -9,7 +9,7 @@
 import math
 from dataclasses import dataclass
 from typing import Optional
-import os
+from ark import PlannerContext as Context
 
 
 @dataclass
@@ -90,13 +90,42 @@ def __init__(
         self.weight = ark.parameter([1, 1, dim], ark.fp32)
 
     def forward(self, x):
-        x = ark.cast(x, ark.fp32)
-        x2 = ark.mul(x, x)
-        mean = ark.reduce_mean(x2, axis=-1)
-        rrms = ark.rsqrt(mean)
-        x = ark.mul(x, rrms)
-        x = ark.mul(x, self.weight, x)
-        return ark.cast(x, self.dtype)
+        with Context(
+            warp_range=[0, 8],
+            sync=False,
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "Granularity": 7,
+            },
+        ):
+            with Context(config={"Tile": [1, 4096]}):
+                x = ark.cast(x, ark.fp32)
+                x2 = ark.mul(x, x)
+            with Context(config={"Tile": [1], "ImplType": "WarpWise"}):
+                mean = ark.reduce_mean(x2, axis=-1)
+            with Context(
+                config={
+                    "NumWarps": 1,
+                    "SramBytes": 0,
+                    "Tile": [64, 1],
+                }
+            ):
+                mean = ark.add(mean, self.eps)
+                rrms = ark.rsqrt(mean)
+        with Context(
+            warp_range=[0, 8],
+            sync=False,
+            config={
+                "NumWarps": 1,
+                "SramBytes": 0,
+                "Tile": [1, 4096],
+                "Granularity": 7,
+            },
+        ):
+            x = ark.mul(x, rrms)
+            x = ark.mul(x, self.weight, x)
+            return ark.cast(x, self.dtype)
 
 
 class ColumnParallelLinear(ark.Module):
@@ -210,7 +239,7 @@ def forward(self, x):
         local_result = ark.matmul(
             input_parallel, self.weight, transpose_other=True
         )
-        reduced_result = ark.local_all_reduce(
+        reduced_result = ark.all_reduce(
             local_result, self.local_rank, self.world_size
         )
         return reduced_result
@@ -278,6 +307,22 @@ def forward(self, x):
         return ark.matmul(x, self.weight, transpose_other=True)
 
 
+# def tester(ref_func):
+#     def decorator(func):
+#         def wrapper(*args, **kwargs):
+#             data = []
+#             kdata = {}
+#             for arg in args:
+#                 if isinstance(arg, ark.Tensor):
+#                     rand_data =
+#             ref_outputs = ref_func(*args, **kwargs)
+#             outputs = func(*args, **kwargs)
+#             return outputs
+
+#         return wrapper
+#     return decorator
+
+
 class Silu(ark.Module):
     """
     Silu activation function, silu(x) = x * sigmoid(x)
@@ -324,11 +369,30 @@ def __init__(
         )
 
     def forward(self, x):
-        # self.w2(F.silu(self.w1(x)) * self.w3(x))
-        x1 = self.w1(x)
-        x1 = Silu()(x1)
-        x2 = self.w3(x)
-        x3 = ark.mul(x1, x2)
+        with Context(
+            warp_range=[0, 8],
+            sram_range=[0, 49344],
+            sync=False,
+            config={
+                "NumWarps": 4,
+            },
+        ):
+            with
Context(config={"SramBytes": 24672, "Tile": [256, 128]}): + x1 = self.w1(x) + with Context(config={"SramBytes": 0, "Tile": [256, 128]}): + x1 = Silu()(x1) + with Context( + warp_range=[0, 8], + sram_range=[0, 49344], + sync=False, + config={ + "NumWarps": 4, + }, + ): + with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): + x2 = self.w3(x) + with Context(config={"SramBytes": 0, "Tile": [256, 128]}): + x3 = ark.mul(x1, x2) x4 = self.w2(x3) return x4 @@ -342,6 +406,32 @@ def apply_rotary_emb(xq, xk, freqs_cis): return xq_out, xk_out +class Softmax(ark.Module): + def __init__(self): + super(Softmax, self).__init__() + + def forward(self, input): + with Context( + warp_range=[0, 8], + sram_range=[0, 0], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + }, + ): + with Context(config={"ImplType": "WarpWise"}): + max = ark.reduce_max(input, axis=-1) + with Context(config={"Tile": [1, 2048]}): + output = ark.sub(input, max) + output = ark.exp(output) + with Context(config={"ImplType": "WarpWise"}): + sum = ark.reduce_sum(output, axis=-1) + with Context(config={"Tile": [1, 2048]}): + output = ark.div(output, sum) + return output + + class Attention(ark.Module): def __init__( self, @@ -401,33 +491,160 @@ def forward( mask: Optional[ark.Tensor], ): bsz, seqlen, _ = x.shape() - xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) - # xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) - # xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - # xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - xq = ark.reshape(xq, [bsz, seqlen, self.n_local_heads, self.head_dim]) - xk = ark.reshape( - xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - ) - xv = ark.reshape( - xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - ) - if freqs_cis is not None: - xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + + with Context( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={"NumWarps": 4}, + ): + with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): + xq = self.wq(x) + xq = ark.reshape( + xq, [bsz, seqlen, self.n_local_heads, self.head_dim] + ) + with Context(config={"SramBytes": 0, "Tile": [256, 1, 128]}): + if freqs_cis is not None: + xq = ark.rope(xq, freqs_cis) + + with Context( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={"NumWarps": 4}, + ): + with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): + xk = self.wk(x) + xk = ark.reshape( + xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + with Context(config={"SramBytes": 0, "Tile": [256, 1, 128]}): + if freqs_cis is not None: + xk = ark.rope(xk, freqs_cis) + + with Context( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={"NumWarps": 4}, + ): + with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): + xv = self.wv(x) + xv = ark.reshape( + xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + # values = xv + # with Context( + # config={"SramBytes": 0, "Tile": [256, 1, 128]} + # ): + # values = ark.transpose(values, [0, 2, 1, 3]) + + # with Context( + # warp_range=[0, 8], + # sram_range=[0, 49344], + # sync=False, + # config={ + # "NumWarps": 4, + # "NumTasks": 4096, + # "Granularity": 2, + # }, + # ): + # with Context( + # config={"SramBytes": 24672, "Tile": [256, 128]} + # ): + # scores = ark.matmul(xq, keys, transpose_other=True) + # with Context(config={"SramBytes": 0, "Tile": [256, 128]}): + # scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) + + # if mask is not None: + # 
scores = ark.add(scores, mask) + + # scores = Softmax()(scores) + + # with Context( + # warp_range=[0, 4], + # sram_range=[0, 24672], + # sync=False, + # config={ + # "NumWarps": 4, + # "NumTasks": 256, + # }, + # ): + # with Context( + # config={"SramBytes": 24672, "Tile": [256, 128]} + # ): + # output = ark.matmul(scores, values) + # with Context( + # config={"SramBytes": 0, "Tile": [256, 1, 128]} + # ): + # output = ark.transpose(output, [0, 2, 1, 3]) + # output = ark.reshape( + # output, [bsz, seqlen, self.head_dim * self.n_local_heads] + # ) + # return self.wo(output) + + # with Context( + # warp_range=[0, 4], + # sram_range=[0, 24672], + # sync=False, + # config={ + # "NumWarps": 4, + # }, + # ): + # with Context( + # config={"SramBytes": 24672, "Tile": [256, 128]} + # ): + # xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) + # # xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) + # # xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + # # xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) + # xq = ark.reshape(xq, [bsz, seqlen, self.n_local_heads, self.head_dim]) + # xk = ark.reshape( + # xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + # ) + # xv = ark.reshape( + # xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + # ) + # if freqs_cis is not None: + # xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) # TODO: enable kv cache later keys = xk values = xv # (bs, n_local_heads, seqlen, head_dim) - xq = ark.transpose(xq, [0, 2, 1, 3]) + # xq = ark.transpose(xq, [0, 2, 1, 3]) values = ark.transpose(values, [0, 2, 1, 3]) - # (bs, n_local_heads, head_dim, seqlen) - keys = ark.transpose(keys, [0, 2, 3, 1]) - scores = ark.matmul(xq, keys) - scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) + # (bs, n_local_heads, seqlen, head_dim) + # keys = ark.transpose(keys, [0, 2, 1, 3]) + # scores = ark.matmul(xq, keys) + + xq_shards = ark.sharding(xq, 2, 1) + keys_shards = ark.sharding(keys, 2, 1) + scores = ark.tensor([bsz, self.n_local_heads, seqlen, seqlen], dtype=self.dtype) + scores_shards = ark.sharding(scores, 1, 1) + results = [] + with Context( + warp_range=[0, 8], + sram_range=[0, 49344], + sync=False, + config={ + "NumWarps": 4, + "Granularity": 2, + "SramBytes": 24672, + "Tile": [256, 128], + }, + ): + for i in range(self.n_local_heads): + xq_shard_reshaped = ark.reshape(xq_shards[i], [bsz, 1, seqlen, self.head_dim]) + keys_shard_reshaped = ark.reshape(keys_shards[i], [bsz, 1, seqlen, self.head_dim]) + scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen]) + res = ark.matmul(xq_shard_reshaped, keys_shard_reshaped, scores_shard_reshaped, transpose_other=True) + res = ark.mul(res, 1.0 / math.sqrt(self.head_dim), res) + if mask is not None: + res = ark.add(res, mask, res) + results.append(res) + scores = ark.identity(scores, deps=results) - if mask is not None: - scores = ark.add(scores, mask) # if self.dtype == ark.fp16: # scores = ark.cast(scores, ark.fp32) scores = ark.softmax(scores, output=scores) @@ -480,8 +697,25 @@ def forward( ): attention_norm_x = self.attention_norm(x) h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask) - h = ark.add(x, h) - out = ark.add(h, self.feed_forward(self.ffn_norm(h))) + with Context( + warp_range=[0, 4], + config={ + "NumWarps": 4, + "Tile": [256, 128], + "SramBytes": 0, + }, + ): + h = ark.add(x, h) + ff = self.feed_forward(self.ffn_norm(h)) + with Context( + warp_range=[0, 4], + config={ + "NumWarps": 4, + "Tile": [256, 128], + "SramBytes": 0, + 
}, + ): + out = ark.add(h, ff) return out diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index 737d3ec8b..a0850f3ad 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -59,8 +59,7 @@ def run_ark( output = module(*module_inputs) runtime = ark.Runtime() - # Prefer num_warps_per_sm = 16 for nvidia and 8 for amd - runtime.launch(num_warps_per_sm=8) + runtime.launch() # Load model parameters if state_dict: @@ -70,7 +69,8 @@ def run_ark( tensors = [i for i in module_inputs if isinstance(i, ark.Tensor)] tensor_data = [i for i in inputs if isinstance(i, np.ndarray)] for tensor, ndarray in zip(tensors, tensor_data): - tensor.from_numpy(ndarray) + if tensor.data_ptr() != 0: + tensor.from_numpy(ndarray) start_time = time.time() @@ -447,7 +447,6 @@ def test_transformer_block( ) output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) - ark.Model.get_model().create_nodes() print(ark.Model.get_model().serialize()) # test_module( @@ -536,8 +535,8 @@ def test(args, batch_size, seq_len, dtype, rank, world_size): # test_row_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size) # test_column_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size) # test_attention(args, batch_size, seq_len, dtype, rank, world_size) - test_transformer_block(args, batch_size, seq_len, dtype, rank, world_size) - # test_transformer(args, batch_size, seq_len, dtype, rank, world_size) + # test_transformer_block(args, batch_size, seq_len, dtype, rank, world_size) + test_transformer(args, batch_size, seq_len, dtype, rank, world_size) def worker( @@ -561,16 +560,17 @@ def worker( if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--ckpt_dir", type=str, required=True) - parser.add_argument("--ngpus", type=int, default=1) + parser.add_argument("--ngpus", type=int, default=1, help="Number of GPUs") + parser.add_argument("--ckpt_dir", type=str) ckpt_dir = parser.parse_args().ckpt_dir ngpus = parser.parse_args().ngpus # Configurations args = ModelArgs7B() + args.n_layers = 1 batch_size = 1 - seq_len = 512 + seq_len = 2048 dtype = np.float16 world_size = ngpus @@ -578,7 +578,7 @@ def worker( args.vocab_size = 32000 # Reduce max_seq_len due to OOM from the PyTorch model - args.max_seq_len = 512 + args.max_seq_len = 2048 # Verify the configurations assert batch_size <= args.max_batch_size diff --git a/examples/llama/test.py b/examples/llama/test.py new file mode 100644 index 000000000..951dc0edc --- /dev/null +++ b/examples/llama/test.py @@ -0,0 +1,56 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. + +import sys +sys.path.append("llama") + +from typing import List, Optional + +import fire +import time + +from llama import Llama +import torch + + +def main( + ckpt_dir: str, + tokenizer_path: str, + seq_len: int = 128, + batch_size: int = 256, + gen_len: int = 128, + warmup: int = 3, + iteration: int = 5, +): + total_len = seq_len + gen_len + + generator = Llama.build( + ckpt_dir=ckpt_dir, + tokenizer_path=tokenizer_path, + max_seq_len=total_len, + max_batch_size=batch_size, + ) + + tokens = torch.randint( + low=0, high=generator.tokenizer.n_words - 1, size=(batch_size, total_len), dtype=torch.int32 + ) + + print(f"Profiling... 
(seq_len={seq_len}, batch_size={batch_size}, gen_len={gen_len}, warmup={warmup}, iteration={iteration})") + + def gen(): + _ = generator.model.forward(tokens[:, :seq_len], 0) + for pos in range(1, gen_len): + _ = generator.model.forward(tokens[:, (seq_len + pos - 1):(seq_len + pos)], pos) + + for _ in range(warmup): + gen() + start = time.time() + for _ in range(iteration): + gen() + end = time.time() + print(f"Elapsed: {(end - start)/iteration:.5f} sec/iteration") + + + +if __name__ == "__main__": + fire.Fire(main) diff --git a/python/ark/data_type.py b/python/ark/data_type.py index 21e61e0cb..3deef50f4 100644 --- a/python/ark/data_type.py +++ b/python/ark/data_type.py @@ -61,9 +61,10 @@ def from_numpy(np_type: numpy.dtype) -> "DataType": InvalidUsageError: If there is no defined conversion from numpy data type to ark data type. """ if not isinstance(np_type, numpy.dtype): - raise log.InvalidUsageError( - f"Expected a numpy data type, but got {type(np_type)}" - ) + try: + np_type = numpy.dtype(np_type) + except Exception as e: + raise log.InvalidUsageError(f"Not a numpy data type. {str(e)}") for type_name, reg in REGISTRY_DATA_TYPE.items(): if reg["np"] == np_type: return DataType.from_name(type_name) diff --git a/python/ark/module.py b/python/ark/module.py index 9e06bcacf..55d80b8e8 100644 --- a/python/ark/module.py +++ b/python/ark/module.py @@ -70,6 +70,9 @@ def load_state_dict( all_keys = set(state_dict.keys()) pd = self.params_dict(prefix) for name, param in pd.items(): + if param.data_ptr() == 0: + log.WARN(f"Parameter {name} is not initialized") + continue param.from_numpy(state_dict[name]) all_keys.remove(name) if all_keys: diff --git a/python/ark/ops.py b/python/ark/ops.py index 8ec5b0151..70903125d 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -484,7 +484,9 @@ def parameter( ) -> Parameter: """ """ return Parameter( - _cpp_tensor(shape, dtype, strides, offsets, padded_shape, None, name) + _cpp_tensor( + shape, dtype, strides, offsets, padded_shape, -1, None, name + ) ) diff --git a/python/ark/tensor.py b/python/ark/tensor.py index f876f8918..2ef77ce0e 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -272,7 +272,7 @@ class Parameter(Tensor): def __init__( self, tensor: CoreTensor, - from_torch: bool, + from_torch: bool = False, ): """ Initializes a new instance of the Parameter class. From b8c55d8588dad694f99eed581cd11c311cc1eefd Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 26 Sep 2024 00:59:03 +0000 Subject: [PATCH 087/106] a few fixes --- ark/api/planner.cpp | 58 +++++++++++++++++++++++++++-------------- examples/llama/model.py | 53 ++++++++++++++++--------------------- 2 files changed, 61 insertions(+), 50 deletions(-) diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index e1dce34ac..1c117af9a 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -170,36 +170,56 @@ std::string Planner::Impl::plan(bool pretty) const { check_config_field(op, config, "SramBytes"); size_t num_warps = config["NumWarps"]; size_t sram_bytes = config["SramBytes"]; + size_t max_num_tasks = 0; size_t num_tasks; - if (!config.contains("NumTasks")) { + auto &result_tensors = op->result_tensors(); + if (!result_tensors.empty() && config.contains("Tile")) { std::stringstream ss; ss << "Result shape is not divided by tile. 
Op: " << op->serialize().dump(); auto not_divided_error = ss.str(); - auto &result_tensors = op->result_tensors(); - if (result_tensors.empty() || !config.contains("Tile")) { - num_tasks = 0; - } else { - const std::vector tile_vec = config["Tile"]; - auto tile = Dims(tile_vec); - auto &result_shape = result_tensors[0]->padded_shape(); - if (result_shape.ndims() < tile.ndims()) { + const std::vector tile_vec = config["Tile"]; + auto tile = Dims(tile_vec); + auto &result_shape = result_tensors[0]->padded_shape(); + if (result_shape.ndims() < tile.ndims()) { + ERR(PlanError, not_divided_error); + } + auto tile4 = tile.dims4(); + auto result_shape4 = result_shape.dims4(); + max_num_tasks = 1; + for (int i = 0; i < tile4.ndims(); i++) { + if (tile4[i] == 0 || result_shape4[i] % tile4[i] != 0) { ERR(PlanError, not_divided_error); } - auto tile4 = tile.dims4(); - auto result_shape4 = result_shape.dims4(); - num_tasks = 1; - for (int i = 0; i < tile4.ndims(); i++) { - if (result_shape4[i] % tile4[i] != 0) { - ERR(PlanError, not_divided_error); - } - num_tasks *= result_shape4[i] / tile4[i]; - } + max_num_tasks *= result_shape4[i] / tile4[i]; } - } else { + if (max_num_tasks == 0) ERR(InternalError, "max_num_tasks == 0"); + } + if (config.contains("NumTasks")) { num_tasks = config["NumTasks"]; + if (max_num_tasks > 0 && num_tasks > max_num_tasks) { + ERR(PlanError, "NumTasks (", num_tasks, + ") exceeds the maximum number of tasks calculated from the " + "tile (", + max_num_tasks, "). Op: ", op->serialize().dump()); + } else if (num_tasks < max_num_tasks) { + LOG(WARN, "NumTasks (", num_tasks, + ") is less than the maximum number of tasks calculated " + "from the tile (", + max_num_tasks, "). Op: ", op->serialize().dump()); + } + } else { + num_tasks = max_num_tasks; + } + if (num_tasks == 0 && op->type() != ModelOpT::from_name("Noop")) { + LOG(WARN, + "Detected a non-virtual op that does not perform any " + "computation. If this is unexpected, please check if " + "the config includes either `NumTasks` or `Tile` " + "field. 
Op: ", + op->serialize().dump()); } size_t granularity = config.value("Granularity", 1); diff --git a/examples/llama/model.py b/examples/llama/model.py index a023bb688..7cea17410 100644 --- a/examples/llama/model.py +++ b/examples/llama/model.py @@ -406,32 +406,6 @@ def apply_rotary_emb(xq, xk, freqs_cis): return xq_out, xk_out -class Softmax(ark.Module): - def __init__(self): - super(Softmax, self).__init__() - - def forward(self, input): - with Context( - warp_range=[0, 8], - sram_range=[0, 0], - sync=False, - config={ - "NumWarps": 1, - "SramBytes": 0, - }, - ): - with Context(config={"ImplType": "WarpWise"}): - max = ark.reduce_max(input, axis=-1) - with Context(config={"Tile": [1, 2048]}): - output = ark.sub(input, max) - output = ark.exp(output) - with Context(config={"ImplType": "WarpWise"}): - sum = ark.reduce_sum(output, axis=-1) - with Context(config={"Tile": [1, 2048]}): - output = ark.div(output, sum) - return output - - class Attention(ark.Module): def __init__( self, @@ -645,11 +619,28 @@ def forward( results.append(res) scores = ark.identity(scores, deps=results) - # if self.dtype == ark.fp16: - # scores = ark.cast(scores, ark.fp32) - scores = ark.softmax(scores, output=scores) - # if self.dtype == ark.fp16: - # scores = ark.cast(scores, ark.fp16) + def softmax(scores): + with Context( + warp_range=[0, 8], + sram_range=[0, 0], + sync=False, + config={ + "NumWarps": 1, + "SramBytes": 0, + }, + ): + with Context(config={"ImplType": "WarpWise", "Tile": [1]}): + max = ark.reduce_max(scores, axis=-1) + with Context(config={"Tile": [1, 2048]}): + output = ark.sub(scores, max) + output = ark.exp(output) + with Context(config={"ImplType": "WarpWise", "Tile": [1]}): + sum = ark.reduce_sum(output, axis=-1) + with Context(config={"Tile": [1, 2048]}): + output = ark.div(output, sum) + return output + + scores = softmax(scores) output = ark.matmul( scores, values From b9f35d9da53058d77e54cebd966443f900bb6e4b Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Thu, 26 Sep 2024 01:24:41 +0000 Subject: [PATCH 088/106] all transpose removed --- examples/llama/model.py | 117 ++++++++++------------------------------ 1 file changed, 28 insertions(+), 89 deletions(-) diff --git a/examples/llama/model.py b/examples/llama/model.py index 7cea17410..3d18190b2 100644 --- a/examples/llama/model.py +++ b/examples/llama/model.py @@ -507,95 +507,15 @@ def forward( xv = ark.reshape( xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] ) - # values = xv - # with Context( - # config={"SramBytes": 0, "Tile": [256, 1, 128]} - # ): - # values = ark.transpose(values, [0, 2, 1, 3]) - - # with Context( - # warp_range=[0, 8], - # sram_range=[0, 49344], - # sync=False, - # config={ - # "NumWarps": 4, - # "NumTasks": 4096, - # "Granularity": 2, - # }, - # ): - # with Context( - # config={"SramBytes": 24672, "Tile": [256, 128]} - # ): - # scores = ark.matmul(xq, keys, transpose_other=True) - # with Context(config={"SramBytes": 0, "Tile": [256, 128]}): - # scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim)) - - # if mask is not None: - # scores = ark.add(scores, mask) - - # scores = Softmax()(scores) - - # with Context( - # warp_range=[0, 4], - # sram_range=[0, 24672], - # sync=False, - # config={ - # "NumWarps": 4, - # "NumTasks": 256, - # }, - # ): - # with Context( - # config={"SramBytes": 24672, "Tile": [256, 128]} - # ): - # output = ark.matmul(scores, values) - # with Context( - # config={"SramBytes": 0, "Tile": [256, 1, 128]} - # ): - # output = ark.transpose(output, [0, 2, 1, 3]) - # output = 
ark.reshape( - # output, [bsz, seqlen, self.head_dim * self.n_local_heads] - # ) - # return self.wo(output) - - # with Context( - # warp_range=[0, 4], - # sram_range=[0, 24672], - # sync=False, - # config={ - # "NumWarps": 4, - # }, - # ): - # with Context( - # config={"SramBytes": 24672, "Tile": [256, 128]} - # ): - # xq, xk, xv = self.wq(x), self.wk(x), self.wv(x) - # # xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim) - # # xk = xk.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - # # xv = xv.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) - # xq = ark.reshape(xq, [bsz, seqlen, self.n_local_heads, self.head_dim]) - # xk = ark.reshape( - # xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - # ) - # xv = ark.reshape( - # xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - # ) - # if freqs_cis is not None: - # xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis) + # TODO: enable kv cache later keys = xk values = xv - # (bs, n_local_heads, seqlen, head_dim) - # xq = ark.transpose(xq, [0, 2, 1, 3]) - values = ark.transpose(values, [0, 2, 1, 3]) - - # (bs, n_local_heads, seqlen, head_dim) - # keys = ark.transpose(keys, [0, 2, 1, 3]) - # scores = ark.matmul(xq, keys) - xq_shards = ark.sharding(xq, 2, 1) - keys_shards = ark.sharding(keys, 2, 1) + xq_shards = ark.sharding(xq, axis=2, dim_per_shard=1) + keys_shards = ark.sharding(keys, axis=2, dim_per_shard=1) scores = ark.tensor([bsz, self.n_local_heads, seqlen, seqlen], dtype=self.dtype) - scores_shards = ark.sharding(scores, 1, 1) + scores_shards = ark.sharding(scores, axis=1, dim_per_shard=1) results = [] with Context( warp_range=[0, 8], @@ -608,7 +528,7 @@ def forward( "Tile": [256, 128], }, ): - for i in range(self.n_local_heads): + for i in range(len(scores_shards)): xq_shard_reshaped = ark.reshape(xq_shards[i], [bsz, 1, seqlen, self.head_dim]) keys_shard_reshaped = ark.reshape(keys_shards[i], [bsz, 1, seqlen, self.head_dim]) scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen]) @@ -642,10 +562,29 @@ def softmax(scores): scores = softmax(scores) - output = ark.matmul( - scores, values - ) # (bs, n_local_heads, seqlen, head_dim) - output = ark.transpose(output, [0, 2, 1, 3]) + scores_shards = ark.sharding(scores, axis=1, dim_per_shard=1) + values_shards = ark.sharding(values, axis=2, dim_per_shard=1) + output = ark.tensor([bsz, seqlen, self.n_local_heads, self.head_dim], dtype=self.dtype) + output_shards = ark.sharding(output, axis=2, dim_per_shard=1) + + results = [] + with Context( + warp_range=[0, 4], + sram_range=[0, 24672], + sync=False, + config={ + "NumWarps": 4, + "SramBytes": 24672, + "Tile": [256, 128], + }, + ): + for i in range(len(output_shards)): + values_shard_reshaped = ark.reshape(values_shards[i], [bsz, 1, seqlen, self.head_dim]) + scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen]) + output_shard_reshaped = ark.reshape(output_shards[i], [bsz, 1, seqlen, self.head_dim]) + res = ark.matmul(scores_shard_reshaped, values_shard_reshaped, output_shard_reshaped) + results.append(res) + output = ark.identity(output, deps=results) output = ark.reshape( output, [bsz, seqlen, self.head_dim * self.n_local_heads] ) From dfae17bab9091a3ac9ef2715ada276c1d179381a Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 27 Sep 2024 03:00:05 +0000 Subject: [PATCH 089/106] A few fixes --- ark/api/context.cpp | 4 ++++ ark/api/planner.cpp | 2 +- ark/context_impl.cpp | 4 ++++ ark/context_impl.hpp | 2 ++ ark/include/ark/context.hpp | 7 +++++-- 
ark/model/model_context_manager.cpp | 4 ++++ ark/model/model_context_manager.hpp | 2 ++ ark/model/model_graph_impl.cpp | 11 +++++++++++ ark/model/model_graph_impl.hpp | 2 ++ ark/model/model_json.cpp | 9 ++++++++- examples/llama/model.py | 28 +++------------------------- python/ark/planner.py | 9 +++++++++ python/planner_py.cpp | 4 +++- 13 files changed, 58 insertions(+), 30 deletions(-) diff --git a/ark/api/context.cpp b/ark/api/context.cpp index 76baedc87..702247ddf 100644 --- a/ark/api/context.cpp +++ b/ark/api/context.cpp @@ -29,4 +29,8 @@ void Context::set(const std::string& key, const std::string& value, this->impl_->set(key, value_json, type); } +std::string Context::dump() const { + return this->impl_->dump().dump(); +} + } // namespace ark diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index 1c117af9a..506dcaff0 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -17,7 +17,7 @@ namespace ark { PlannerContext::PlannerContext(Model &model) : Context(model) { - this->impl_->set("Id", this->id(), ContextType::Immutable); + this->impl_->set("Id", this->id(), ContextType::Overwrite); } void PlannerContext::check_range(const std::string &key, diff --git a/ark/context_impl.cpp b/ark/context_impl.cpp index 9a2692ea8..c4f95f2c3 100644 --- a/ark/context_impl.cpp +++ b/ark/context_impl.cpp @@ -52,4 +52,8 @@ bool Context::Impl::has(const std::string& key) const { return context_manager_->has(key); } +Json Context::Impl::dump() const { + return context_manager_->dump(); +} + } // namespace ark diff --git a/ark/context_impl.hpp b/ark/context_impl.hpp index 1a77891b9..73fcae922 100644 --- a/ark/context_impl.hpp +++ b/ark/context_impl.hpp @@ -21,6 +21,8 @@ class Context::Impl { bool has(const std::string& key) const; + Json dump() const; + protected: friend class Context; diff --git a/ark/include/ark/context.hpp b/ark/include/ark/context.hpp index f3eef2836..aaa22bd3a 100644 --- a/ark/include/ark/context.hpp +++ b/ark/include/ark/context.hpp @@ -17,9 +17,9 @@ enum class ContextType { class Context { public: /// - /// Construct an empty context for the given model. + /// Context handler of the given model. /// - /// @param model The model to create the context for. + /// @param model The model to manipulate the context for. /// Context(Model& model); @@ -78,6 +78,9 @@ class Context { void set(const std::string& key, const std::string& value, ContextType type = ContextType::Overwrite); + /// Return the entire context stacks as a JSON format string. 
+ std::string dump() const; + protected: friend class PlannerContext; diff --git a/ark/model/model_context_manager.cpp b/ark/model/model_context_manager.cpp index f1bb62e9d..799cce785 100644 --- a/ark/model/model_context_manager.cpp +++ b/ark/model/model_context_manager.cpp @@ -27,4 +27,8 @@ Json ModelContextManager::get(const std::string& key) const { return context_stack_->get(key); } +Json ModelContextManager::dump() const { + return context_stack_->dump(); +} + } // namespace ark diff --git a/ark/model/model_context_manager.hpp b/ark/model/model_context_manager.hpp index 6aa91692e..4dc246fe8 100644 --- a/ark/model/model_context_manager.hpp +++ b/ark/model/model_context_manager.hpp @@ -24,6 +24,8 @@ class ModelContextManager { Json get(const std::string& key) const; + Json dump() const; + private: std::shared_ptr context_stack_; std::vector keys_; diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp index 7c1ea3fb5..b7717ecd3 100644 --- a/ark/model/model_graph_impl.cpp +++ b/ark/model/model_graph_impl.cpp @@ -62,6 +62,17 @@ std::map ModelGraphContextStack::get_all() const { return cur; } +Json ModelGraphContextStack::dump() const { + Json j; + for (const auto &pair : this->storage_) { + j[pair.first] = Json::array(); + for (const auto &value : pair.second) { + j[pair.first].emplace_back(*value); + } + } + return j; +} + ModelGraph::Impl::Impl(const ModelGraph::Impl &other) { *this = other; } ModelGraph::Impl &ModelGraph::Impl::operator=(const ModelGraph::Impl &other) { diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp index 62944f999..5cd60d032 100644 --- a/ark/model/model_graph_impl.hpp +++ b/ark/model/model_graph_impl.hpp @@ -39,6 +39,8 @@ class ModelGraphContextStack { Json get(const std::string &key) const; std::map get_all() const; + + Json dump() const; }; class ModelGraph::Impl { diff --git a/ark/model/model_json.cpp b/ark/model/model_json.cpp index dad62cb4e..31fb24d51 100644 --- a/ark/model/model_json.cpp +++ b/ark/model/model_json.cpp @@ -302,9 +302,16 @@ static void verify_format_plan(const Json &json) { "NumWarpsPerProcessor", "TaskInfos", "ProcessorGroups"}; + if (!json.is_object()) { + std::string dumped = json.dump(); + if (dumped.size() > 100) { + dumped = dumped.substr(0, 100) + "..."; + } + ERR(PlanError, "Plan should be a JSON object. 
Given: ", dumped); + } for (const auto &field : required_fields) { if (!json.contains(field)) { - ERR(PlanError, field + " not found"); + ERR(PlanError, field, " not found"); } } if (!json.at("TaskInfos").is_array()) { diff --git a/examples/llama/model.py b/examples/llama/model.py index 3d18190b2..f80d68e55 100644 --- a/examples/llama/model.py +++ b/examples/llama/model.py @@ -104,15 +104,9 @@ def forward(self, x): x2 = ark.mul(x, x) with Context(config={"Tile": [1], "ImplType": "WarpWise"}): mean = ark.reduce_mean(x2, axis=-1) - with Context( - config={ - "NumWarps": 1, - "SramBytes": 0, - "Tile": [64, 1], - } - ): - mean = ark.add(mean, self.eps) - rrms = ark.rsqrt(mean) + mean = ark.add(mean, self.eps) + rrms = ark.rsqrt(mean) + with Context( warp_range=[0, 8], sync=False, @@ -307,22 +301,6 @@ def forward(self, x): return ark.matmul(x, self.weight, transpose_other=True) -# def tester(ref_func): -# def decorator(func): -# def wrapper(*args, **kwargs): -# data = [] -# kdata = {} -# for arg in args: -# if isinstance(arg, ark.Tensor): -# rand_data = -# ref_outputs = ref_func(*args, **kwargs) -# outputs = func(*args, **kwargs) -# return outputs - -# return wrapper -# return decorator - - class Silu(ark.Module): """ Silu activation function, silu(x) = x * sigmoid(x) diff --git a/python/ark/planner.py b/python/ark/planner.py index 3c82719be..59de7a61c 100644 --- a/python/ark/planner.py +++ b/python/ark/planner.py @@ -195,6 +195,15 @@ def __init__(self, **kwargs): if config is not None: self.config(json.dumps(config)) + def dump(self) -> str: + """ + Dump the context stack. + + Returns: + str: The context stack in JSON format. + """ + return super().dump() + def __enter__(self) -> "PlannerContext": """ Enter the plan manager. diff --git a/python/planner_py.cpp b/python/planner_py.cpp index f0af0fa35..b43a8fdd8 100644 --- a/python/planner_py.cpp +++ b/python/planner_py.cpp @@ -13,6 +13,7 @@ namespace py = pybind11; void register_planner(py::module &m) { py::class_(m, "CorePlannerContext") .def(py::init()) + .def("id", &ark::PlannerContext::id) .def("processor_range", &ark::PlannerContext::processor_range, py::arg("start"), py::arg("end"), py::arg("step") = 1) .def("warp_range", &ark::PlannerContext::warp_range, py::arg("start"), @@ -20,7 +21,8 @@ void register_planner(py::module &m) { .def("sram_range", &ark::PlannerContext::sram_range, py::arg("start"), py::arg("end"), py::arg("step") = 1) .def("sync", &ark::PlannerContext::sync, py::arg("sync")) - .def("config", &ark::PlannerContext::config, py::arg("config")); + .def("config", &ark::PlannerContext::config, py::arg("config")) + .def("dump", &ark::PlannerContext::dump); py::class_(m, "CorePlanner") .def(py::init()) From 97ab2dfc1ec59d29b13d283b325a79d14cb1ada5 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 27 Sep 2024 04:26:56 +0000 Subject: [PATCH 090/106] multiple resource groups --- ark/api/planner.cpp | 54 +++++++++++++++++-------- ark/model/model_graph_impl.cpp | 14 +------ ark/model/model_graph_impl.hpp | 2 - ark/model/model_node.hpp | 2 +- examples/llama/model.py | 72 ++++++++++++++++++---------------- 5 files changed, 78 insertions(+), 66 deletions(-) diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index 506dcaff0..56e0b5b0c 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -138,17 +138,25 @@ std::string Planner::Impl::plan(bool pretty) const { auto get_context = [&](const ModelNodeRef &node, const std::string &key) -> Json { - if (node->context.find(key) != node->context.end()) { + try { return 
node->context.at(key); + } catch (const Json::out_of_range &e) { } return Json(); }; + auto get_latest_context = [&](const ModelNodeRef &node, + const std::string &key) -> Json { + auto ctx = get_context(node, key); + if (ctx.empty()) return Json(); + return ctx.back(); + }; + for (const auto &node : model_.nodes()) { const auto &op = node->op; if (op->is_virtual()) continue; - auto ctx_config = get_context(node, "Config"); + auto ctx_config = get_latest_context(node, "Config"); Json config; if (!ctx_config.empty()) { @@ -223,8 +231,8 @@ std::string Planner::Impl::plan(bool pretty) const { } size_t granularity = config.value("Granularity", 1); - auto ctx_id = get_context(node, "Id"); - auto ctx_sync = get_context(node, "Sync"); + auto ctx_id = get_latest_context(node, "Id"); + auto ctx_sync = get_latest_context(node, "Sync"); int id = ctx_id.empty() ? -1 : ctx_id.get(); bool sync = ctx_sync.empty() ? true : ctx_sync.get(); if (id == prev_ctx_id && !sync) { @@ -245,24 +253,31 @@ std::string Planner::Impl::plan(bool pretty) const { task_info["Ops"][0]["Config"] = config; task_infos.push_back(task_info); - auto ctx_processor_range = get_context(node, "ProcessorRange"); - auto ctx_warp_range = get_context(node, "WarpRange"); - auto ctx_sram_range = get_context(node, "SramRange"); + auto ctx_processor_range_list = get_context(node, "ProcessorRange"); + auto ctx_warp_range = get_latest_context(node, "WarpRange"); + auto ctx_sram_range = get_latest_context(node, "SramRange"); Json processor_group; - if (!ctx_processor_range.empty()) { + Json resource_group; + bool new_processor_group = true; + if (ctx_processor_range_list.empty()) { + size_t num_processors = std::min(num_sm, num_tasks); + processor_group["ProcessorRange"] = {0, num_processors}; + resource_group["ProcessorRange"] = {0, num_processors}; + max_processor_id = std::max(max_processor_id, num_processors); + } else if (ctx_processor_range_list.size() == 1 || + (id != prev_ctx_id)) { + auto &ctx_processor_range = ctx_processor_range_list[0]; processor_group["ProcessorRange"] = ctx_processor_range; + resource_group["ProcessorRange"] = ctx_processor_range; max_processor_id = std::max( max_processor_id, ctx_processor_range[1].get()); } else { - size_t num_processors = std::min(num_sm, num_tasks); - processor_group["ProcessorRange"] = {0, num_processors}; - max_processor_id = std::max(max_processor_id, num_processors); + new_processor_group = false; + resource_group["ProcessorRange"] = + ctx_processor_range_list.back(); } - Json resource_group; - resource_group["ProcessorRange"] = - processor_group["ProcessorRange"]; if (!ctx_warp_range.empty()) { resource_group["WarpRange"] = ctx_warp_range; max_warp_id = @@ -280,9 +295,14 @@ std::string Planner::Impl::plan(bool pretty) const { {"TaskRange", {0, num_tasks}}, {"Granularity", granularity}}}; - processor_group["ResourceGroups"] = Json::array(); - processor_group["ResourceGroups"].push_back(resource_group); - processor_groups.push_back(processor_group); + if (new_processor_group) { + processor_group["ResourceGroups"] = Json::array(); + processor_group["ResourceGroups"].push_back(resource_group); + processor_groups.push_back(processor_group); + } else { + processor_groups.back()["ResourceGroups"].push_back( + resource_group); + } } prev_ctx_id = id; first_op = false; diff --git a/ark/model/model_graph_impl.cpp b/ark/model/model_graph_impl.cpp index b7717ecd3..7c72a7dd2 100644 --- a/ark/model/model_graph_impl.cpp +++ b/ark/model/model_graph_impl.cpp @@ -52,18 +52,8 @@ Json 
ModelGraphContextStack::get(const std::string &key) const {
     return Json();
 }
 
-std::map<std::string, Json> ModelGraphContextStack::get_all() const {
-    std::map<std::string, Json> cur;
-    for (const auto &pair : this->storage_) {
-        if (!pair.second.empty()) {
-            cur[pair.first] = *pair.second.back();
-        }
-    }
-    return cur;
-}
-
 Json ModelGraphContextStack::dump() const {
-    Json j;
+    Json j = Json::object();
     for (const auto &pair : this->storage_) {
         j[pair.first] = Json::array();
         for (const auto &value : pair.second) {
@@ -227,7 +217,7 @@ ModelNodeRef ModelGraph::Impl::add_op(ModelOpRef op) {
             producer->consumers.push_back(node);
     }
 
-    node->context = context_stack_->get_all();
+    node->context = context_stack_->dump();
 
     nodes_.push_back(node);
     return node;
diff --git a/ark/model/model_graph_impl.hpp b/ark/model/model_graph_impl.hpp
index 5cd60d032..b9646d057 100644
--- a/ark/model/model_graph_impl.hpp
+++ b/ark/model/model_graph_impl.hpp
@@ -38,8 +38,6 @@ class ModelGraphContextStack {
 
     Json get(const std::string &key) const;
 
-    std::map<std::string, Json> get_all() const;
-
     Json dump() const;
 };
diff --git a/ark/model/model_node.hpp b/ark/model/model_node.hpp
index ca97f4540..437875676 100644
--- a/ark/model/model_node.hpp
+++ b/ark/model/model_node.hpp
@@ -28,7 +28,7 @@ class ModelNode {
     UniqueList<ModelNodeRef> producers;
 
     /// Graph context of this node.
-    std::map<std::string, Json> context;
+    Json context;
 };
 
 }  // namespace ark
diff --git a/examples/llama/model.py b/examples/llama/model.py
index f80d68e55..57ff7d9b1 100644
--- a/examples/llama/model.py
+++ b/examples/llama/model.py
@@ -495,27 +495,29 @@ def forward(
         scores = ark.tensor([bsz, self.n_local_heads, seqlen, seqlen], dtype=self.dtype)
         scores_shards = ark.sharding(scores, axis=1, dim_per_shard=1)
         results = []
-        with Context(
-            warp_range=[0, 8],
-            sram_range=[0, 49344],
-            sync=False,
-            config={
-                "NumWarps": 4,
-                "Granularity": 2,
-                "SramBytes": 24672,
-                "Tile": [256, 128],
-            },
-        ):
+        with Context(processor_range=[0, 304]):
             for i in range(len(scores_shards)):
-                xq_shard_reshaped = ark.reshape(xq_shards[i], [bsz, 1, seqlen, self.head_dim])
-                keys_shard_reshaped = ark.reshape(keys_shards[i], [bsz, 1, seqlen, self.head_dim])
-                scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen])
-                res = ark.matmul(xq_shard_reshaped, keys_shard_reshaped, scores_shard_reshaped, transpose_other=True)
-                res = ark.mul(res, 1.0 / math.sqrt(self.head_dim), res)
-                if mask is not None:
-                    res = ark.add(res, mask, res)
+                with Context(
+                    processor_range=[i*8, (i+1)*8],
+                    warp_range=[0, 8],
+                    sram_range=[0, 49344],
+                    sync=False,
+                    config={
+                        "NumWarps": 4,
+                        "Granularity": 2,
+                        "SramBytes": 24672,
+                        "Tile": [256, 128],
+                    },
+                ):
+                    xq_shard_reshaped = ark.reshape(xq_shards[i], [bsz, 1, seqlen, self.head_dim])
+                    keys_shard_reshaped = ark.reshape(keys_shards[i], [bsz, 1, seqlen, self.head_dim])
+                    scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen])
+                    res = ark.matmul(xq_shard_reshaped, keys_shard_reshaped, scores_shard_reshaped, transpose_other=True)
+                    res = ark.mul(res, 1.0 / math.sqrt(self.head_dim), res)
+                    if mask is not None:
+                        res = ark.add(res, mask, res)
                 results.append(res)
-        scores = ark.identity(scores, deps=results)
+            scores = ark.identity(scores, deps=results)
 
         def softmax(scores):
             with Context(
@@ -546,22 +548,24 @@ def softmax(scores):
         output_shards = ark.sharding(output, axis=2, dim_per_shard=1)
 
         results = []
-        with Context(
-            warp_range=[0, 4],
-            sram_range=[0, 24672],
-            sync=False,
-            config={
-                "NumWarps": 4,
-                "SramBytes": 24672,
-                "Tile": [256, 128],
-            },
-        ):
+        with 
Context(processor_range=[0, 304]):
             for i in range(len(output_shards)):
-                values_shard_reshaped = ark.reshape(values_shards[i], [bsz, 1, seqlen, self.head_dim])
-                scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen])
-                output_shard_reshaped = ark.reshape(output_shards[i], [bsz, 1, seqlen, self.head_dim])
-                res = ark.matmul(scores_shard_reshaped, values_shard_reshaped, output_shard_reshaped)
-                results.append(res)
+                with Context(
+                    processor_range=[i*8, (i+1)*8],
+                    warp_range=[0, 4],
+                    sram_range=[0, 24672],
+                    sync=False,
+                    config={
+                        "NumWarps": 4,
+                        "SramBytes": 24672,
+                        "Tile": [256, 128],
+                    },
+                ):
+                    values_shard_reshaped = ark.reshape(values_shards[i], [bsz, 1, seqlen, self.head_dim])
+                    scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen])
+                    output_shard_reshaped = ark.reshape(output_shards[i], [bsz, 1, seqlen, self.head_dim])
+                    res = ark.matmul(scores_shard_reshaped, values_shard_reshaped, output_shard_reshaped)
+                    results.append(res)
         output = ark.identity(output, deps=results)
         output = ark.reshape(
             output, [bsz, seqlen, self.head_dim * self.n_local_heads]
From 2a0e1a0fc3b09b60d40c3833b6c5551d1fcfe8aa Mon Sep 17 00:00:00 2001
From: Changho Hwang 
Date: Fri, 27 Sep 2024 09:21:41 +0000
Subject: [PATCH 091/106] fix

---
 ark/api/planner.cpp             | 58 +++++++++++++++++++++------------
 python/ark/planner.py           |  4 ++-
 python/unittest/test_planner.py | 40 +++++++++++++++++++++++
 3 files changed, 81 insertions(+), 21 deletions(-)
 create mode 100644 python/unittest/test_planner.py

diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index 56e0b5b0c..8bf8c2f97 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -18,6 +18,7 @@ namespace ark {
 
 PlannerContext::PlannerContext(Model &model) : Context(model) {
     this->impl_->set("Id", this->id(), ContextType::Overwrite);
+    this->impl_->set("Sync", true, ContextType::Overwrite);
 }
 
 void PlannerContext::check_range(const std::string &key,
@@ -71,16 +72,9 @@ void PlannerContext::sram_range(int start, int end, int step) {
 }
 
 void PlannerContext::sync(bool sync) {
-    if (sync) {
-        // `true` should not overwrite `false`.
-        if (this->impl_->get("Sync") == Json(false)) {
-            LOG(WARN, "Ignoring sync(true) while sync(false) is already set");
-            return;
-        }
-        this->impl_->set("Sync", true, ContextType::Immutable);
-    } else {
-        this->impl_->set("Sync", false, ContextType::Overwrite);
-    }
+    // Sync should be always pushed with Id together.
+    this->impl_->set("Id", this->id(), ContextType::Overwrite);
+    this->impl_->set("Sync", sync, ContextType::Overwrite);
 }
 
 void PlannerContext::config(const std::string &config) {
@@ -133,7 +127,7 @@ std::string Planner::Impl::plan(bool pretty) const {
     size_t max_processor_id = 1;
     size_t max_warp_id = 1;
     size_t next_task_id = 0;
-    int prev_ctx_id = -1;
+    int merge_root = -1;
     bool first_op = true;
 
     auto get_context = [&](const ModelNodeRef &node,
@@ -142,7 +136,7 @@ std::string Planner::Impl::plan(bool pretty) const {
             return node->context.at(key);
         } catch (const Json::out_of_range &e) {
         }
-        return Json();
+        return Json::array();
     };
 
     auto get_latest_context = [&](const ModelNodeRef &node,
@@ -231,11 +225,36 @@ std::string Planner::Impl::plan(bool pretty) const {
         }
         size_t granularity = config.value("Granularity", 1);
 
-        auto ctx_id = get_latest_context(node, "Id");
-        auto ctx_sync = get_latest_context(node, "Sync");
-        int id = ctx_id.empty() ? -1 : ctx_id.get<int>();
-        bool sync = ctx_sync.empty() ? 
true : ctx_sync.get<bool>();
-        if (id == prev_ctx_id && !sync) {
+        auto ctx_id_list = get_context(node, "Id").get<std::vector<int>>();
+        auto ctx_sync_list = get_context(node, "Sync").get<std::vector<bool>>();
+        if (merge_root != -1) {
+            bool not_found = true;
+            for (auto ctx_id : ctx_id_list) {
+                if (ctx_id == merge_root) {
+                    not_found = false;
+                    break;
+                }
+            }
+            if (not_found) {
+                merge_root = -1;
+            }
+        }
+        bool merge_this_node = (merge_root != -1);
+        if (merge_root == -1) {
+            size_t idx = 0;
+            for (; idx < ctx_sync_list.size(); idx++) {
+                if (!ctx_sync_list[idx]) {
+                    if (ctx_id_list.size() <= idx) {
+                        ERR(InternalError,
+                            "ctx_id_list should have the same size as "
+                            "ctx_sync_list");
+                    }
+                    merge_root = ctx_id_list[idx];
+                    break;
+                }
+            }
+        }
+        if (merge_this_node) {
             auto &task_info = task_infos.back();
             task_info["NumWarps"] =
                 std::max(task_info["NumWarps"].get<size_t>(), num_warps);
@@ -266,8 +285,8 @@ std::string Planner::Impl::plan(bool pretty) const {
             resource_group["ProcessorRange"] = {0, num_processors};
             max_processor_id = std::max(max_processor_id, num_processors);
         } else if (ctx_processor_range_list.size() == 1 ||
-                 (id != prev_ctx_id)) {
-            auto &ctx_processor_range = ctx_processor_range_list[0];
+                 !merge_this_node) {
+            auto &ctx_processor_range = ctx_processor_range_list.back();
             processor_group["ProcessorRange"] = ctx_processor_range;
             resource_group["ProcessorRange"] = ctx_processor_range;
             max_processor_id = std::max(
                 max_processor_id, ctx_processor_range[1].get<size_t>());
@@ -304,7 +323,6 @@ std::string Planner::Impl::plan(bool pretty) const {
                     resource_group);
             }
         }
-        prev_ctx_id = id;
         first_op = false;
     }
 
diff --git a/python/ark/planner.py b/python/ark/planner.py
index 59de7a61c..0fdbe6c53 100644
--- a/python/ark/planner.py
+++ b/python/ark/planner.py
@@ -184,6 +184,8 @@ def __init__(self, **kwargs):
         sync: bool = kwargs.get("sync", True)
         config: Dict[str, Any] = kwargs.get("config", None)
 
+        print(f"ctx id = {super().id()}")
+
         if prange is not None:
             self.processor_range(*prange)
         if wrange is not None:
@@ -236,4 +238,4 @@ def plan(self) -> Plan:
         """
         Generate an execution plan.
         """
-        return Plan.from_str(super().plan(pretty=False))
+        return Plan.from_str(super().plan(pretty=True))
diff --git a/python/unittest/test_planner.py b/python/unittest/test_planner.py
new file mode 100644
index 000000000..94ad3ca40
--- /dev/null
+++ b/python/unittest/test_planner.py
@@ -0,0 +1,40 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license. 
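+#
+# Usage sketch (an illustrative comment, assuming the PlannerContext
+# keyword API added in this patch series): nested contexts stack their
+# scheduling hints, and sync=False marks a context as mergeable with its
+# siblings, e.g.
+#
+#     with ark.PlannerContext(processor_range=[0, 16]):
+#         with ark.PlannerContext(processor_range=[0, 8], sync=False):
+#             ark.add(a, b)   # scheduled on SMs [0, 8)
+#         with ark.PlannerContext(processor_range=[8, 16], sync=False):
+#             ark.add(a, b)   # scheduled on SMs [8, 16)
+#
+# The tests below verify how such ops are grouped in the generated plan.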
+
+from common import ark, pytest_ark
+
+
+@pytest_ark()
+def test_planner_processor_range():
+    input_tensor = ark.tensor([64, 64], ark.fp16)
+    other_tensor = ark.tensor([64, 64], ark.fp16)
+
+    with ark.PlannerContext(processor_range=[0, 128]):
+        with ark.PlannerContext(processor_range=[0, 8], sync=False):
+            ark.add(input_tensor, other_tensor)
+        with ark.PlannerContext(processor_range=[8, 16], sync=False):
+            ark.add(input_tensor, other_tensor)
+
+    plan = ark.Planner().plan()
+
+    pg = plan.processor_groups
+    assert len(pg) == 2
+    assert pg[0]["ProcessorRange"] == [0, 8]
+    assert pg[1]["ProcessorRange"] == [8, 16]
+
+
+@pytest_ark()
+def test_planner_sync():
+    input_tensor = ark.tensor([64, 64], ark.fp16)
+    other_tensor = ark.tensor([64, 64], ark.fp16)
+
+    with ark.PlannerContext(sync=False):
+        with ark.PlannerContext():
+            ark.add(input_tensor, other_tensor)
+        with ark.PlannerContext():
+            ark.add(input_tensor, other_tensor)
+
+    plan = ark.Planner().plan()
+
+    pg = plan.processor_groups
+    assert len(pg) == 1
From dae1bb2afb09f3ccfaac93eb805984fe99325474 Mon Sep 17 00:00:00 2001
From: Changho Hwang 
Date: Fri, 27 Sep 2024 11:50:49 +0000
Subject: [PATCH 092/106] fix

---
 ark/api/planner.cpp             | 114 ++++++++++++++++++++------------
 ark/context_impl.hpp            |   2 +-
 examples/llama/model.py         |  31 +++------
 python/ark/profiler.py          |  53 ++++++++++++++-
 python/unittest/test_planner.py |   6 +-
 5 files changed, 136 insertions(+), 70 deletions(-)

diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index 8bf8c2f97..54bc1f891 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -17,8 +17,11 @@
 namespace ark {
 
 PlannerContext::PlannerContext(Model &model) : Context(model) {
-    this->impl_->set("Id", this->id(), ContextType::Overwrite);
-    this->impl_->set("Sync", true, ContextType::Overwrite);
+    this->impl_->set("Id", id());
+    Json val;
+    val.push_back(id());
+    val.push_back(true);
+    this->impl_->set("Sync", val);
 }
 
 void PlannerContext::check_range(const std::string &key,
@@ -28,7 +31,7 @@ void PlannerContext::check_range(const std::string &key,
         // ok
         return;
     }
-    auto prev_vec = prev.get<std::vector<int>>();
+    auto prev_vec = prev[1].get<std::vector<int>>();
     if (prev_vec.size() < 2 || prev_vec.size() > 3) {
         ERR(InternalError, "unexpected");
     }
@@ -42,43 +45,56 @@ void PlannerContext::check_range(const std::string &key,
 
 void PlannerContext::processor_range(int start, int end, int step) {
     check_range("ProcessorRange", {start, end, step});
+    Json val;
+    val.push_back(id());
     if (step == 1) {
-        this->impl_->set("ProcessorRange", {start, end},
-                         ContextType::Overwrite);
+        val.push_back({start, end});
+        this->impl_->set("ProcessorRange", {id(), {start, end}});
     } else {
-        this->impl_->set("ProcessorRange", {start, end, step},
-                         ContextType::Overwrite);
+        val.push_back({start, end, step});
+        this->impl_->set("ProcessorRange", {id(), {start, end, step}});
     }
 }
 
 void PlannerContext::warp_range(int start, int end, int step) {
     check_range("WarpRange", {start, end, step});
+    Json val;
+    val.push_back(id());
     if (step == 1) {
-        this->impl_->set("WarpRange", {start, end}, ContextType::Overwrite);
+        val.push_back({start, end});
+        this->impl_->set("WarpRange", {id(), {start, end}});
     } else {
-        this->impl_->set("WarpRange", {start, end, step},
-                         ContextType::Overwrite);
+        val.push_back({start, end, step});
+        this->impl_->set("WarpRange", {id(), {start, end, step}});
     }
 }
 
 void PlannerContext::sram_range(int start, int end, int step) {
     check_range("SramRange", {start, end, step});
+    Json val;
+    val.push_back(id());
    if (step == 1) {
-        this->impl_->set("SramRange", 
{start, end}, ContextType::Overwrite);
+        val.push_back({start, end});
+        this->impl_->set("SramRange", {id(), {start, end}});
     } else {
-        this->impl_->set("SramRange", {start, end, step},
-                         ContextType::Overwrite);
+        val.push_back({start, end, step});
+        this->impl_->set("SramRange", {id(), {start, end, step}});
     }
 }
 
 void PlannerContext::sync(bool sync) {
     // Sync should be always pushed with Id together.
-    this->impl_->set("Id", this->id(), ContextType::Overwrite);
-    this->impl_->set("Sync", sync, ContextType::Overwrite);
+    Json val;
+    val.push_back(id());
+    val.push_back(sync);
+    this->impl_->set("Sync", val);
 }
 
 void PlannerContext::config(const std::string &config) {
-    this->impl_->set("Config", Json::parse(config), ContextType::Extend);
+    Json val;
+    val.push_back(id());
+    val.push_back(Json::parse(config));
+    this->impl_->set("Config", val);
 }
 
 class Planner::Impl {
@@ -128,6 +144,7 @@ std::string Planner::Impl::plan(bool pretty) const {
     size_t max_warp_id = 1;
     size_t next_task_id = 0;
     int merge_root = -1;
+    int processor_group_root = -1;
     bool first_op = true;
 
     auto get_context = [&](const ModelNodeRef &node,
@@ -150,12 +167,15 @@ std::string Planner::Impl::plan(bool pretty) const {
         const auto &op = node->op;
         if (op->is_virtual()) continue;
 
-        auto ctx_config = get_latest_context(node, "Config");
-
-        Json config;
-        if (!ctx_config.empty()) {
-            config = ctx_config;
-        } else if (!config_rules_.empty()) {
+        Json config = Json::object();
+        for (auto &obj : get_context(node, "Config")) {
+            LOG(INFO, obj.dump());
+            auto &items = obj[1];
+            for (auto &item : items.items()) {
+                config[item.key()] = item.value();
+            }
+        }
+        if (config.empty() && !config_rules_.empty()) {
             const std::string op_str = op->serialize().dump();
             for (auto &rule : config_rules_) {
                 auto config_str = rule(op_str, gpu_info.arch->name());
@@ -225,8 +245,8 @@ std::string Planner::Impl::plan(bool pretty) const {
         }
         size_t granularity = config.value("Granularity", 1);
 
-        auto ctx_id_list = get_context(node, "Id").get<std::vector<int>>();
-        auto ctx_sync_list = get_context(node, "Sync").get<std::vector<bool>>();
+        auto ctx_id_list = get_context(node, "Id");
+        auto ctx_sync_list = get_context(node, "Sync");
         if (merge_root != -1) {
             bool not_found = true;
             for (auto ctx_id : ctx_id_list) {
@@ -241,15 +261,11 @@ std::string Planner::Impl::plan(bool pretty) const {
         }
         bool merge_this_node = (merge_root != -1);
         if (merge_root == -1) {
-            size_t idx = 0;
-            for (; idx < ctx_sync_list.size(); idx++) {
-                if (!ctx_sync_list[idx]) {
-                    if (ctx_id_list.size() <= idx) {
-                        ERR(InternalError,
-                            "ctx_id_list should have the same size as "
-                            "ctx_sync_list");
-                    }
-                    merge_root = ctx_id_list[idx];
+            for (auto &item : ctx_sync_list) {
+                auto &ctx_id = item[0];
+                auto &sync = item[1];
+                if (!sync) {
+                    merge_root = ctx_id;
                     break;
                 }
             }
@@ -279,34 +295,46 @@ std::string Planner::Impl::plan(bool pretty) const {
         Json processor_group;
         Json resource_group;
         bool new_processor_group = true;
+        bool id_found = false;
+        for (auto &item : ctx_processor_range_list) {
+            if (item[0] == processor_group_root) {
+                id_found = true;
+                break;
+            }
+        }
+        if (!id_found) {
+            processor_group_root = -1;
+        }
+        if (ctx_processor_range_list.size() > 2) {
+            ERR(UnsupportedError, "ProcessorRange list size > 2");
+        }
         if (ctx_processor_range_list.empty()) {
             size_t num_processors = std::min(num_sm, num_tasks);
             processor_group["ProcessorRange"] = {0, num_processors};
             resource_group["ProcessorRange"] = {0, num_processors};
             max_processor_id = std::max(max_processor_id, num_processors);
        } else if (ctx_processor_range_list.size() == 1 ||
-                   !merge_this_node) {
-            auto &ctx_processor_range = ctx_processor_range_list.back();
-            processor_group["ProcessorRange"] = ctx_processor_range;
-            resource_group["ProcessorRange"] = ctx_processor_range;
+        } else if (processor_group_root == -1) {
+            processor_group_root = ctx_processor_range_list.front()[0];
+            processor_group["ProcessorRange"] = ctx_processor_range_list.front()[1];
+            resource_group["ProcessorRange"] = ctx_processor_range_list.back()[1];
             max_processor_id = std::max(
-                max_processor_id, ctx_processor_range[1].get<size_t>());
+                max_processor_id, ctx_processor_range_list.front()[1][1].get<size_t>());
         } else {
             new_processor_group = false;
             resource_group["ProcessorRange"] =
-                ctx_processor_range_list.back();
+                ctx_processor_range_list.back()[1];
         }
 
         if (!ctx_warp_range.empty()) {
-            resource_group["WarpRange"] = ctx_warp_range;
+            resource_group["WarpRange"] = ctx_warp_range[1];
             max_warp_id =
-                std::max(max_warp_id, ctx_warp_range[1].get<size_t>());
+                std::max(max_warp_id, ctx_warp_range[1][1].get<size_t>());
         } else {
             resource_group["WarpRange"] = {0, num_warps};
             max_warp_id = std::max(max_warp_id, num_warps);
         }
         if (!ctx_sram_range.empty()) {
-            resource_group["SramRange"] = ctx_sram_range;
+            resource_group["SramRange"] = ctx_sram_range[1];
         } else {
             resource_group["SramRange"] = {0, sram_bytes};
         }
diff --git a/ark/context_impl.hpp b/ark/context_impl.hpp
index 73fcae922..b79353296 100644
--- a/ark/context_impl.hpp
+++ b/ark/context_impl.hpp
@@ -17,7 +17,7 @@ class Context::Impl {
 
     Json get(const std::string& key) const;
 
-    void set(const std::string& key, const Json& value_json, ContextType type);
+    void set(const std::string& key, const Json& value_json, ContextType type = ContextType::Overwrite);
 
     bool has(const std::string& key) const;
 
diff --git a/examples/llama/model.py b/examples/llama/model.py
index 57ff7d9b1..b69bcf2f4 100644
--- a/examples/llama/model.py
+++ b/examples/llama/model.py
@@ -106,20 +106,10 @@ def forward(self, x):
             mean = ark.reduce_mean(x2, axis=-1)
             mean = ark.add(mean, self.eps)
             rrms = ark.rsqrt(mean)
-
-        with Context(
-            warp_range=[0, 8],
-            sync=False,
-            config={
-                "NumWarps": 1,
-                "SramBytes": 0,
-                "Tile": [1, 4096],
-                "Granularity": 7,
-            },
-        ):
-            x = ark.mul(x, rrms)
-            x = ark.mul(x, self.weight, x)
-            return ark.cast(x, self.dtype)
+            with Context(config={"Tile": [1, 4096]}):
+                x = ark.mul(x, rrms)
+                x = ark.mul(x, self.weight, x)
+                return ark.cast(x, self.dtype)
 
 
 class ColumnParallelLinear(ark.Module):
@@ -668,10 +658,11 @@ def forward(
         freqs_cis: ark.Tensor,
         mask: Optional[ark.Tensor],
     ):
-        h = self.tok_embeddings(tokens)
+        with Context(warp_range=[0, 8]):
+            h = self.tok_embeddings(tokens)
 
-        for layer in self.layers:
-            h = layer(h, start_pos, freqs_cis, mask)
-        h = self.norm(h)
-        output = self.output(h)
-        return output
+            for layer in self.layers:
+                h = layer(h, start_pos, freqs_cis, mask)
+            h = self.norm(h)
+            output = self.output(h)
+            return output
diff --git a/python/ark/profiler.py b/python/ark/profiler.py
index e47f5b7aa..f3ed55042 100644
--- a/python/ark/profiler.py
+++ b/python/ark/profiler.py
@@ -3,6 +3,7 @@
 
 import sys
 import time
+from typing import Optional, List
 
 from .runtime import Runtime
 from .planner import Plan
@@ -26,10 +27,12 @@ def run(
         iter: int = 1000,
         loop_mode: bool = True,
         profile_processor_groups: bool = False,
+        target_processor_groups: Optional[List[int]] = None,
     ):
-        sys.stderr.write(
-            f"End-to-end: {timeit(self.plan, iter, loop_mode):.6f} seconds/iter\n"
-        )
+        if target_processor_groups is None:
+            sys.stderr.write(
+                f"End-to-end: {timeit(self.plan, iter, 
loop_mode):.6f} seconds/iter\n" + ) if not profile_processor_groups: return @@ -44,8 +47,52 @@ def run( "ProcessorGroups": [None], } for i in range(num_processor_groups): + if target_processor_groups is not None and i not in target_processor_groups: + continue new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] lat_per_iter = timeit(Plan(new_plan), iter, loop_mode) sys.stderr.write( f"Processor group {i}: {lat_per_iter:.6f} seconds/iter\n" ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="ARK Profiler") + parser.add_argument( + "--iter", + type=int, + default=1000, + help="Number of iterations to run for each measurement", + ) + parser.add_argument( + "--loop_mode", + action="store_true", + help="Use loop mode to measure end-to-end latency", + ) + parser.add_argument( + "--profile_processor_groups", + action="store_true", + help="Profile processor groups", + ) + parser.add_argument( + "--target_processor_groups", + type=str, + help="Target processor groups to profile", + ) + parser.add_argument("--plan", type=str, help="Path to the plan file", required=True) + args = parser.parse_args() + + target_processor_groups = None + if args.target_processor_groups is not None: + target_processor_groups = list(map(int, args.target_processor_groups.split(","))) + + plan = Plan.from_file(args.plan) + profiler = Profiler(plan) + profiler.run( + iter=args.iter, + loop_mode=args.loop_mode, + profile_processor_groups=args.profile_processor_groups, + target_processor_groups=target_processor_groups, + ) diff --git a/python/unittest/test_planner.py b/python/unittest/test_planner.py index 94ad3ca40..0a739c714 100644 --- a/python/unittest/test_planner.py +++ b/python/unittest/test_planner.py @@ -18,9 +18,9 @@ def test_planner_processor_range(): plan = ark.Planner().plan() pg = plan.processor_groups - assert len(pg) == 2 - assert pg[0]["ProcessorRange"] == [0, 8] - assert pg[1]["ProcessorRange"] == [8, 16] + assert len(pg) == 1 + assert pg[0]["ResourceGroups"][0]["ProcessorRange"] == [0, 8] + assert pg[0]["ResourceGroups"][1]["ProcessorRange"] == [8, 16] @pytest_ark() From 73aee74a7f447a36336df3179be0ae2f1b947116 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Fri, 27 Sep 2024 12:05:28 +0000 Subject: [PATCH 093/106] update --- ark/api/planner.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp index 54bc1f891..24036b8f0 100644 --- a/ark/api/planner.cpp +++ b/ark/api/planner.cpp @@ -169,7 +169,6 @@ std::string Planner::Impl::plan(bool pretty) const { Json config = Json::object(); for (auto &obj : get_context(node, "Config")) { - LOG(INFO, obj.dump()); auto &items = obj[1]; for (auto &item : items.items()) { config[item.key()] = item.value(); From a711d3e0657020c1828cb1cfc5d936048de85564 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Sat, 28 Sep 2024 00:21:24 +0000 Subject: [PATCH 094/106] update --- python/ark/planner.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/ark/planner.py b/python/ark/planner.py index 0fdbe6c53..4b2b7b919 100644 --- a/python/ark/planner.py +++ b/python/ark/planner.py @@ -184,8 +184,6 @@ def __init__(self, **kwargs): sync: bool = kwargs.get("sync", True) config: Dict[str, Any] = kwargs.get("config", None) - print(f"ctx id = {super().id()}") - if prange is not None: self.processor_range(*prange) if wrange is not None: From 9b03feb6731bb8dcdbc62fb9a5a40da2dbf1635c Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 30 Sep 2024 02:27:52 +0000 Subject: [PATCH 
095/106] updates

---
 ark/api/planner_test.cpp               | 120 +++++++----
 ark/include/kernels/common/broadcast.h |   8 +-
 ark/include/kernels/common/ewise.h     |   4 +-
 ark/include/kernels/gemm_cutlass.h     |   2 +
 ark/include/kernels/layernorm.h        |   3 +-
 ark/include/kernels/matmul.h           |  47 ++---
 ark/include/kernels/reduce.h           |   4 +-
 ark/ops/ops_matmul.cpp                 |  44 +++-
 examples/llama/model.py                | 268 +++++++++++++++----------
 examples/llama/model_test.py           |  63 ------
 python/ark/ops.py                      |   2 +
 python/ark/planner.py                  |  21 +-
 python/ark/profiler.py                 |  16 +-
 python/ark/tensor.py                   |  73 ++++++-
 python/model_py.cpp                    |   3 +
 python/unittest/test_tensor.py         |  20 ++
 16 files changed, 426 insertions(+), 272 deletions(-)

diff --git a/ark/api/planner_test.cpp b/ark/api/planner_test.cpp
index 011b25d8d..919ba2f1d 100644
--- a/ark/api/planner_test.cpp
+++ b/ark/api/planner_test.cpp
@@ -7,57 +7,93 @@
 #include "unittest/unittest_utils.h"
 
 ark::unittest::State test_planner_context_processor_range() {
-    ark::Model model;
-    ark::Tensor t0 = model.tensor({1}, ark::FP32);
-    ark::Tensor t1 = model.tensor({1}, ark::FP32);
-
-    // node 0
-    ark::Tensor t2 = model.add(t0, t1);
-
-    ark::Tensor t3;
-    ark::Tensor t4;
-    ark::Tensor t5;
     {
-        // node 1
-        ark::PlannerContext ctx(model);
-        ctx.processor_range(0, 4);
-        t3 = model.relu(t2);
-
-        UNITTEST_EQ(ctx.get("ProcessorRange"), ark::Json({0, 4}).dump());
-
-        // node 2
-        ctx.processor_range(2, 4);
-        t4 = model.sqrt(t3);
-
-        UNITTEST_EQ(ctx.get("ProcessorRange"), ark::Json({2, 4}).dump());
-
-        // Invalid usage: range (0, 4) is out of previous range (2, 4)
-        UNITTEST_THROW(ctx.processor_range(0, 4), ark::PlanError);
+        ark::Model model;
+        ark::Tensor t0 = model.tensor({1}, ark::FP32);
+        ark::Tensor t1 = model.tensor({1}, ark::FP32);
+
+        // node 0
+        ark::Tensor t2 = model.add(t0, t1);
+
+        ark::Tensor t3;
+        ark::Tensor t4;
+        ark::Tensor t5;
+        {
+            // node 1
+            ark::PlannerContext ctx(model);
+            ctx.processor_range(0, 4);
+            t3 = model.relu(t2);
+
+            UNITTEST_EQ(ctx.get("ProcessorRange"), ark::Json({0, 4}).dump());
+
+            // node 2
+            ctx.processor_range(2, 4);
+            t4 = model.sqrt(t3);
+
+            UNITTEST_EQ(ctx.get("ProcessorRange"), ark::Json({2, 4}).dump());
+
+            // Invalid usage: range (0, 4) is out of previous range (2, 4)
+            UNITTEST_THROW(ctx.processor_range(0, 4), ark::PlanError);
+        }
     }
     {
-        // node 3
+        ark::Model model;
+        ark::Tensor t0 = model.tensor({1}, ark::FP32);
+        ark::Tensor t1 = model.tensor({1}, ark::FP32);
+
        {
+            // node 3
            ark::PlannerContext ctx(model);
-        ctx.processor_range(2, 6, 2);
-        t5 = model.exp(t2);
+            ctx.processor_range(2, 6, 2);
+            t5 = model.exp(t2);
 
-        UNITTEST_EQ(ctx.get("ProcessorRange"), ark::Json({2, 6, 2}).dump());
-    }
+            UNITTEST_EQ(ctx.get("ProcessorRange"), ark::Json({2, 6, 2}).dump());
+        }
 
-    UNITTEST_TRUE(model.verify());
+        UNITTEST_TRUE(model.verify());
 
-    auto compressed = model.compress();
-    UNITTEST_TRUE(compressed.verify());
+        auto compressed = model.compress();
+        UNITTEST_TRUE(compressed.verify());
 
-    auto nodes = compressed.nodes();
-    UNITTEST_EQ(nodes.size(), 4);
+        auto nodes = compressed.nodes();
+        UNITTEST_EQ(nodes.size(), 4);
 
-    UNITTEST_EQ(nodes[0]->context.size(), 0);
-    UNITTEST_GE(nodes[1]->context.size(), 1);
-    UNITTEST_EQ(nodes[1]->context.at("ProcessorRange"), ark::Json({0, 4}));
-    UNITTEST_GE(nodes[2]->context.size(), 1);
-    UNITTEST_EQ(nodes[2]->context.at("ProcessorRange"), ark::Json({2, 4}));
-    UNITTEST_GE(nodes[3]->context.size(), 1);
-    UNITTEST_EQ(nodes[3]->context.at("ProcessorRange"), ark::Json({2, 6, 2}));
+        UNITTEST_EQ(nodes[0]->context.size(), 0);
+        UNITTEST_GE(nodes[1]->context.size(), 1);
+        UNITTEST_EQ(nodes[1]->context.at("ProcessorRange"), ark::Json({0, 4}));
+        UNITTEST_GE(nodes[2]->context.size(), 1);
+        UNITTEST_EQ(nodes[2]->context.at("ProcessorRange"), ark::Json({2, 4}));
+        UNITTEST_GE(nodes[3]->context.size(), 1);
+        UNITTEST_EQ(nodes[3]->context.at("ProcessorRange"),
+                    ark::Json({2, 6, 2}));
    }
    {
+        ark::Model model;
+        ark::Tensor t0 = model.tensor({1}, ark::FP32);
+        ark::Tensor t1 = model.tensor({1}, ark::FP32);
+
+        ark::PlannerContext ctx(model);
+        ctx.processor_range(0, 10);
+
+        std::vector<ark::Tensor> tensors;
+        for (size_t i = 0; i < 5; ++i) {
+            ark::PlannerContext subctx(model);
+            subctx.processor_range(0 * i, 2 * i);
+            auto t = model.add(t0, t1);
+            
tensors.push_back(t);
+
+            UNITTEST_EQ(ctx.get("ProcessorRange"),
+                        ark::Json({0 * i, 2 * i}).dump());
+        }
+
+        UNITTEST_TRUE(model.verify());
+
+        auto compressed = model.compress();
+        UNITTEST_TRUE(compressed.verify());
+
+        auto nodes = compressed.nodes();
+        UNITTEST_EQ(nodes.size(), 5);
+
+        for (size_t i = 0; i < 5; ++i) {
+            UNITTEST_GE(nodes[i]->context.size(), 1);
+            UNITTEST_EQ(nodes[i]->context.at("ProcessorRange"),
+                        ark::Json({0 * i, 2 * i}));
+        }
+    }
 
     return ark::unittest::SUCCESS;
 }
diff --git a/ark/include/kernels/common/broadcast.h b/ark/include/kernels/common/broadcast.h
index 858938613..86e84e5d0 100644
--- a/ark/include/kernels/common/broadcast.h
+++ b/ark/include/kernels/common/broadcast.h
@@ -400,6 +400,8 @@ struct Broadcast1 {
         static constexpr size_t StepSize = NelemPerThread * UnitOp::NumThreads;
 
+        UnitOp::sync_threads();
+
         for (size_t tid = NelemPerThread * UnitOp::thread_id();;
              tid += StepSize) {
             size_t tid_n = tid / UnitOutDims::CHW;
@@ -435,8 +437,6 @@ struct Broadcast1 {
             }
             Intrinsic::compute(&out[idx_out], &in[idx_in]);
         }
-
-        UnitOp::sync_threads();
     }
 };
 
@@ -469,6 +469,8 @@ struct Broadcast2 {
         static constexpr size_t StepSize = NelemPerThread * UnitOp::NumThreads;
 
+        UnitOp::sync_threads();
+
         for (size_t tid = NelemPerThread * UnitOp::thread_id();;
              tid += StepSize) {
             size_t tid_n = tid / UnitOutDims::CHW;
@@ -518,8 +520,6 @@ struct Broadcast2 {
             }
             Intrinsic::compute(&out[idx_out], &in0[idx_in0], &in1[idx_in1]);
         }
-
-        UnitOp::sync_threads();
     }
 };
 
diff --git a/ark/include/kernels/common/ewise.h b/ark/include/kernels/common/ewise.h
index de52f4584..c77bb7abf 100644
--- a/ark/include/kernels/common/ewise.h
+++ b/ark/include/kernels/common/ewise.h
@@ -31,6 +31,8 @@ struct Ewise1 {
         int uh = UnitOp::uop_idx_h(uop_idx);
         int uw = UnitOp::uop_idx_w(uop_idx);
 
+        UnitOp::sync_threads();
+
         for (int tid = UnitOp::thread_id();; tid += UnitOp::NumThreads) {
             int tid_w = (tid * NelemPerThread) % UnitOutDims::W;
             int tid_h =
@@ -50,8 +52,6 @@ struct Ewise1 {
 
             CompType::compute(out, in, idx_n, idx_c, idx_h, idx_w);
         }
-
-        UnitOp::sync_threads();
     }
 };
 
diff --git a/ark/include/kernels/gemm_cutlass.h b/ark/include/kernels/gemm_cutlass.h
index ae13e4c5b..e87c7ddd2 100644
--- a/ark/include/kernels/gemm_cutlass.h
+++ b/ark/include/kernels/gemm_cutlass.h
@@ -260,6 +260,8 @@ DEVICE void gemm_cuda(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
         UnitOp::template shared_memory<typename GemmKernel::SharedStorage>(
             smem_per_warp);
 
+    UnitOp::sync_threads();
+
     GemmKernel gemm_kernel{};
     gemm_kernel(params, *ps);
 }
diff --git a/ark/include/kernels/layernorm.h b/ark/include/kernels/layernorm.h
index b0f101e76..5bc17235d 100644
--- a/ark/include/kernels/layernorm.h
+++ b/ark/include/kernels/layernorm.h
@@ -63,6 +63,8 @@ struct LayerNorm {
             (tid_c + uc * UnitOutDims::C) * InDims::HW +
             (tid_n + un * UnitOutDims::N) * InDims::CHW;
 
+        UnitOp::sync_threads();
+
         DataType mean;
         DataType cmp;
         ReduceTypeMean::template identity<1>(&mean);
@@ -108,7 +110,6 @@ struct LayerNorm {
             out[idx_out] = 
type::Mul::compute(
                 type::Sub::compute(in[idx_in], mean), variance);
         }
-        UnitOp::sync_threads();
     }
 };
 
diff --git a/ark/include/kernels/matmul.h b/ark/include/kernels/matmul.h
index fd6c33d0f..b14f10bf6 100644
--- a/ark/include/kernels/matmul.h
+++ b/ark/include/kernels/matmul.h
@@ -26,17 +26,22 @@ namespace ark {
 /// (m, n, k).
 /// @tparam LeadingDims (ark::Vec) The leading dimensions of matrix inputs
 /// and outputs. (lda, ldc, ldc, ldb).
-/// @tparam InnerLdimA (int) The leading dimension of the inner dimension of A.
-/// @tparam InnerLdimB (int) The leading dimension of the inner dimension of B.
+/// @tparam BatchStrideNA (int)
+/// @tparam BatchStrideCA (int)
+/// @tparam BatchStrideNB (int)
+/// @tparam BatchStrideCB (int)
+/// @tparam BatchStrideNC (int)
+/// @tparam BatchStrideCC (int)
 /// @tparam IsColumnA (bool) Whether matrix A is column-major.
 /// @tparam IsColumnB (bool) Whether matrix B is column-major.
 /// @tparam NumWarps (int) The number of warps per uop.
 /// @tparam SmemBytes (int) The size of shared memory per uop.
 ///
-template <typename OutDims, typename NCA, typename NCB, typename TileShape,
-          typename ProblemSize, typename LeadingDims, int InnerLdimA,
-          int InnerLdimB, bool IsColumnA, bool IsColumnB, int NumWarps,
-          int SmemBytes, typename DataTypeA, typename DataTypeB, typename DataTypeC>
+template <typename OutDims, typename NCA, typename NCB, typename TileShape,
+          typename ProblemSize, typename LeadingDims, int BatchStrideNA,
+          int BatchStrideCA, int BatchStrideNB, int BatchStrideCB,
+          int BatchStrideNC, int BatchStrideCC, bool IsColumnA, bool IsColumnB,
+          int NumWarps, int SmemBytes, typename DataTypeA, typename DataTypeB, typename DataTypeC>
 DEVICE void matmul(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
                    int smem_per_warp) {
@@ -67,38 +72,13 @@ DEVICE void matmul(DataTypeC *C, DataTypeA *A, DataTypeB *B, int uop_idx,
     constexpr int TileSizeM = TileShape::D0;
     constexpr int TileSizeN = TileShape::D1;
 
-    constexpr DimType SizeA = math::mul<InnerLdimA, OutDims::H>::value;
-    constexpr DimType SizeB = math::mul<InnerLdimB, OutDims::W>::value;
-    constexpr DimType SizeC = math::mul<OutDims::H, OutDims::W>::value;
-    static_assert(SizeA >= 0, "");
-    static_assert(SizeB >= 0, "");
-    static_assert(SizeC >= 0, "");
-
     int un = UnitOp::uop_idx_n(uop_idx);
     int uc = UnitOp::uop_idx_c(uop_idx);
 
     // Broadcasting
-    DataTypeA *pA;
-    DataTypeB *pB;
-    DataTypeC *pC = &C[un * math::mul<OutDims::C, SizeC>::value + uc * SizeC];
-    if constexpr (NCA::D0 == 1 && NCA::D1 == 1) {
-        pA = A;
-    } else if constexpr (NCA::D0 == 1) {
-        pA = &A[uc * SizeA];
-    } else if constexpr (NCA::D1 == 1) {
-        pA = &A[un * SizeA];
-    } else {
-        pA = &A[un * math::mul<NCA::D1, SizeA>::value + uc * SizeA];
-    }
-    if constexpr (NCB::D0 == 1 && NCB::D1 == 1) {
-        pB = B;
-    } else if constexpr (NCB::D0 == 1) {
-        pB = &B[uc * SizeB];
-    } else if constexpr (NCB::D1 == 1) {
-        pB = &B[un * SizeB];
-    } else {
-        pB = &B[un * math::mul<NCB::D1, SizeB>::value + uc * SizeB];
-    }
+    DataTypeA *pA = &A[un * BatchStrideNA + uc * BatchStrideCA];
+    DataTypeB *pB = &B[un * BatchStrideNB + uc * BatchStrideCB];
+    DataTypeC *pC = &C[un * BatchStrideNC + uc * BatchStrideCC];
 
 #if defined(ARK_TARGET_CUDA_ARCH)
     gemm_cutlass(pC, pA, pB, uop_idx, smem_per_warp);
 #endif
-    UnitOp::sync_threads();
 }
 
 }  // namespace ark
diff --git a/ark/include/kernels/reduce.h b/ark/include/kernels/reduce.h
index 9ebe6555c..62af5840b 100644
--- a/ark/include/kernels/reduce.h
+++ b/ark/include/kernels/reduce.h
@@ -397,6 +397,8 @@ struct WwiseReduce {
 
         DataType reduced[NelemPerThread];
 
+        UnitOp::sync_threads();
+
         ReduceType::template identity<NelemPerThread>(reduced);
         for (int idx_w = tid_w; idx_w < InShape::W; idx_w += ThreadsPerRow) {
             int idx_in = idx_in_base + idx_w;
@@ -438,8 +440,6 @@ struct WwiseReduce {
             ReduceType::template postReduce<1>(&out[idx_out], &reduced[0],
                                                InShape::W);
         }
-
-        UnitOp::sync_threads();
     }
 };
 
diff --git a/ark/ops/ops_matmul.cpp b/ark/ops/ops_matmul.cpp
index ef5f516b0..823bf2656 100644
--- a/ark/ops/ops_matmul.cpp
+++ b/ark/ops/ops_matmul.cpp
@@ -125,6 +125,7 @@ std::string ModelOpMatmul::impl_name(const Json &config) const {
     Dims other_shape_dims4 = other->shape().dims4();
     Dims input_dim_nc{input_shape_dims4[0], input_shape_dims4[1]};
     Dims other_dim_nc{other_shape_dims4[0], 
other_shape_dims4[1]};
+    Dims output_dim_nc = broadcast_shape(input_dim_nc, other_dim_nc);
 
     Dims strides_acdb{
         input->strides().dims4()[-1], output->strides().dims4()[-1],
@@ -156,6 +157,37 @@ std::string ModelOpMatmul::impl_name(const Json &config) const {
         inner_stride_b = other->strides().dims4()[-2];
     }
 
+    DimType size_a = inner_stride_a * output->strides()[-2];
+    DimType size_b = inner_stride_b * output->strides()[-1];
+    DimType size_c = output->strides()[-2] * output->strides()[-1];
+    DimType batch_stride_c_a = input_dim_nc[1] == 1 ? 0 : size_a;
+    DimType batch_stride_n_a =
+        input_dim_nc[0] == 1 ? 0 : size_a * input_dim_nc[1];
+    DimType batch_stride_c_b = other_dim_nc[1] == 1 ? 0 : size_b;
+    DimType batch_stride_n_b =
+        other_dim_nc[0] == 1 ? 0 : size_b * other_dim_nc[1];
+    DimType batch_stride_c_c = output_dim_nc[1] == 1 ? 0 : size_c;
+    DimType batch_stride_n_c =
+        output_dim_nc[0] == 1 ? 0 : size_c * output_dim_nc[1];
+    if (config.contains("BatchStrideNA")) {
+        batch_stride_n_a = config["BatchStrideNA"].get<DimType>();
+    }
+    if (config.contains("BatchStrideNB")) {
+        batch_stride_n_b = config["BatchStrideNB"].get<DimType>();
+    }
+    if (config.contains("BatchStrideNC")) {
+        batch_stride_n_c = config["BatchStrideNC"].get<DimType>();
+    }
+    if (config.contains("BatchStrideCA")) {
+        batch_stride_c_a = config["BatchStrideCA"].get<DimType>();
+    }
+    if (config.contains("BatchStrideCB")) {
+        batch_stride_c_b = config["BatchStrideCB"].get<DimType>();
+    }
+    if (config.contains("BatchStrideCC")) {
+        batch_stride_c_c = config["BatchStrideCC"].get<DimType>();
+    }
+
     return function_name_string("matmul",
                                 {
                                     vec_string(output->strides().dims4()),
@@ -164,8 +196,12 @@ std::string ModelOpMatmul::impl_name(const Json &config) const {
                                     vec_string(tile_shape),
                                     vec_string(padded_problem_size),
                                     vec_string(strides_acdb),
-                                    std::to_string(inner_stride_a),
-                                    std::to_string(inner_stride_b),
+                                    std::to_string(batch_stride_n_a),
+                                    std::to_string(batch_stride_c_a),
+                                    std::to_string(batch_stride_n_b),
+                                    std::to_string(batch_stride_c_b),
+                                    std::to_string(batch_stride_n_c),
+                                    std::to_string(batch_stride_c_c),
                                     std::to_string(trans_input),
                                     std::to_string(trans_other),
                                     std::to_string(num_warps),
@@ -173,8 +209,8 @@ std::string ModelOpMatmul::impl_name(const Json &config) const {
                                 });
 }
 
-std::vector<ModelOpArg> ModelOpMatmul::impl_args([
-    [maybe_unused]] const Json &config) const {
+std::vector<ModelOpArg> ModelOpMatmul::impl_args(
+    [[maybe_unused]] const Json &config) const {
     return {result_tensors_[0], read_tensors_[0], read_tensors_[1]};
 }
 
diff --git a/examples/llama/model.py b/examples/llama/model.py
index b69bcf2f4..e81f239f2 100644
--- a/examples/llama/model.py
+++ b/examples/llama/model.py
@@ -91,7 +91,6 @@ def __init__(
 
     def forward(self, x):
         with Context(
-            warp_range=[0, 8],
             sync=False,
             config={
                 "NumWarps": 1,
@@ -336,33 +335,56 @@ def __init__(
             dim, hidden_dim, dtype, False, local_rank, world_size
         )
 
-    def forward(self, x):
+    def forward(self, x, ffn_norm):
+        h = ffn_norm(x)
         with Context(
-            warp_range=[0, 8],
+            processor_range=[0, 304],
             sram_range=[0, 49344],
-            sync=False,
-            config={
-                "NumWarps": 4,
-            },
+            config={"NumWarps": 4},
         ):
-            with Context(config={"SramBytes": 24672, "Tile": [256, 128]}):
-                x1 = self.w1(x)
-            with Context(config={"SramBytes": 0, "Tile": [256, 128]}):
-                x1 = Silu()(x1)
+            out_shape = h.shape()
+            out_shape[-1] = self.w1.out_dim
+            out = ark.tensor(out_shape, h.dtype())
+            pos = 0
+            for dim, tile, sram in [
+                [1792, [256, 128], 24672],
+                [256, [128, 128], 16480],
+            ]:
+                with Context(
+                    processor_range=[0, 304], sync=False, config={"Tile": tile}
+                ):
+                    h_shard = h[:, pos : 
pos + dim, :] + out_shard = out[:, pos : pos + dim, :] + with Context(config={"SramBytes": sram}): + x1 = ark.matmul( + h_shard, self.w1.weight, transpose_other=True + ) + with Context(config={"SramBytes": 0}): + x1 = Silu()(x1) + # We don't need a barrier here but somehow the performance is better with it + with Context( + processor_range=[0, 304], sync=False, config={"Tile": tile} + ): + with Context(config={"SramBytes": sram}): + x2 = ark.matmul( + h_shard, self.w3.weight, transpose_other=True + ) + with Context(config={"SramBytes": 0}): + x3 = ark.mul(x1, x2, out_shard) + out = ark.identity(out, deps=[x3]) + pos += dim + with Context( - warp_range=[0, 8], - sram_range=[0, 49344], - sync=False, + warp_range=[0, 4], config={ "NumWarps": 4, + "Tile": [256, 128], + "SramBytes": 24672, }, + sync=False, ): - with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): - x2 = self.w3(x) - with Context(config={"SramBytes": 0, "Tile": [256, 128]}): - x3 = ark.mul(x1, x2) - x4 = self.w2(x3) - return x4 + ff = self.w2(out) + return ark.add(x, ff) def apply_rotary_emb(xq, xk, freqs_cis): @@ -431,9 +453,31 @@ def forward( start_pos: int, freqs_cis: ark.Tensor, mask: Optional[ark.Tensor], + attention_norm, ): bsz, seqlen, _ = x.shape() + x_norm = attention_norm(x) + + xq_scratch = ark.tensor( + [ + bsz, + seqlen * self.n_local_heads, + self.n_local_heads, + self.head_dim, + ], + self.dtype, + ) + xk_scratch = ark.tensor( + [ + bsz, + seqlen * self.n_local_kv_heads, + self.n_local_kv_heads, + self.head_dim, + ], + self.dtype, + ) + with Context( warp_range=[0, 4], sram_range=[0, 24672], @@ -441,13 +485,15 @@ def forward( config={"NumWarps": 4}, ): with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): - xq = self.wq(x) + xq = ark.matmul(x_norm, self.wq.weight, transpose_other=True) xq = ark.reshape( xq, [bsz, seqlen, self.n_local_heads, self.head_dim] ) with Context(config={"SramBytes": 0, "Tile": [256, 1, 128]}): if freqs_cis is not None: - xq = ark.rope(xq, freqs_cis) + xq = ark.rope(xq, freqs_cis, xq_scratch[:, :seqlen, :, :]) + + xq_scratch = ark.identity(xq_scratch, deps=[xq]) with Context( warp_range=[0, 4], @@ -456,13 +502,15 @@ def forward( config={"NumWarps": 4}, ): with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): - xk = self.wk(x) + xk = ark.matmul(x_norm, self.wk.weight, transpose_other=True) xk = ark.reshape( xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] ) with Context(config={"SramBytes": 0, "Tile": [256, 1, 128]}): if freqs_cis is not None: - xk = ark.rope(xk, freqs_cis) + xk = ark.rope(xk, freqs_cis, xk_scratch[:, :seqlen, :, :]) + + xk_scratch = ark.identity(xk_scratch, deps=[xk]) with Context( warp_range=[0, 4], @@ -471,47 +519,44 @@ def forward( config={"NumWarps": 4}, ): with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): - xv = self.wv(x) + xv = ark.matmul(x_norm, self.wv.weight, transpose_other=True) xv = ark.reshape( xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] ) - # TODO: enable kv cache later - keys = xk - values = xv - - xq_shards = ark.sharding(xq, axis=2, dim_per_shard=1) - keys_shards = ark.sharding(keys, axis=2, dim_per_shard=1) - scores = ark.tensor([bsz, self.n_local_heads, seqlen, seqlen], dtype=self.dtype) - scores_shards = ark.sharding(scores, axis=1, dim_per_shard=1) - results = [] - with Context(processor_range=[0, 304]): - for i in range(len(scores_shards)): - with Context( - processor_range=[i*8, (i+1)*8], - warp_range=[0, 8], - sram_range=[0, 49344], - sync=False, - config={ - "NumWarps": 4, - 
"Granularity": 2, - "SramBytes": 24672, - "Tile": [256, 128], - }, - ): - xq_shard_reshaped = ark.reshape(xq_shards[i], [bsz, 1, seqlen, self.head_dim]) - keys_shard_reshaped = ark.reshape(keys_shards[i], [bsz, 1, seqlen, self.head_dim]) - scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen]) - res = ark.matmul(xq_shard_reshaped, keys_shard_reshaped, scores_shard_reshaped, transpose_other=True) - res = ark.mul(res, 1.0 / math.sqrt(self.head_dim), res) - if mask is not None: - res = ark.add(res, mask, res) - results.append(res) - scores = ark.identity(scores, deps=results) + def calc_scores(xq_scratch, xk_scratch, mask): + xq = xq_scratch[:, :, 0, :] + xk = xk_scratch[:, :, 0, :] + xq = ark.reshape( + xq, [bsz, self.n_local_heads, seqlen, self.head_dim] + ) + xk = ark.reshape( + xk, [bsz, self.n_local_kv_heads, seqlen, self.head_dim] + ) + with Context( + sync=False, + config={ + "Tile": [256, 128], + "SramBytes": 24672, + "NumWarps": 4, + "BatchStrideCA": self.head_dim, + "BatchStrideNA": ( + self.n_local_heads * seqlen * self.head_dim + ), + "BatchStrideCB": self.head_dim, + "BatchStrideNB": ( + self.n_local_kv_heads * seqlen * self.head_dim + ), + }, + ): + scores = ark.matmul(xq, xk, transpose_other=True) + scores = ark.mul(scores, 1.0 / math.sqrt(self.head_dim), scores) + if mask is not None: + scores = ark.add(scores, mask, scores) + return scores def softmax(scores): with Context( - warp_range=[0, 8], sram_range=[0, 0], sync=False, config={ @@ -522,45 +567,61 @@ def softmax(scores): with Context(config={"ImplType": "WarpWise", "Tile": [1]}): max = ark.reduce_max(scores, axis=-1) with Context(config={"Tile": [1, 2048]}): - output = ark.sub(scores, max) - output = ark.exp(output) + tmp = ark.sub(scores, max) + tmp = ark.exp(tmp) with Context(config={"ImplType": "WarpWise", "Tile": [1]}): - sum = ark.reduce_sum(output, axis=-1) + sum = ark.reduce_sum(tmp, axis=-1) with Context(config={"Tile": [1, 2048]}): - output = ark.div(output, sum) + output = ark.div(tmp, sum) return output + scores = calc_scores(xq_scratch, xk_scratch, mask) scores = softmax(scores) - scores_shards = ark.sharding(scores, axis=1, dim_per_shard=1) - values_shards = ark.sharding(values, axis=2, dim_per_shard=1) - output = ark.tensor([bsz, seqlen, self.n_local_heads, self.head_dim], dtype=self.dtype) - output_shards = ark.sharding(output, axis=2, dim_per_shard=1) + output_scratch = ark.tensor( + [ + bsz, + seqlen * self.n_local_heads, + self.n_local_heads, + self.head_dim, + ], + dtype=self.dtype, + ) + with Context( + sync=False, + config={ + "Tile": [256, 128], + "SramBytes": 24672, + "NumWarps": 4, + "BatchStrideCB": self.head_dim, + "BatchStrideNB": self.n_local_kv_heads * seqlen * self.head_dim, + "BatchStrideCC": self.head_dim, + "BatchStrideNC": self.n_local_kv_heads * seqlen * self.head_dim, + }, + ): + xv = ark.reshape(xv[:, :, 0, :], [bsz, 1, seqlen, self.head_dim]) + output = ark.reshape( + output_scratch[:, :, 0, :], + [bsz, self.n_local_heads, seqlen, self.head_dim], + ) + output = ark.matmul(scores, xv, output) + output = ark.identity( + output_scratch[:, :seqlen, :, :], deps=[output] + ) - results = [] - with Context(processor_range=[0, 304]): - for i in range(len(output_shards)): - with Context( - processor_range=[i*8, (i+1)*8], - warp_range=[0, 4], - sram_range=[0, 24672], - sync=False, - config={ - "NumWarps": 4, - "SramBytes": 24672, - "Tile": [256, 128], - }, - ): - values_shard_reshaped = ark.reshape(values_shards[i], [bsz, 1, seqlen, self.head_dim]) - 
scores_shard_reshaped = ark.reshape(scores_shards[i], [bsz, 1, seqlen, seqlen]) - output_shard_reshaped = ark.reshape(output_shards[i], [bsz, 1, seqlen, self.head_dim]) - res = ark.matmul(scores_shard_reshaped, values_shard_reshaped, output_shard_reshaped) - results.append(res) - output = ark.identity(output, deps=results) output = ark.reshape( output, [bsz, seqlen, self.head_dim * self.n_local_heads] ) - return self.wo(output) + with Context( + config={ + "NumWarps": 4, + "Tile": [256, 128], + "SramBytes": 24672, + }, + sync=False, + ): + output = self.wo(output) + return ark.add(x, output) class TransformerBlock(ark.Module): @@ -597,28 +658,10 @@ def forward( freqs_cis: ark.Tensor, mask: Optional[ark.Tensor], ): - attention_norm_x = self.attention_norm(x) - h = self.attention.forward(attention_norm_x, start_pos, freqs_cis, mask) - with Context( - warp_range=[0, 4], - config={ - "NumWarps": 4, - "Tile": [256, 128], - "SramBytes": 0, - }, - ): - h = ark.add(x, h) - ff = self.feed_forward(self.ffn_norm(h)) - with Context( - warp_range=[0, 4], - config={ - "NumWarps": 4, - "Tile": [256, 128], - "SramBytes": 0, - }, - ): - out = ark.add(h, ff) - return out + h = self.attention.forward( + x, start_pos, freqs_cis, mask, self.attention_norm + ) + return self.feed_forward(h, self.ffn_norm) class Transformer(ark.Module): @@ -658,11 +701,14 @@ def forward( freqs_cis: ark.Tensor, mask: Optional[ark.Tensor], ): - with Context(warp_range=[0, 8]): + with Context(warp_range=[0, 8], sram_range=[0, 49344]): h = self.tok_embeddings(tokens) for layer in self.layers: h = layer(h, start_pos, freqs_cis, mask) h = self.norm(h) - output = self.output(h) + with Context( + config={"Tile": [256, 128], "SramBytes": 24672, "NumWarps": 4} + ): + output = self.output(h) return output diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py index a0850f3ad..2ed2d0e63 100644 --- a/examples/llama/model_test.py +++ b/examples/llama/model_test.py @@ -406,68 +406,6 @@ def test_attention( ) -def test_transformer_block( - args: ModelArgs, - batch_size: int, - seq_len: int, - dtype: np.dtype, - rank: int = 0, - world_size: int = 1, -): - # - freqs_cis = precompute_freqs_cis( - args.dim // args.n_heads, args.max_seq_len * 2 - )[0:seq_len] - - freqs_cis_ark = freqs_cis.astype(np.complex64) - freqs_cis_ark = ( - np.stack([freqs_cis_ark.real, freqs_cis_ark.imag], axis=-1) - .astype(dtype) - .reshape(1, seq_len, 1, args.dim // args.n_heads) - ) - - feature = np.random.uniform( - low=-1, high=1, size=(batch_size, seq_len, args.dim) - ).astype(dtype) - - module = model_ark.Attention( - args, ark.DataType.from_numpy(dtype), rank, world_size - ) - # module_inputs = [ - # ark.tensor(list(i.shape), ark.DataType.from_numpy(i.dtype)) - # if isinstance(i, np.ndarray) - # else i - # for i in inputs - # ] - feature_tensor = ark.tensor( - list(feature.shape), ark.DataType.from_numpy(feature.dtype) - ) - freqs_cis_ark_tensor = ark.tensor( - list(freqs_cis_ark.shape), ark.DataType.from_numpy(freqs_cis_ark.dtype) - ) - output = module(feature_tensor, 0, freqs_cis_ark_tensor, None) - - print(ark.Model.get_model().serialize()) - - # test_module( - # module_class_ark=model_ark.TransformerBlock, - # module_args_ark=[ - # 0, - # args, - # ark.DataType.from_numpy(dtype), - # rank, - # world_size, - # ], - # inputs_ark=[feature, 0, freqs_cis_ark, None], - # module_class_pt=model_pt.TransformerBlock, - # module_args_pt=[0, args], - # inputs_pt=[feature.astype(dtype), 0, freqs_cis, None], - # module_name_prefix="layers.0", - # rank=rank, - 
# world_size=world_size, - # ) - - def test_transformer( args: ModelArgs, batch_size: int, @@ -535,7 +473,6 @@ def test(args, batch_size, seq_len, dtype, rank, world_size): # test_row_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size) # test_column_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size) # test_attention(args, batch_size, seq_len, dtype, rank, world_size) - # test_transformer_block(args, batch_size, seq_len, dtype, rank, world_size) test_transformer(args, batch_size, seq_len, dtype, rank, world_size) diff --git a/python/ark/ops.py b/python/ark/ops.py index 70903125d..46145035a 100644 --- a/python/ark/ops.py +++ b/python/ark/ops.py @@ -13,6 +13,8 @@ __all__ = [ "tensor", "parameter", + "placeholder", + "noop", "reshape", "identity", "sharding", diff --git a/python/ark/planner.py b/python/ark/planner.py index 4b2b7b919..0ed9113e1 100644 --- a/python/ark/planner.py +++ b/python/ark/planner.py @@ -5,6 +5,7 @@ import json from typing import Callable, Dict, List, Any +from . import error from .core import CorePlanner, CorePlannerContext from .model import Model @@ -155,13 +156,27 @@ def processor_groups(self) -> List[Dict[str, Any]]: @staticmethod def from_str(plan_str: str) -> "Plan": - plan = json.loads(plan_str) + try: + plan = json.loads(plan_str) + except json.JSONDecodeError: + raise error.InvalidUsageError( + "Plan string is not a valid JSON string." + ) return Plan(plan) @staticmethod def from_file(file_path: str) -> "Plan": - with open(file_path, "r") as f: - plan = json.load(f) + try: + with open(file_path, "r") as f: + plan = json.load(f) + except FileNotFoundError: + raise error.InvalidUsageError( + f"Plan file {file_path} does not exist." + ) + except json.JSONDecodeError: + raise error.InvalidUsageError( + f"Plan file {file_path} is not a valid JSON file." 
+ ) return Plan(plan) diff --git a/python/ark/profiler.py b/python/ark/profiler.py index f3ed55042..5d7ab8366 100644 --- a/python/ark/profiler.py +++ b/python/ark/profiler.py @@ -9,9 +9,10 @@ from .planner import Plan -def timeit(plan: Plan, iter: int, loop_mode: bool): +def timeit(plan: Plan, iter: int, loop_mode: bool, warmup: int = 3): with Runtime() as rt: rt.launch(plan=plan, loop_mode=loop_mode) + rt.run(iter=warmup) start_time = time.time() rt.run(iter=iter) end_time = time.time() @@ -47,7 +48,10 @@ def run( "ProcessorGroups": [None], } for i in range(num_processor_groups): - if target_processor_groups is not None and i not in target_processor_groups: + if ( + target_processor_groups is not None + and i not in target_processor_groups + ): continue new_plan["ProcessorGroups"][0] = self.plan.processor_groups[i] lat_per_iter = timeit(Plan(new_plan), iter, loop_mode) @@ -81,12 +85,16 @@ def run( type=str, help="Target processor groups to profile", ) - parser.add_argument("--plan", type=str, help="Path to the plan file", required=True) + parser.add_argument( + "--plan", type=str, help="Path to the plan file", required=True + ) args = parser.parse_args() target_processor_groups = None if args.target_processor_groups is not None: - target_processor_groups = list(map(int, args.target_processor_groups.split(","))) + target_processor_groups = list( + map(int, args.target_processor_groups.split(",")) + ) plan = Plan.from_file(args.plan) profiler = Profiler(plan) diff --git a/python/ark/tensor.py b/python/ark/tensor.py index 2ef77ce0e..216318b27 100644 --- a/python/ark/tensor.py +++ b/python/ark/tensor.py @@ -35,9 +35,9 @@ def __init__( initializer (Initializer): The initializer for the Tensor. requires_grad (bool): Whether the tensor requires gradient. Defaults to True. """ - self._tensor = _tensor + self._tensor: CoreTensor = _tensor self.initializer: Initializer = initializer - self.requires_grad = requires_grad + self.requires_grad: bool = requires_grad def __hash__(self): return self._tensor.id() @@ -47,6 +47,69 @@ def __eq__(self, other): return False return self._tensor.id() == other._tensor.id() + def __getitem__(self, index) -> "Tensor": + if not isinstance(index, tuple): + index = (index,) + new_shape = [] + new_strides = [] + new_offsets = [] + new_padded_shape = [] + if len(index) > len(self.shape()): + raise log.InvalidUsageError( + f"Index has more dimensions than the tensor. Index: " + f"{index}, tensor shape: {self.shape()}" + ) + for i, idx in enumerate(index): + shape_len = self.shape()[i] + padded_shape_len = self._padded_shape()[i] + pad_len = padded_shape_len - shape_len + if isinstance(idx, int): + new_shape.append(1) + new_strides.append(self.strides()[i]) + new_offsets.append(idx) + if idx == shape_len - 1: + new_padded_shape.append(1 + pad_len) + else: + new_padded_shape.append(1) + elif isinstance(idx, slice): + start = idx.start or 0 + stop = idx.stop or self.shape()[i] + step = idx.step or 1 + if step < 0: + start, stop = stop + 1, start + 1 + if step != 1 and step != -1: + # TODO: support step other than 1 or -1 + raise log.UnsupportedError( + f"Step must be 1 or -1. Given: {step}" + ) + new_shape.append(stop - start) + new_strides.append(self.strides()[i]) + new_offsets.append(start) + if stop == shape_len: + new_padded_shape.append(stop + pad_len - start) + else: + new_padded_shape.append(stop - start) + else: + raise log.InvalidUsageError( + f"Index must be an integer or a slice. 
Index: {idx}" + ) + new_shape = Dims(new_shape) + new_strides = Dims(new_strides) + new_offsets = Dims(new_offsets) + new_padded_shape = Dims(new_padded_shape) + new_tensor = Tensor( + Model.get_model().refer( + self._tensor, + new_shape, + new_strides, + new_offsets, + new_padded_shape, + "", + ) + ) + new_tensor.requires_grad = self.requires_grad + return new_tensor + @classmethod def __torch_function__(cls, func, types, args=(), kwargs=None): if kwargs is None: @@ -65,6 +128,12 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): new_kwargs[key] = value return func(*new_args, **new_kwargs) + def _padded_shape(self) -> List[int]: + """ + Returns the padded shape of the tensor. + """ + return self._tensor.padded_shape().vector() + def shape(self) -> List[int]: """ Returns the shape of the tensor. diff --git a/python/model_py.cpp b/python/model_py.cpp index 2351bf13e..6568f3a5c 100644 --- a/python/model_py.cpp +++ b/python/model_py.cpp @@ -96,6 +96,9 @@ void register_model(py::module &m) { .def("reduce_sum", &ark::Model::reduce_sum, py::arg("input"), py::arg("axis"), py::arg("keepdims"), py::arg("output"), py::arg("name")) + .def("refer", &ark::Model::refer, py::arg("input"), py::arg("shape"), + py::arg("strides"), py::arg("offsets"), py::arg("padded_shape"), + py::arg("name")) .def("relu", &ark::Model::relu, py::arg("input"), py::arg("output"), py::arg("name")) .def("reshape", &ark::Model::reshape, py::arg("input"), diff --git a/python/unittest/test_tensor.py b/python/unittest/test_tensor.py index 799c1f60f..c8be143f0 100644 --- a/python/unittest/test_tensor.py +++ b/python/unittest/test_tensor.py @@ -2,6 +2,26 @@ # Licensed under the MIT license. from common import ark, pytest_ark +import numpy as np + + +@pytest_ark() +def test_tensor_slice(): + t0 = ark.ones([4, 64], ark.fp16) + t1 = t0[2:, :] + ark.noop(t1) + + assert t1.shape() == [2, 64] + assert t1.dtype() == ark.fp16 + assert t1.strides() == [4, 64] + + with ark.Runtime() as rt: + rt.launch() + rt.run() + + x = t1.to_numpy() + + assert np.allclose(x, np.ones([2, 64], np.float16)) @pytest_ark(need_torch=True) From 6b5550e3cd1b5347c3e040458bbfb19ec9affd59 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 30 Sep 2024 06:26:22 +0000 Subject: [PATCH 096/106] update --- examples/llama/model.py | 264 +++++++++++++++++++++++++++------------- 1 file changed, 182 insertions(+), 82 deletions(-) diff --git a/examples/llama/model.py b/examples/llama/model.py index e81f239f2..fa52a522e 100644 --- a/examples/llama/model.py +++ b/examples/llama/model.py @@ -11,6 +11,11 @@ from typing import Optional from ark import PlannerContext as Context +NUM_SM = 304 +NUM_WARPS_PER_SM = 8 +NUM_WARPS = NUM_SM * NUM_WARPS_PER_SM +WARP_SIZE = 64 +SRAM_PER_SM = 65536 @dataclass class ModelArgs: @@ -88,6 +93,7 @@ def __init__( self.eps = eps self.dtype = dtype self.weight = ark.parameter([1, 1, dim], ark.fp32) + self.dim = dim def forward(self, x): with Context( @@ -98,14 +104,14 @@ def forward(self, x): "Granularity": 7, }, ): - with Context(config={"Tile": [1, 4096]}): + with Context(config={"Tile": [self.dim]}): x = ark.cast(x, ark.fp32) x2 = ark.mul(x, x) with Context(config={"Tile": [1], "ImplType": "WarpWise"}): mean = ark.reduce_mean(x2, axis=-1) mean = ark.add(mean, self.eps) rrms = ark.rsqrt(mean) - with Context(config={"Tile": [1, 4096]}): + with Context(config={"Tile": [self.dim]}): x = ark.mul(x, rrms) x = ark.mul(x, self.weight, x) return ark.cast(x, self.dtype) @@ -248,9 +254,51 @@ def __init__( self.world_size = world_size 
self.local_rank = local_rank - def forward(self, x): + def forward(self, x: ark.Tensor): if self.world_size == 1: - return ark.embedding(x, self.weight) + config = {"SramBytes": 0} + num_vecs = x.nelems() + if num_vecs >= NUM_WARPS: + config.update({"NumWarps": 1, "Tile": [self.dim]}) + num_parts = 1 + else: + min_elem_per_warp = WARP_SIZE * 2 + max_warps_per_vec = (self.dim + min_elem_per_warp - 1) // min_elem_per_warp + warps_per_vec = min(max_warps_per_vec, NUM_WARPS // num_vecs) + if warps_per_vec <= NUM_WARPS_PER_SM: + config.update({"NumWarps": warps_per_vec, "Tile": [self.dim]}) + num_parts = 1 + else: + num_parts = warps_per_vec // NUM_WARPS_PER_SM + max_num_parts = 4 + assert NUM_SM % max_num_parts == 0 + assert 2 ** (max_num_parts.bit_length() - 1) == max_num_parts + if num_parts > max_num_parts: + num_parts = max_num_parts + # make it max power of 2 smaller than num_parts + num_parts = 2 ** (num_parts.bit_length() - 1) + config.update( + { + "NumWarps": NUM_WARPS_PER_SM, + "Tile": [self.dim // num_parts], + } + ) + with Context(processor_range=[0, NUM_SM], config=config): + if num_parts == 1: + return ark.embedding(x, self.weight) + emb_output = ark.tensor([x.shape()[0], x.shape()[1], self.dim], self.dtype) + emb_parts = [] + dim_per_part = self.dim // num_parts + for i in range(num_parts): + with Context(processor_range=[i * NUM_SM // num_parts, (i + 1) * NUM_SM // num_parts]): + emb_parts.append( + ark.embedding( + x, + self.weight[:, (i * dim_per_part) : ((i + 1) * dim_per_part)], + emb_output[:, :, (i * dim_per_part) : ((i + 1) * dim_per_part)], + ) + ) + return ark.identity(emb_output, deps=emb_parts) output_tensor = ark.tensor( [x.shape()[0], x.shape()[1], self.out_dim], self.dtype @@ -274,22 +322,6 @@ def forward(self, x): ) -class Linear(ark.Module): - """ - Linear layer module with weights and no bias. 
- """ - - def __init__( - self, in_dim: int, out_dim: int, dtype: ark.DataType = ark.fp16 - ): - super().__init__() - self.dtype = dtype - self.weight = ark.parameter([out_dim, in_dim], dtype) - - def forward(self, x): - return ark.matmul(x, self.weight, transpose_other=True) - - class Silu(ark.Module): """ Silu activation function, silu(x) = x * sigmoid(x) @@ -324,6 +356,7 @@ def __init__( hidden_dim = multiple_of * ( (hidden_dim + multiple_of - 1) // multiple_of ) + self.hidden_dim = hidden_dim self.w1 = ColumnParallelLinear( dim, hidden_dim, dtype, False, local_rank, world_size @@ -337,21 +370,31 @@ def __init__( def forward(self, x, ffn_norm): h = ffn_norm(x) + + seqlen = h.shape()[1] + schedule = None + if seqlen == 2048: + schedule = [ + [1792, [256, 128], 24672], + [256, [128, 128], 16480], + ] + elif seqlen == 128: + schedule = [ + [128, [128, 64], 16480], + ] + else: + raise ValueError(f"Unsupported seqlen {seqlen}") + with Context( - processor_range=[0, 304], - sram_range=[0, 49344], - config={"NumWarps": 4}, + processor_range=[0, NUM_SM], config={"NumWarps": 4}, ): out_shape = h.shape() out_shape[-1] = self.w1.out_dim out = ark.tensor(out_shape, h.dtype()) pos = 0 - for dim, tile, sram in [ - [1792, [256, 128], 24672], - [256, [128, 128], 16480], - ]: + for dim, tile, sram in schedule: with Context( - processor_range=[0, 304], sync=False, config={"Tile": tile} + processor_range=[0, NUM_SM], sync=False, config={"Tile": tile} ): h_shard = h[:, pos : pos + dim, :] out_shard = out[:, pos : pos + dim, :] @@ -363,7 +406,7 @@ def forward(self, x, ffn_norm): x1 = Silu()(x1) # We don't need a barrier here but somehow the performance is better with it with Context( - processor_range=[0, 304], sync=False, config={"Tile": tile} + processor_range=[0, NUM_SM], sync=False, config={"Tile": tile} ): with Context(config={"SramBytes": sram}): x2 = ark.matmul( @@ -374,12 +417,21 @@ def forward(self, x, ffn_norm): out = ark.identity(out, deps=[x3]) pos += dim + if seqlen == 2048: + tile = [256, 128] + sram = 24672 + elif seqlen == 128: + tile = [128, 64] + sram = 16480 + else: + raise ValueError(f"Unsupported seqlen {seqlen}") + with Context( warp_range=[0, 4], config={ "NumWarps": 4, - "Tile": [256, 128], - "SramBytes": 24672, + "Tile": tile, + "SramBytes": sram, }, sync=False, ): @@ -410,6 +462,7 @@ def __init__( ) model_parallel_size = world_size self.dtype = dtype + self.args = args self.n_local_heads = args.n_heads // model_parallel_size self.n_local_kv_heads = self.n_kv_heads // model_parallel_size self.n_rep = self.n_local_heads // self.n_local_kv_heads @@ -478,51 +531,60 @@ def forward( self.dtype, ) - with Context( - warp_range=[0, 4], - sram_range=[0, 24672], - sync=False, - config={"NumWarps": 4}, - ): - with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): - xq = ark.matmul(x_norm, self.wq.weight, transpose_other=True) - xq = ark.reshape( - xq, [bsz, seqlen, self.n_local_heads, self.head_dim] - ) - with Context(config={"SramBytes": 0, "Tile": [256, 1, 128]}): - if freqs_cis is not None: - xq = ark.rope(xq, freqs_cis, xq_scratch[:, :seqlen, :, :]) + if seqlen == 2048: + tile = [256, 128] + sram = 24672 + elif seqlen == 128: + tile = [128, 64] + sram = 16480 + else: + raise ValueError(f"Unsupported seqlen {seqlen}") - xq_scratch = ark.identity(xq_scratch, deps=[xq]) + num_tile = seqlen * self.args.dim // tile[0] // tile[1] - with Context( - warp_range=[0, 4], - sram_range=[0, 24672], - sync=False, - config={"NumWarps": 4}, - ): - with Context(config={"SramBytes": 24672, "Tile": 
[256, 128]}): - xk = ark.matmul(x_norm, self.wk.weight, transpose_other=True) - xk = ark.reshape( - xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - ) - with Context(config={"SramBytes": 0, "Tile": [256, 1, 128]}): - if freqs_cis is not None: - xk = ark.rope(xk, freqs_cis, xk_scratch[:, :seqlen, :, :]) + with Context(processor_range=[0, NUM_SM]): + with Context( + processor_range=[0, NUM_SM // 3], + config={"NumWarps": 4}, + sync=False, + ): + with Context(config={"SramBytes": sram, "Tile": tile}): + xq = ark.matmul(x_norm, self.wq.weight, transpose_other=True) + xq = ark.reshape( + xq, [bsz, seqlen, self.n_local_heads, self.head_dim] + ) + with Context(config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}): + if freqs_cis is not None: + xq = ark.rope(xq, freqs_cis, xq_scratch[:, :seqlen, :, :]) - xk_scratch = ark.identity(xk_scratch, deps=[xk]) + xq_scratch = ark.identity(xq_scratch, deps=[xq]) - with Context( - warp_range=[0, 4], - sram_range=[0, 24672], - sync=False, - config={"NumWarps": 4}, - ): - with Context(config={"SramBytes": 24672, "Tile": [256, 128]}): - xv = ark.matmul(x_norm, self.wv.weight, transpose_other=True) - xv = ark.reshape( - xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - ) + with Context( + processor_range=[NUM_SM // 3, 2 * NUM_SM // 3], + config={"NumWarps": 4}, + sync=False, + ): + with Context(config={"SramBytes": sram, "Tile": tile}): + xk = ark.matmul(x_norm, self.wk.weight, transpose_other=True) + xk = ark.reshape( + xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + with Context(config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}): + if freqs_cis is not None: + xk = ark.rope(xk, freqs_cis, xk_scratch[:, :seqlen, :, :]) + + xk_scratch = ark.identity(xk_scratch, deps=[xk]) + + with Context( + processor_range=[2 * NUM_SM // 3, NUM_SM], + config={"NumWarps": 4}, + sync=False, + ): + with Context(config={"SramBytes": sram, "Tile": tile}): + xv = ark.matmul(x_norm, self.wv.weight, transpose_other=True) + xv = ark.reshape( + xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) def calc_scores(xq_scratch, xk_scratch, mask): xq = xq_scratch[:, :, 0, :] @@ -533,11 +595,20 @@ def calc_scores(xq_scratch, xk_scratch, mask): xk = ark.reshape( xk, [bsz, self.n_local_kv_heads, seqlen, self.head_dim] ) + if seqlen == 2048: + tile = [256, 128] + sram = 24672 + elif seqlen == 128: + tile = [128, 128] + sram = 16480 + else: + raise ValueError(f"Unsupported seqlen {seqlen}") + with Context( sync=False, config={ - "Tile": [256, 128], - "SramBytes": 24672, + "Tile": tile, + "SramBytes": sram, "NumWarps": 4, "BatchStrideCA": self.head_dim, "BatchStrideNA": ( @@ -566,12 +637,12 @@ def softmax(scores): ): with Context(config={"ImplType": "WarpWise", "Tile": [1]}): max = ark.reduce_max(scores, axis=-1) - with Context(config={"Tile": [1, 2048]}): + with Context(config={"Tile": [seqlen]}): tmp = ark.sub(scores, max) tmp = ark.exp(tmp) with Context(config={"ImplType": "WarpWise", "Tile": [1]}): sum = ark.reduce_sum(tmp, axis=-1) - with Context(config={"Tile": [1, 2048]}): + with Context(config={"Tile": [seqlen]}): output = ark.div(tmp, sum) return output @@ -587,11 +658,20 @@ def softmax(scores): ], dtype=self.dtype, ) + if seqlen == 2048: + tile = [256, 128] + sram = 24672 + elif seqlen == 128: + tile = [128, 128] + sram = 16480 + else: + raise ValueError(f"Unsupported seqlen {seqlen}") + with Context( sync=False, config={ - "Tile": [256, 128], - "SramBytes": 24672, + "Tile": tile, + "SramBytes": sram, "NumWarps": 4, "BatchStrideCB": 
self.head_dim, "BatchStrideNB": self.n_local_kv_heads * seqlen * self.head_dim, @@ -612,11 +692,20 @@ def softmax(scores): output = ark.reshape( output, [bsz, seqlen, self.head_dim * self.n_local_heads] ) + if seqlen == 2048: + tile = [256, 128] + sram = 24672 + elif seqlen == 128: + tile = [128, 128] + sram = 16480 + else: + raise ValueError(f"Unsupported seqlen {seqlen}") + with Context( config={ "NumWarps": 4, - "Tile": [256, 128], - "SramBytes": 24672, + "Tile": tile, + "SramBytes": sram, }, sync=False, ): @@ -701,14 +790,25 @@ def forward( freqs_cis: ark.Tensor, mask: Optional[ark.Tensor], ): - with Context(warp_range=[0, 8], sram_range=[0, 49344]): + with Context(warp_range=[0, NUM_WARPS_PER_SM], sram_range=[0, 49344]): h = self.tok_embeddings(tokens) for layer in self.layers: h = layer(h, start_pos, freqs_cis, mask) h = self.norm(h) + + seqlen = h.shape()[1] + if seqlen == 2048: + tile = [256, 128] + sram = 24672 + elif seqlen == 128: + tile = [128, 128] + sram = 16480 + else: + raise ValueError(f"Unsupported seqlen {seqlen}") + with Context( - config={"Tile": [256, 128], "SramBytes": 24672, "NumWarps": 4} + config={"Tile": tile, "SramBytes": sram, "NumWarps": 4} ): output = self.output(h) return output From f428ea89fa0b8753504ba1295d6db710d2cad21e Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 30 Sep 2024 06:41:51 +0000 Subject: [PATCH 097/106] update --- examples/llama/model.py | 114 ++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 57 deletions(-) diff --git a/examples/llama/model.py b/examples/llama/model.py index fa52a522e..a3ed20392 100644 --- a/examples/llama/model.py +++ b/examples/llama/model.py @@ -531,61 +531,6 @@ def forward( self.dtype, ) - if seqlen == 2048: - tile = [256, 128] - sram = 24672 - elif seqlen == 128: - tile = [128, 64] - sram = 16480 - else: - raise ValueError(f"Unsupported seqlen {seqlen}") - - num_tile = seqlen * self.args.dim // tile[0] // tile[1] - - with Context(processor_range=[0, NUM_SM]): - with Context( - processor_range=[0, NUM_SM // 3], - config={"NumWarps": 4}, - sync=False, - ): - with Context(config={"SramBytes": sram, "Tile": tile}): - xq = ark.matmul(x_norm, self.wq.weight, transpose_other=True) - xq = ark.reshape( - xq, [bsz, seqlen, self.n_local_heads, self.head_dim] - ) - with Context(config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}): - if freqs_cis is not None: - xq = ark.rope(xq, freqs_cis, xq_scratch[:, :seqlen, :, :]) - - xq_scratch = ark.identity(xq_scratch, deps=[xq]) - - with Context( - processor_range=[NUM_SM // 3, 2 * NUM_SM // 3], - config={"NumWarps": 4}, - sync=False, - ): - with Context(config={"SramBytes": sram, "Tile": tile}): - xk = ark.matmul(x_norm, self.wk.weight, transpose_other=True) - xk = ark.reshape( - xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - ) - with Context(config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}): - if freqs_cis is not None: - xk = ark.rope(xk, freqs_cis, xk_scratch[:, :seqlen, :, :]) - - xk_scratch = ark.identity(xk_scratch, deps=[xk]) - - with Context( - processor_range=[2 * NUM_SM // 3, NUM_SM], - config={"NumWarps": 4}, - sync=False, - ): - with Context(config={"SramBytes": sram, "Tile": tile}): - xv = ark.matmul(x_norm, self.wv.weight, transpose_other=True) - xv = ark.reshape( - xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] - ) - def calc_scores(xq_scratch, xk_scratch, mask): xq = xq_scratch[:, :, 0, :] xk = xk_scratch[:, :, 0, :] @@ -646,8 +591,63 @@ def softmax(scores): output = ark.div(tmp, sum) return output - scores = 
calc_scores(xq_scratch, xk_scratch, mask) - scores = softmax(scores) + if seqlen == 2048: + tile = [256, 128] + sram = 24672 + elif seqlen == 128: + tile = [128, 64] + sram = 16480 + else: + raise ValueError(f"Unsupported seqlen {seqlen}") + + with Context( + processor_range=[0, 128], + config={"NumWarps": 4}, + sync=False, + ): + with Context(config={"SramBytes": sram, "Tile": tile}): + xq = ark.matmul(x_norm, self.wq.weight, transpose_other=True) + xq = ark.reshape( + xq, [bsz, seqlen, self.n_local_heads, self.head_dim] + ) + with Context(config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}): + if freqs_cis is not None: + xq = ark.rope(xq, freqs_cis, xq_scratch[:, :seqlen, :, :]) + + xq_scratch = ark.identity(xq_scratch, deps=[xq]) + + with Context( + processor_range=[128, 256], + config={"NumWarps": 4}, + sync=False, + ): + with Context(config={"SramBytes": sram, "Tile": tile}): + xk = ark.matmul(x_norm, self.wk.weight, transpose_other=True) + xk = ark.reshape( + xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + with Context(config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}): + if freqs_cis is not None: + xk = ark.rope(xk, freqs_cis, xk_scratch[:, :seqlen, :, :]) + + xk_scratch = ark.identity(xk_scratch, deps=[xk]) + + with Context( + processor_range=[256, NUM_SM], + config={"NumWarps": 4}, + sync=False, + ): + with Context(config={"SramBytes": sram, "Tile": tile}): + xv = ark.matmul(x_norm, self.wv.weight, transpose_other=True) + xv = ark.reshape( + xv, [bsz, seqlen, self.n_local_kv_heads, self.head_dim] + ) + + with Context( + processor_range=[0, 256], + ): + scores = calc_scores(xq_scratch, xk_scratch, mask) + scores = softmax(scores) output_scratch = ark.tensor( [ From 2075a051eeb6b7556b1b360ae813c08085acbf69 Mon Sep 17 00:00:00 2001 From: Changho Hwang Date: Mon, 30 Sep 2024 07:10:30 +0000 Subject: [PATCH 098/106] update --- examples/llama/model.py | 133 ++++++++++++++++++++++++++++------------ 1 file changed, 93 insertions(+), 40 deletions(-) diff --git a/examples/llama/model.py b/examples/llama/model.py index a3ed20392..cd1bede29 100644 --- a/examples/llama/model.py +++ b/examples/llama/model.py @@ -17,6 +17,7 @@ WARP_SIZE = 64 SRAM_PER_SM = 65536 + @dataclass class ModelArgs: dim: int = 4096 @@ -263,16 +264,22 @@ def forward(self, x: ark.Tensor): num_parts = 1 else: min_elem_per_warp = WARP_SIZE * 2 - max_warps_per_vec = (self.dim + min_elem_per_warp - 1) // min_elem_per_warp + max_warps_per_vec = ( + self.dim + min_elem_per_warp - 1 + ) // min_elem_per_warp warps_per_vec = min(max_warps_per_vec, NUM_WARPS // num_vecs) if warps_per_vec <= NUM_WARPS_PER_SM: - config.update({"NumWarps": warps_per_vec, "Tile": [self.dim]}) + config.update( + {"NumWarps": warps_per_vec, "Tile": [self.dim]} + ) num_parts = 1 else: num_parts = warps_per_vec // NUM_WARPS_PER_SM max_num_parts = 4 assert NUM_SM % max_num_parts == 0 - assert 2 ** (max_num_parts.bit_length() - 1) == max_num_parts + assert ( + 2 ** (max_num_parts.bit_length() - 1) == max_num_parts + ) if num_parts > max_num_parts: num_parts = max_num_parts # make it max power of 2 smaller than num_parts @@ -286,16 +293,34 @@ def forward(self, x: ark.Tensor): with Context(processor_range=[0, NUM_SM], config=config): if num_parts == 1: return ark.embedding(x, self.weight) - emb_output = ark.tensor([x.shape()[0], x.shape()[1], self.dim], self.dtype) + emb_output = ark.tensor( + [x.shape()[0], x.shape()[1], self.dim], self.dtype + ) emb_parts = [] dim_per_part = self.dim // num_parts for i in range(num_parts): - with 
Context(processor_range=[i * NUM_SM // num_parts, (i + 1) * NUM_SM // num_parts]): + with Context( + processor_range=[ + i * NUM_SM // num_parts, + (i + 1) * NUM_SM // num_parts, + ] + ): emb_parts.append( ark.embedding( x, - self.weight[:, (i * dim_per_part) : ((i + 1) * dim_per_part)], - emb_output[:, :, (i * dim_per_part) : ((i + 1) * dim_per_part)], + self.weight[ + :, + (i * dim_per_part) : ( + (i + 1) * dim_per_part + ), + ], + emb_output[ + :, + :, + (i * dim_per_part) : ( + (i + 1) * dim_per_part + ), + ], ) ) return ark.identity(emb_output, deps=emb_parts) @@ -385,37 +410,61 @@ def forward(self, x, ffn_norm): else: raise ValueError(f"Unsupported seqlen {seqlen}") - with Context( - processor_range=[0, NUM_SM], config={"NumWarps": 4}, - ): - out_shape = h.shape() - out_shape[-1] = self.w1.out_dim - out = ark.tensor(out_shape, h.dtype()) - pos = 0 - for dim, tile, sram in schedule: - with Context( - processor_range=[0, NUM_SM], sync=False, config={"Tile": tile} - ): - h_shard = h[:, pos : pos + dim, :] - out_shard = out[:, pos : pos + dim, :] - with Context(config={"SramBytes": sram}): - x1 = ark.matmul( - h_shard, self.w1.weight, transpose_other=True - ) - with Context(config={"SramBytes": 0}): - x1 = Silu()(x1) - # We don't need a barrier here but somehow the performance is better with it - with Context( - processor_range=[0, NUM_SM], sync=False, config={"Tile": tile} - ): - with Context(config={"SramBytes": sram}): - x2 = ark.matmul( - h_shard, self.w3.weight, transpose_other=True - ) - with Context(config={"SramBytes": 0}): - x3 = ark.mul(x1, x2, out_shard) - out = ark.identity(out, deps=[x3]) - pos += dim + out_shape = h.shape() + out_shape[-1] = self.w1.out_dim + out = ark.tensor(out_shape, h.dtype()) + pos = 0 + + dim, tile, sram = schedule[0] + + with Context(sync=False, config={"Tile": tile, "NumWarps": 4}): + h_shard = h[:, pos : pos + dim, :] + out_shard = out[:, pos : pos + dim, :] + with Context(config={"SramBytes": sram}): + x1 = ark.matmul(h_shard, self.w1.weight, transpose_other=True) + with Context(config={"SramBytes": 0}): + x1 = Silu()(x1) + + # We don't need a barrier here but somehow the performance is better with it + with Context(sync=False, config={"Tile": tile, "NumWarps": 4}): + with Context(config={"SramBytes": sram}): + x2 = ark.matmul(h_shard, self.w3.weight, transpose_other=True) + with Context(config={"SramBytes": 0}): + x3 = ark.mul(x1, x2, out_shard) + out = ark.identity(out, deps=[x3]) + pos += dim + + if len(schedule) > 1: + dim, tile, sram = schedule[1] + with Context( + processor_range=[0, NUM_SM // 2], + sync=False, + config={"Tile": tile, "NumWarps": 4}, + ): + h_shard = h[:, pos : pos + dim, :] + out_shard = out[:, pos : pos + dim, :] + with Context(config={"SramBytes": sram}): + x1 = ark.matmul( + h_shard, self.w1.weight, transpose_other=True + ) + with Context(config={"SramBytes": 0}): + x1 = Silu()(x1) + + with Context( + processor_range=[NUM_SM // 2, NUM_SM], + sync=False, + config={"Tile": tile, "NumWarps": 4, "SramBytes": sram}, + ): + x2 = ark.matmul(h_shard, self.w3.weight, transpose_other=True) + with Context( + processor_range=[0, NUM_SM], + sync=False, + config={"Tile": tile, "NumWarps": 4}, + ): + with Context(config={"SramBytes": 0}): + x3 = ark.mul(x1, x2, out_shard) + out = ark.identity(out, deps=[x3]) + pos += dim if seqlen == 2048: tile = [256, 128] @@ -610,7 +659,9 @@ def softmax(scores): xq = ark.reshape( xq, [bsz, seqlen, self.n_local_heads, self.head_dim] ) - with Context(config={"SramBytes": 0, "Tile": [tile[0], 1, 
tile[1]]}):
+            with Context(
+                config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}
+            ):
                 if freqs_cis is not None:
                     xq = ark.rope(xq, freqs_cis, xq_scratch[:, :seqlen, :, :])

@@ -626,7 +677,9 @@ def softmax(scores):
             xk = ark.reshape(
                 xk, [bsz, seqlen, self.n_local_kv_heads, self.head_dim]
             )
-            with Context(config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}):
+            with Context(
+                config={"SramBytes": 0, "Tile": [tile[0], 1, tile[1]]}
+            ):
                 if freqs_cis is not None:
                     xk = ark.rope(xk, freqs_cis, xk_scratch[:, :seqlen, :, :])

From 0e33e201e3dabda91f285910d5b1e5ef47eb2190 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 30 Sep 2024 08:57:56 +0000
Subject: [PATCH 099/106] fixes

---
 python/ark/profiler.py | 21 +++++++++++++++------
 python/ark/runtime.py  |  5 +++--
 2 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/python/ark/profiler.py b/python/ark/profiler.py
index 5d7ab8366..da346cb7b 100644
--- a/python/ark/profiler.py
+++ b/python/ark/profiler.py
@@ -11,12 +11,21 @@

 def timeit(plan: Plan, iter: int, loop_mode: bool, warmup: int = 3):
     with Runtime() as rt:
-        rt.launch(plan=plan, loop_mode=loop_mode)
-        rt.run(iter=warmup)
-        start_time = time.time()
-        rt.run(iter=iter)
-        end_time = time.time()
-        return (end_time - start_time) / iter
+        if loop_mode:
+            rt.launch(plan=plan, loop_mode=loop_mode)
+            rt.run(iter=warmup)
+            rt.stop()
+            start_time = time.time()
+            rt.run(iter=iter)
+            elapsed = time.time() - start_time
+        else:
+            rt.launch(plan=plan, loop_mode=loop_mode)
+            rt.run(iter=warmup)
+            rt.stop()
+            rt.launch(plan=plan, loop_mode=loop_mode, record=True)
+            rt.run(iter=iter)
+            elapsed = rt.stop() / 1.0e3
+        return elapsed / iter


 class Profiler:
diff --git a/python/ark/runtime.py b/python/ark/runtime.py
index af1eb995e..0edfd26ec 100644
--- a/python/ark/runtime.py
+++ b/python/ark/runtime.py
@@ -65,6 +65,7 @@ def launch(
         device_id: int = -1,
         stream: int = 0,
         loop_mode: bool = True,
+        record: bool = False,
         tensor_mappings: Dict = {},
     ):
         """
@@ -93,7 +94,7 @@ def launch(
         exe = Executor.get()
         if plan_str != exe.plan() or device_id != exe.device_id():
             exe.compile(plan_str, device_id)
-        exe.launch(tensor_mappings, stream, loop_mode)
+        exe.launch(tensor_mappings, stream, loop_mode, record)
         self.state = Runtime.StateCode.LaunchedNotRunning
         self.loop_mode = loop_mode

@@ -148,7 +149,7 @@ def stop(self) -> float:
         """
         if not self.launched():
             log.WARN(f"ARK runtime is never launched, skip stopping")
-            return
+            return -1
         elapsed = Executor.get().stop()
         self.state = Runtime.StateCode.LaunchedNotRunning
         return elapsed

From dfc574d5aa4b20382419ef456bf409ca0f13266a Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 30 Sep 2024 09:11:14 +0000
Subject: [PATCH 100/106] minor fix

---
 ark/api/planner.cpp | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/ark/api/planner.cpp b/ark/api/planner.cpp
index 24036b8f0..c2c98b216 100644
--- a/ark/api/planner.cpp
+++ b/ark/api/planner.cpp
@@ -196,13 +196,25 @@ std::string Planner::Impl::plan(bool pretty) const {
             auto &result_tensors = op->result_tensors();
             if (!result_tensors.empty() && config.contains("Tile")) {
+                const std::vector<DimType> tile_vec = config["Tile"];
+                std::vector<DimType> trim_leading_ones;
+                for (size_t i = 0; i < tile_vec.size(); i++) {
+                    if (tile_vec[i] != 1) {
+                        trim_leading_ones = std::vector<DimType>(
+                            tile_vec.begin() + i, tile_vec.end());
+                        break;
+                    }
+                }
+                if (trim_leading_ones.empty()) {
+                    trim_leading_ones.push_back(1);
+                }
+                Dims tile(trim_leading_ones);
+
                 std::stringstream ss;
-                ss << "Result shape is not divided by tile. Op: "
-                   << op->serialize().dump();
+                ss << "Result shape is not divided by tile " << tile
+                   << ". Op: " << op->serialize().dump();
                 auto not_divided_error = ss.str();
-                const std::vector<DimType> tile_vec = config["Tile"];
-                auto tile = Dims(tile_vec);
                 auto &result_shape = result_tensors[0]->padded_shape();
                 if (result_shape.ndims() < tile.ndims()) {
                     ERR(PlanError, not_divided_error);

From 02bf517cc2ea3a7c3510354cf91a040bb131d77f Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Mon, 30 Sep 2024 09:13:56 +0000
Subject: [PATCH 101/106] delete test

---
 examples/llama/test.py | 56 ------------------------------------------
 1 file changed, 56 deletions(-)
 delete mode 100644 examples/llama/test.py

diff --git a/examples/llama/test.py b/examples/llama/test.py
deleted file mode 100644
index 951dc0edc..000000000
--- a/examples/llama/test.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
-
-import sys
-sys.path.append("llama")
-
-from typing import List, Optional
-
-import fire
-import time
-
-from llama import Llama
-import torch
-
-
-def main(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    seq_len: int = 128,
-    batch_size: int = 256,
-    gen_len: int = 128,
-    warmup: int = 3,
-    iteration: int = 5,
-):
-    total_len = seq_len + gen_len
-
-    generator = Llama.build(
-        ckpt_dir=ckpt_dir,
-        tokenizer_path=tokenizer_path,
-        max_seq_len=total_len,
-        max_batch_size=batch_size,
-    )
-
-    tokens = torch.randint(
-        low=0, high=generator.tokenizer.n_words - 1, size=(batch_size, total_len), dtype=torch.int32
-    )
-
-    print(f"Profiling... (seq_len={seq_len}, batch_size={batch_size}, gen_len={gen_len}, warmup={warmup}, iteration={iteration})")
-
-    def gen():
-        _ = generator.model.forward(tokens[:, :seq_len], 0)
-        for pos in range(1, gen_len):
-            _ = generator.model.forward(tokens[:, (seq_len + pos - 1):(seq_len + pos)], pos)
-
-    for _ in range(warmup):
-        gen()
-    start = time.time()
-    for _ in range(iteration):
-        gen()
-    end = time.time()
-    print(f"Elapsed: {(end - start)/iteration:.5f} sec/iteration")
-
-
-
-if __name__ == "__main__":
-    fire.Fire(main)

From 7d05da82d8842aa4394e602075801f7d78a3b6e9 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Tue, 8 Oct 2024 23:31:59 +0000
Subject: [PATCH 102/106] updates

---
 examples/llama/model.py      |  6 ++---
 examples/llama/model_test.py | 44 ------------------------------------
 2 files changed, 3 insertions(+), 47 deletions(-)

diff --git a/examples/llama/model.py b/examples/llama/model.py
index cd1bede29..ebd424612 100644
--- a/examples/llama/model.py
+++ b/examples/llama/model.py
@@ -405,7 +405,7 @@ def forward(self, x, ffn_norm):
             ]
         elif seqlen == 128:
             schedule = [
-                [128, [128, 64], 16480],
+                [128, [128, 64], 12384],
             ]
         else:
             raise ValueError(f"Unsupported seqlen {seqlen}")
@@ -471,7 +471,7 @@ def forward(self, x, ffn_norm):
             sram = 24672
         elif seqlen == 128:
             tile = [128, 64]
-            sram = 16480
+            sram = 12384
         else:
             raise ValueError(f"Unsupported seqlen {seqlen}")
@@ -645,7 +645,7 @@ def softmax(scores):
             sram = 24672
         elif seqlen == 128:
             tile = [128, 64]
-            sram = 16480
+            sram = 12384
         else:
             raise ValueError(f"Unsupported seqlen {seqlen}")

diff --git a/examples/llama/model_test.py b/examples/llama/model_test.py
index 2ed2d0e63..6b7f3a5bb 100644
--- a/examples/llama/model_test.py
+++ b/examples/llama/model_test.py
@@ -363,49 +363,6 @@ def test_column_parallel_linear(
     )


-def test_attention(
-    args: ModelArgs,
-    batch_size: int,
-    seq_len: int,
-    dtype: 
np.dtype,
-    rank: int = 0,
-    world_size: int = 1,
-):
-    #
-    freqs_cis = precompute_freqs_cis(
-        args.dim // args.n_heads, args.max_seq_len * 2
-    )[0:seq_len]
-
-    freqs_cis_ark = freqs_cis.astype(np.complex64)
-    freqs_cis_ark = (
-        np.stack([freqs_cis_ark.real, freqs_cis_ark.imag], axis=-1)
-        .astype(dtype)
-        .reshape(1, seq_len, 1, args.dim // args.n_heads)
-    )
-
-    seed = 1695878986  # int(time.time())
-    print(f"seed: {seed}")
-    np.random.seed(seed)
-    feature = np.random.uniform(
-        low=-0.1, high=0.1, size=(batch_size, seq_len, args.dim)
-    ).astype(dtype)
-
-    test_module(
-        module_class_ark=model_ark.Attention,
-        module_args_ark=[
-            args,
-            ark.DataType.from_numpy(dtype),
-            rank,
-            world_size,
-        ],
-        inputs_ark=[feature, 0, freqs_cis_ark, None],
-        module_class_pt=model_pt.Attention,
-        module_args_pt=[args],
-        inputs_pt=[feature.astype(dtype), 0, freqs_cis, None],
-        module_name_prefix="layers.0.attention",
-    )
-
-
 def test_transformer(
     args: ModelArgs,
     batch_size: int,
@@ -472,7 +429,6 @@ def test(args, batch_size, seq_len, dtype, rank, world_size):
     # test_rmsnorm(args, batch_size, seq_len, dtype)
     # test_row_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size)
    # test_column_parallel_linear(args, batch_size, seq_len, dtype, rank, world_size)
-    # test_attention(args, batch_size, seq_len, dtype, rank, world_size)
     test_transformer(args, batch_size, seq_len, dtype, rank, world_size)


From e87f96bfa374167491a3a7a92280cf6f87fb4350 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Thu, 23 Jan 2025 23:35:33 +0000
Subject: [PATCH 103/106] updates

---
 CMakeLists.txt                          |  3 --
 ark/CMakeLists.txt                      |  4 +-
 ark/api/executor.cpp                    | 26 ++++++------
 ark/codegen.cpp                         |  4 +-
 ark/include/kernels/comm.h              |  4 +-
 examples/tutorial/planner_tutorial_2.py | 56 +++++++++++++++++++++++++
 third_party/CMakeLists.txt              | 14 +++----
 third_party/mscclpp                     |  2 +-
 8 files changed, 81 insertions(+), 32 deletions(-)
 create mode 100644 examples/tutorial/planner_tutorial_2.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3b09b0e6..8d5de19d1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -91,9 +91,6 @@ if(CMAKE_BUILD_TYPE MATCHES "Debug" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang
     target_link_options(coverage_config INTERFACE --coverage)
 endif()

-# Find ibverbs
-include(FindIBVerbs)
-
 # Find NUMA
 include(FindNUMA)

diff --git a/ark/CMakeLists.txt b/ark/CMakeLists.txt
index 9616ea875..cd780e64c 100644
--- a/ark/CMakeLists.txt
+++ b/ark/CMakeLists.txt
@@ -11,7 +11,7 @@ if(ARK_USE_ROCM)
     set_source_files_properties(${CU_SOURCES} PROPERTIES LANGUAGE CXX)
 endif()

-set(COMMON_LIBS ARK::numa ARK::ibverbs pthread rt)
+set(COMMON_LIBS ARK::numa pthread rt)

 # ARK object
 target_include_directories(ark_obj PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
@@ -20,7 +20,6 @@ target_include_directories(ark_obj SYSTEM PRIVATE
     ${DLPACK_INCLUDE_DIRS}
     ${JSON_INCLUDE_DIRS}
     ${MSCCLPP_INCLUDE_DIRS}
-    ${IBVERBS_INCLUDE_DIRS}
     ${NUMA_INCLUDE_DIRS}
 )

@@ -55,7 +54,6 @@ if(ARK_BUILD_TESTS)
         target_include_directories(${exe_name} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
         target_include_directories(${exe_name} SYSTEM PRIVATE
             ${JSON_INCLUDE_DIRS}
-            ${IBVERBS_INCLUDE_DIRS}
             ${NUMA_INCLUDE_DIRS}
         )

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index af1789dc1..7fda54256 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -162,8 +162,7 @@ class CommResource {

 struct ConnectionResource {
     std::shared_ptr<mscclpp::Connection> connection;
-    std::vector<std::shared_ptr<mscclpp::SimpleProxyChannel>>
-        proxy_channels;
+    std::vector<std::shared_ptr<mscclpp::ProxyChannel>> proxy_channels;
     std::vector<std::shared_ptr<mscclpp::SmChannel>> sm_channels;
 };

@@ -312,11 +311,11 @@ void CommResource::connect(const PlanJson &plan_json,
         [&](std::shared_ptr<ConnectionResource> conn_resource) {
             if (!conn_resource) return;
             conn_resource->proxy_channels.push_back(
-                std::make_shared<mscclpp::SimpleProxyChannel>(
-                    proxy_service_->proxyChannel(
-                        proxy_service_->buildAndAddSemaphore(
-                            *comm_, conn_resource->connection)),
-                    remote_regmem_id, regmem_id));
+                std::make_shared<mscclpp::ProxyChannel>(
+                    proxy_service_->proxyChannel(
+                        proxy_service_->buildAndAddSemaphore(
+                            *comm_, conn_resource->connection),
+                        remote_regmem_id, regmem_id)));
         };
     // NOTE: We can create multiple proxy channels here if we need in the
     // future
@@ -743,16 +742,15 @@ void PlanResource::init_kernel() {
     void *proxy_secondary_chan_addr =
         get_global_rt("ARK_PROXY_SECONDARY_CHANS");
     void *sm_chan_addr = get_global_rt("ARK_SM_CHANS");
-    std::vector<mscclpp::SimpleProxyChannel::DeviceHandle> proxy_handles(
-        world_size_);
-    std::vector<mscclpp::SimpleProxyChannel::DeviceHandle>
-        proxy_secondary_handles(world_size_);
+    std::vector<mscclpp::ProxyChannel::DeviceHandle> proxy_handles(world_size_);
+    std::vector<mscclpp::ProxyChannel::DeviceHandle> proxy_secondary_handles(
+        world_size_);
     std::vector<mscclpp::SmChannel::DeviceHandle> sm_handles(world_size_);
     for (int i = 0; i < world_size_; i++) {
         if (i == rank_) continue;
         auto resource = comm_resource_->resource(i);
         if (!resource) continue;
-        std::vector<mscclpp::SimpleProxyChannel::DeviceHandle> p_hdls;
+        std::vector<mscclpp::ProxyChannel::DeviceHandle> p_hdls;
         if (resource->ipc) {
             sm_handles[i] = resource->ipc->sm_channels[0]->deviceHandle();
             p_hdls.push_back(resource->ipc->proxy_channels[0]->deviceHandle());
@@ -772,14 +770,14 @@ void PlanResource::init_kernel() {
     }
     auto tmp_stream = gpu_manager->create_stream();
     GLOG(gpuSetDevice(device_id_));
-    GLOG(gpuMemcpyAsync(proxy_chan_addr, proxy_handles.data(),
-                        proxy_handles.size() *
-                            sizeof(mscclpp::SimpleProxyChannel::DeviceHandle),
-                        gpuMemcpyHostToDevice, tmp_stream->get()));
+    GLOG(gpuMemcpyAsync(
+        proxy_chan_addr, proxy_handles.data(),
+        proxy_handles.size() * sizeof(mscclpp::ProxyChannel::DeviceHandle),
+        gpuMemcpyHostToDevice, tmp_stream->get()));
     GLOG(gpuMemcpyAsync(proxy_secondary_chan_addr,
                         proxy_secondary_handles.data(),
                         proxy_secondary_handles.size() *
-                            sizeof(mscclpp::SimpleProxyChannel::DeviceHandle),
+                            sizeof(mscclpp::ProxyChannel::DeviceHandle),
                         gpuMemcpyHostToDevice, tmp_stream->get()));
     GLOG(gpuMemcpyAsync(
         sm_chan_addr, sm_handles.data(),
         sm_handles.size() * sizeof(mscclpp::SmChannel::DeviceHandle),
         gpuMemcpyHostToDevice, tmp_stream->get()));
     GLOG(gpuStreamSynchronize(tmp_stream->get()));
 }
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index 7ab2f5635..f131675f5 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -354,9 +354,9 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {

 std::string CodeGenerator::Impl::def_channels(int world_size) {
     std::stringstream ss;
-    ss << "__constant__ mscclpp::SimpleProxyChannelDeviceHandle ";
+    ss << "__constant__ mscclpp::ProxyChannelDeviceHandle ";
     ss << "ARK_PROXY_CHANS[" << world_size << "];\n";
-    ss << "__constant__ mscclpp::SimpleProxyChannelDeviceHandle ";
+    ss << "__constant__ mscclpp::ProxyChannelDeviceHandle ";
     ss << "ARK_PROXY_SECONDARY_CHANS[" << world_size << "];\n";
     ss << "__constant__ mscclpp::SmChannelDeviceHandle ";
     ss << "ARK_SM_CHANS[" << world_size << "];\n";
     return ss.str();
 }
diff --git a/ark/include/kernels/comm.h b/ark/include/kernels/comm.h
index 76fdfe27b..6ffbf1415 100644
--- a/ark/include/kernels/comm.h
+++ b/ark/include/kernels/comm.h
@@ -14,8 +14,8 @@
 #include "common/unit_op.h"
 #include "reduce.h"

-extern __constant__ mscclpp::SimpleProxyChannelDeviceHandle ARK_PROXY_CHANS[];
-extern __constant__ mscclpp::SimpleProxyChannelDeviceHandle
+extern __constant__ mscclpp::ProxyChannelDeviceHandle ARK_PROXY_CHANS[];
+extern __constant__ mscclpp::ProxyChannelDeviceHandle
     ARK_PROXY_SECONDARY_CHANS[];
 extern __constant__ mscclpp::SmChannelDeviceHandle ARK_SM_CHANS[];

diff --git a/examples/tutorial/planner_tutorial_2.py b/examples/tutorial/planner_tutorial_2.py
new file mode 100644
index 000000000..eb9998541
--- /dev/null
+++ b/examples/tutorial/planner_tutorial_2.py
@@ -0,0 +1,56 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import numpy as np
+import ark
+
+
+def quickstart_tutorial():
+    # Initialize the ARK environments
+    ark.init()
+
+    M, N, K = 1024, 1024, 1024
+    m0 = ark.tensor([M, K], ark.fp16)
+    m1 = ark.tensor([N, K], ark.fp16)
+    m2 = ark.tensor([M, K], ark.fp16)
+
+    # stage 1: matmul
+    with ark.PlannerContext(processor_range=[0, 108]):
+        # Use SMs 0~107 (all)
+        t0 = ark.matmul(m0, m1, transpose_other=True)
+
+    # stage 2: parallel copy and matmul
+    with ark.PlannerContext(processor_range=[0, 54]):
+        # Use SMs 0~53
+        t1 = ark.matmul(t0, m1)
+    with ark.PlannerContext(processor_range=[54, 108]):
+        # Use SMs 54~107
+        t2 = ark.copy(input=t0, output=m2)
+
+    # Initialize the ARK runtime
+    runtime = ark.Runtime()
+
+    # Launch the ARK runtime
+    runtime.launch()
+
+    # Initialize
+    m0_host = np.random.rand(M, K).astype(np.float16) * 0.01
+    m0.from_numpy(m0_host)
+    m1_host = np.random.rand(N, K).astype(np.float16) * 0.01
+    m1.from_numpy(m1_host)
+
+    # Run the ARK program
+    runtime.run()
+
+    # Check the matmul result
+    res_host = np.matmul(np.matmul(m0_host, m1_host.T), m1_host)
+    np.testing.assert_allclose(t1.to_numpy(), res_host, rtol=1e-3, atol=1e-3)
+
+    # Check the copy result
+    np.testing.assert_equal(t2.to_numpy(), t0.to_numpy())
+
+    print("Successful!")
+
+
+if __name__ == "__main__":
+    quickstart_tutorial()
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index 49251be74..f41acdc69 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -11,15 +11,15 @@ include(FetchContent)
 FetchContent_Declare(
     mscclpp
     GIT_REPOSITORY https://github.com/microsoft/mscclpp
-    GIT_TAG v0.5.2
+    GIT_TAG v0.6.0
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
 )
-set(BUILD_TESTS OFF CACHE BOOL "" FORCE)
-set(BUILD_PYTHON_BINDINGS OFF CACHE BOOL "" FORCE)
-set(BUILD_APPS_NCCL OFF CACHE BOOL "" FORCE)
-set(USE_CUDA ${ARK_USE_CUDA} CACHE BOOL "" FORCE)
-set(USE_ROCM ${ARK_USE_ROCM} CACHE BOOL "" FORCE)
-set(BYPASS_GPU_CHECK ON CACHE BOOL "" FORCE)
+set(MSCCLPP_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+set(MSCCLPP_BUILD_PYTHON_BINDINGS OFF CACHE BOOL "" FORCE)
+set(MSCCLPP_BUILD_APPS_NCCL OFF CACHE BOOL "" FORCE)
+set(MSCCLPP_USE_CUDA ${ARK_USE_CUDA} CACHE BOOL "" FORCE)
+set(MSCCLPP_USE_ROCM ${ARK_USE_ROCM} CACHE BOOL "" FORCE)
+set(MSCCLPP_BYPASS_GPU_CHECK ON CACHE BOOL "" FORCE)
 set(INSTALL_PREFIX "ark")
 FetchContent_GetProperties(mscclpp)
 if (NOT mscclpp_POPULATED)
diff --git a/third_party/mscclpp b/third_party/mscclpp
index 40cb19655..11e62024d 160000
--- a/third_party/mscclpp
+++ b/third_party/mscclpp
@@ -1 +1 @@
-Subproject commit 40cb1965538ab98fea3cc9fe004f730e23e84829
+Subproject commit 11e62024d3eb190e005b4689f8c8443d91a6c82e

From 2abc41ddcc05a4fb7b6955740c60bea0838f5a28 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 5 Feb 2025 04:37:12 +0000
Subject: [PATCH 104/106] updates

---
 ark/api/executor.cpp              | 28 +++++++++++-----------
 ark/codegen.cpp                   |  6 ++---
 ark/include/kernels/ark_kernels.h |  4 ++--
 ark/include/kernels/comm.h        | 16 ++++++-------
 ark/ops/ops_communication.cpp     |  4 ++--
 python/ark/ops.py                 | 40 +++++++++++++++++++++++++++++++
 python/model_py.cpp               |  7 ++++++
 third_party/CMakeLists.txt        |  2 +-
 third_party/mscclpp               |  2 +-
 9 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/ark/api/executor.cpp b/ark/api/executor.cpp
index 7fda54256..04e7e6d94 100644
--- a/ark/api/executor.cpp
+++ b/ark/api/executor.cpp
@@ -7,8 +7,8 @@
 #include <cmath>
 #include <list>
 #include <mscclpp/core.hpp>
-#include <mscclpp/proxy_channel.hpp>
-#include <mscclpp/sm_channel.hpp>
+#include <mscclpp/memory_channel.hpp>
+#include <mscclpp/port_channel.hpp>
 #include <set>

 #include "ark/data_type.hpp"
@@ -162,8 +162,8 @@ class CommResource {

 struct ConnectionResource {
     std::shared_ptr<mscclpp::Connection> connection;
-    std::vector<std::shared_ptr<mscclpp::ProxyChannel>> proxy_channels;
-    std::vector<std::shared_ptr<mscclpp::SmChannel>> sm_channels;
+    std::vector<std::shared_ptr<mscclpp::PortChannel>> proxy_channels;
+    std::vector<std::shared_ptr<mscclpp::MemoryChannel>> sm_channels;
 };

 struct RankResource {
@@ -311,8 +311,8 @@ void CommResource::connect(const PlanJson &plan_json,
         [&](std::shared_ptr<ConnectionResource> conn_resource) {
             if (!conn_resource) return;
             conn_resource->proxy_channels.push_back(
-                std::make_shared<mscclpp::ProxyChannel>(
-                    proxy_service_->proxyChannel(
+                std::make_shared<mscclpp::PortChannel>(
+                    proxy_service_->portChannel(
                         proxy_service_->buildAndAddSemaphore(
                             *comm_, conn_resource->connection),
                         remote_regmem_id, regmem_id)));
@@ -340,7 +340,7 @@ void CommResource::connect(const PlanJson &plan_json,
             // NOTE: We can create multiple sm channels here if we need in the
             // future
             resource->ipc->sm_channels.push_back(
-                std::make_shared<mscclpp::SmChannel>(
+                std::make_shared<mscclpp::MemoryChannel>(
                     sm_semaphores[remote_rank][0],
                     rank_to_remote_regmem[remote_rank], regmem.data(), nullptr));
 }
@@ -742,15 +742,15 @@ void PlanResource::init_kernel() {
     void *proxy_secondary_chan_addr =
         get_global_rt("ARK_PROXY_SECONDARY_CHANS");
     void *sm_chan_addr = get_global_rt("ARK_SM_CHANS");
-    std::vector<mscclpp::ProxyChannel::DeviceHandle> proxy_handles(world_size_);
-    std::vector<mscclpp::ProxyChannel::DeviceHandle> proxy_secondary_handles(
+    std::vector<mscclpp::PortChannel::DeviceHandle> proxy_handles(world_size_);
+    std::vector<mscclpp::PortChannel::DeviceHandle> proxy_secondary_handles(
         world_size_);
-    std::vector<mscclpp::SmChannel::DeviceHandle> sm_handles(world_size_);
+    std::vector<mscclpp::MemoryChannel::DeviceHandle> sm_handles(world_size_);
     for (int i = 0; i < world_size_; i++) {
         if (i == rank_) continue;
         auto resource = comm_resource_->resource(i);
         if (!resource) continue;
-        std::vector<mscclpp::ProxyChannel::DeviceHandle> p_hdls;
+        std::vector<mscclpp::PortChannel::DeviceHandle> p_hdls;
         if (resource->ipc) {
             sm_handles[i] = resource->ipc->sm_channels[0]->deviceHandle();
             p_hdls.push_back(resource->ipc->proxy_channels[0]->deviceHandle());
@@ -770,14 +770,14 @@ void PlanResource::init_kernel() {
     }
     auto tmp_stream = gpu_manager->create_stream();
     GLOG(gpuSetDevice(device_id_));
     GLOG(gpuMemcpyAsync(
         proxy_chan_addr, proxy_handles.data(),
-        proxy_handles.size() * sizeof(mscclpp::ProxyChannel::DeviceHandle),
+        proxy_handles.size() * sizeof(mscclpp::PortChannel::DeviceHandle),
         gpuMemcpyHostToDevice, tmp_stream->get()));
     GLOG(gpuMemcpyAsync(proxy_secondary_chan_addr,
                         proxy_secondary_handles.data(),
                         proxy_secondary_handles.size() *
-                            sizeof(mscclpp::ProxyChannel::DeviceHandle),
+                            sizeof(mscclpp::PortChannel::DeviceHandle),
                         gpuMemcpyHostToDevice, tmp_stream->get()));
     GLOG(gpuMemcpyAsync(
         sm_chan_addr, sm_handles.data(),
-        sm_handles.size() * sizeof(mscclpp::SmChannel::DeviceHandle),
+        sm_handles.size() * sizeof(mscclpp::MemoryChannel::DeviceHandle),
         gpuMemcpyHostToDevice, tmp_stream->get()));
     GLOG(gpuStreamSynchronize(tmp_stream->get()));
 }
diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index f131675f5..ff2f84242 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -354,9 +354,9 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {

 std::string CodeGenerator::Impl::def_channels(int world_size) {
     std::stringstream ss;
-    ss << "__constant__ mscclpp::ProxyChannelDeviceHandle ";
+    ss << "__constant__ mscclpp::PortChannelDeviceHandle ";
     ss << "ARK_PROXY_CHANS[" << world_size << "];\n";
-    ss << "__constant__ mscclpp::ProxyChannelDeviceHandle ";
+    ss << "__constant__ mscclpp::PortChannelDeviceHandle ";
     ss << "ARK_PROXY_SECONDARY_CHANS[" << world_size << "];\n";
-    ss << "__constant__ mscclpp::SmChannelDeviceHandle ";
+    ss << "__constant__ mscclpp::MemoryChannelDeviceHandle ";
     ss << "ARK_SM_CHANS[" << world_size << "];\n";
     return ss.str();
 }
diff --git a/ark/include/kernels/ark_kernels.h
b/ark/include/kernels/ark_kernels.h
index 507de11b6..bf849a95a 100644
--- a/ark/include/kernels/ark_kernels.h
+++ b/ark/include/kernels/ark_kernels.h
@@ -6,8 +6,8 @@
 #ifndef ARK_KERNELS_H_
 #define ARK_KERNELS_H_

-#include <mscclpp/proxy_channel_device.hpp>
-#include <mscclpp/sm_channel_device.hpp>
+#include <mscclpp/memory_channel_device.hpp>
+#include <mscclpp/port_channel_device.hpp>

 #include "arithmetic.h"
 #include "cast.h"
diff --git a/ark/include/kernels/comm.h b/ark/include/kernels/comm.h
index 6ffbf1415..9075bb728 100644
--- a/ark/include/kernels/comm.h
+++ b/ark/include/kernels/comm.h
@@ -4,9 +4,9 @@
 #ifndef ARK_KERNELS_COMM_H_
 #define ARK_KERNELS_COMM_H_

+#include <mscclpp/memory_channel_device.hpp>
 #include <mscclpp/packet_device.hpp>
-#include <mscclpp/proxy_channel_device.hpp>
-#include <mscclpp/sm_channel_device.hpp>
+#include <mscclpp/port_channel_device.hpp>

 #include "common/atomic.h"
 #include "common/broadcast.h"
@@ -14,10 +14,10 @@
 #include "common/unit_op.h"
 #include "reduce.h"

-extern __constant__ mscclpp::ProxyChannelDeviceHandle ARK_PROXY_CHANS[];
-extern __constant__ mscclpp::ProxyChannelDeviceHandle
+extern __constant__ mscclpp::PortChannelDeviceHandle ARK_PROXY_CHANS[];
+extern __constant__ mscclpp::PortChannelDeviceHandle
     ARK_PROXY_SECONDARY_CHANS[];
-extern __constant__ mscclpp::SmChannelDeviceHandle ARK_SM_CHANS[];
+extern __constant__ mscclpp::MemoryChannelDeviceHandle ARK_SM_CHANS[];

 namespace ark {
 namespace comm {
@@ -251,7 +251,7 @@ template <...>
 DEVICE void read(int ChanId, size_t remote_offset, size_t local_offset,
                  int uop_idx, [[maybe_unused]] int smem_per_warp) {
-    const mscclpp::SmChannelDeviceHandle &chan = ARK_SM_CHANS[ChanId];
+    const mscclpp::MemoryChannelDeviceHandle &chan = ARK_SM_CHANS[ChanId];
     char *local = reinterpret_cast<char *>(chan.src_) + local_offset;
     char *remote = reinterpret_cast<char *>(chan.dst_) + remote_offset;
     DataType *local_data = reinterpret_cast<DataType *>(local);
@@ -266,7 +266,7 @@ template <...>
 DEVICE void write(int ChanId, size_t remote_offset, size_t local_offset,
                   int uop_idx, [[maybe_unused]] int smem_per_warp) {
-    const mscclpp::SmChannelDeviceHandle &chan = ARK_SM_CHANS[ChanId];
+    const mscclpp::MemoryChannelDeviceHandle &chan = ARK_SM_CHANS[ChanId];
     char *local = reinterpret_cast<char *>(chan.src_) + local_offset;
     char *remote = reinterpret_cast<char *>(chan.dst_) + remote_offset;
     DataType *local_data = reinterpret_cast<DataType *>(local);
@@ -282,7 +282,7 @@ template <...>
-    const mscclpp::SmChannelDeviceHandle &chan = ARK_SM_CHANS[ChanId];
+    const mscclpp::MemoryChannelDeviceHandle &chan = ARK_SM_CHANS[ChanId];
     char *local = reinterpret_cast<char *>(chan.src_) + local_offset;
     char *remote = reinterpret_cast<char *>(chan.dst_) + remote_offset;
     Payload *local_data = reinterpret_cast<Payload *>(local);
diff --git a/ark/ops/ops_communication.cpp b/ark/ops/ops_communication.cpp
index baf7aafa2..c5be1ca65 100644
--- a/ark/ops/ops_communication.cpp
+++ b/ark/ops/ops_communication.cpp
@@ -41,8 +41,8 @@ ModelOpSend::ModelOpSend(ModelTensorRef input, int remote_rank, int tag,
 }

 std::string ModelOpSend::impl_name(const Json &config) const {
-    check_fields_config(config,
-                        {"ChannelType", "NumTasks", "NumWarps", "SramBytes"});
+    check_fields_config(
+        config, {"ChannelType", "Signal", "NumTasks", "NumWarps", "SramBytes"});
     auto &input = read_tensors_[0];
     auto &output = write_tensors_[0];
     int remote_rank = output->buffer()->rank();
diff --git a/python/ark/ops.py b/python/ark/ops.py
index 46145035a..c0eefa2e0 100644
--- a/python/ark/ops.py
+++ b/python/ark/ops.py
@@ -43,6 +43,9 @@
     "constant",
     "ones",
     "zeros",
+    "send",
+    "send_done",
+    "recv",
 ]

@@ -456,6 +459,43 @@ def transpose(
 ################################################################################


+def send(
+    input: Tensor,
+    remote_rank: int,
+    tag: int,
+    output: Tensor = NullTensor,
+    name: str = "send",
+):
+    """
+    Sends the input tensor to a remote rank. The tag identifies the
+    transfer; a `recv` with the same tag on the remote rank receives it.
+    """
+    if output is not NullTensor:
+        output = output._tensor
+    return Tensor(
+        Model.get_model().send(input._tensor, remote_rank, tag, output, name)
+    )
+
+
+def send_done(
+    input: Tensor,
+    name: str = "send_done",
+):
+    """
+    Waits until a preceding send of the input tensor has completed.
+    """
+    return Tensor(Model.get_model().send_done(input._tensor, name))
+
+
+def recv(
+    output: Tensor,
+    remote_rank: int,
+    tag: int,
+    name: str = "recv",
+):
+    """
+    Receives a tensor from a remote rank into the output tensor. The tag
+    must match the one used by the sender.
+    """
+    return Tensor(
+        Model.get_model().recv(output._tensor, remote_rank, tag, name)
+    )
+
+
+################################################################################
+
+
 def mean(
     input: Tensor,
     axis: int,
diff --git a/python/model_py.cpp b/python/model_py.cpp
index 6568f3a5c..30b164460 100644
--- a/python/model_py.cpp
+++ b/python/model_py.cpp
@@ -128,6 +128,13 @@ void register_model(py::module &m) {
              py::arg("padded_shape"), py::arg("rank"), py::arg("name"))
         .def("transpose", &ark::Model::transpose, py::arg("input"),
              py::arg("permutation"), py::arg("output"), py::arg("name"))
+        .def("send", &ark::Model::send, py::arg("input"),
+             py::arg("remote_rank"), py::arg("tag"), py::arg("output"),
+             py::arg("name"))
+        .def("send_done", &ark::Model::send_done, py::arg("input"),
+             py::arg("name"))
+        .def("recv", &ark::Model::recv, py::arg("output"),
+             py::arg("remote_rank"), py::arg("tag"), py::arg("name"))
         .def("all_reduce", &ark::Model::all_reduce, py::arg("input"),
              py::arg("rank"), py::arg("world_size"), py::arg("output"),
              py::arg("name"));
diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt
index f41acdc69..dd090e874 100644
--- a/third_party/CMakeLists.txt
+++ b/third_party/CMakeLists.txt
@@ -11,7 +11,7 @@ include(FetchContent)
 FetchContent_Declare(
     mscclpp
     GIT_REPOSITORY https://github.com/microsoft/mscclpp
-    GIT_TAG v0.6.0
+    GIT_TAG 7f3b088744b184d595c0daeb2d721c2c8908f4bc
     SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/mscclpp
 )
diff --git a/third_party/mscclpp b/third_party/mscclpp
index 11e62024d..7f3b08874 160000
--- a/third_party/mscclpp
+++ b/third_party/mscclpp
@@ -1 +1 @@
-Subproject commit 11e62024d3eb190e005b4689f8c8443d91a6c82e
+Subproject commit 7f3b088744b184d595c0daeb2d721c2c8908f4bc

From 7d30e2c918ddb3c8c5c8e2671159ac256ed7d185 Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 5 Feb 2025 05:29:12 +0000
Subject: [PATCH 105/106] a bug fix

---
 ark/codegen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ark/codegen.cpp b/ark/codegen.cpp
index ff2f84242..dc080d609 100644
--- a/ark/codegen.cpp
+++ b/ark/codegen.cpp
@@ -274,7 +274,6 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
         ss_hash_concat << std::hex << hash;
         op_hash_list.push_back(hash);
     }
-    size_t task_hash = std::hash<std::string>{}(ss_hash_concat.str());
     std::stringstream ss_desc;
     auto &buf_reg = BufferRegistry::get_instance();
     size_t op_idx = 0;
@@ -333,6 +332,7 @@ std::string CodeGenerator::Impl::def_task(const Json &task_json) {
         }
         ss_desc << "_idx, _spw);\n";
     }
+    size_t task_hash = std::hash<std::string>{}(ss_desc.str());
     if (task_hashes_.find(task_hash) == task_hashes_.end()) {
         ss << "__device__ void __task_" << std::hex << task_hash << std::dec
            << "(";

From d49f8183e040974c27c30317c64a38dd5ed95a5e Mon Sep 17 00:00:00 2001
From: Changho Hwang
Date: Wed, 5 Feb 2025 21:57:21 +0000
Subject: [PATCH 106/106] update

---
 ark/include/kernels/kernel_template.in | 34 ++++++++++++++------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/ark/include/kernels/kernel_template.in b/ark/include/kernels/kernel_template.in
index a05e143d3..f0f5ec20c 100644
--- a/ark/include/kernels/kernel_template.in
+++ b/ark/include/kernels/kernel_template.in
@@ -8,22 +8,24 @@
 template <...>
 __forceinline__ __device__ void task_seq(char *_buf, @GLOBAL_ARGS@) {
-    if 
(math::geq<ProcBegin>(blockIdx.x) && math::le<ProcEnd - 1>(blockIdx.x) &&
-        ((blockIdx.x - ProcBegin) % ProcStep == 0)) {
-        constexpr size_t SlotNumThreads = SlotNumWarps * Arch::ThreadsPerWarp;
-        constexpr size_t NumProcs = (ProcEnd - ProcBegin + ProcStep - 1) / ProcStep;
-        constexpr size_t SramBytesPerWarp = SlotSramBytes / SlotNumWarps;
-        size_t p = ((blockIdx.x + gridDim.x - ProcCurrent) % gridDim.x) / ProcStep;
-        size_t k = threadIdx.x / SlotNumThreads;
-        if constexpr (ARK_WARPS_PER_BLOCK > SlotNumWarps) {
-            if (k >= NumSlots) return;
-        }
-        size_t task_id_base = TaskBegin + p * TaskStep * TaskGranularity;
-        for (size_t t = k; ; t += NumSlots) {
-            size_t task_id = task_id_base + TaskStep *
-                (t % TaskGranularity + t / TaskGranularity * TaskGranularity * NumProcs);
-            if (task_id >= TaskEnd) break;
-            task(_buf, task_id, SramBytesPerWarp, @FUNCTION_ARGS@);
+    if constexpr (TaskBegin != TaskEnd) {
+        if (math::geq<ProcBegin>(blockIdx.x) && math::le<ProcEnd - 1>(blockIdx.x) &&
+            ((blockIdx.x - ProcBegin) % ProcStep == 0)) {
+            constexpr size_t SlotNumThreads = SlotNumWarps * Arch::ThreadsPerWarp;
+            constexpr size_t NumProcs = (ProcEnd - ProcBegin + ProcStep - 1) / ProcStep;
+            constexpr size_t SramBytesPerWarp = SlotSramBytes / SlotNumWarps;
+            size_t p = ((blockIdx.x + gridDim.x - ProcCurrent) % gridDim.x) / ProcStep;
+            size_t k = threadIdx.x / SlotNumThreads;
+            if constexpr (ARK_WARPS_PER_BLOCK > SlotNumWarps) {
+                if (k >= NumSlots) return;
+            }
+            size_t task_id_base = TaskBegin + p * TaskStep * TaskGranularity;
+            for (size_t t = k; ; t += NumSlots) {
+                size_t task_id = task_id_base + TaskStep *
+                    (t % TaskGranularity + t / TaskGranularity * TaskGranularity * NumProcs);
+                if (task_id >= TaskEnd) break;
+                task(_buf, task_id, SramBytesPerWarp, @FUNCTION_ARGS@);
+            }
         }
     }
 }
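
A note on the scheduling change in PATCH 106: the new `if constexpr (TaskBegin != TaskEnd)` guard lets the compiler drop empty task ranges entirely, and the loop maps each (block, slot) pair to a strided set of task ids. The Python sketch below is illustrative only and is not part of any patch: it mirrors the kernel's task-id arithmetic on the host so a schedule can be sanity-checked. The parameter names follow the kernel's template parameters; the concrete values in the demo loop are assumptions, and the processor range is treated as half-open, matching the `processor_range=[begin, end)` convention used elsewhere in this series.

# Host-side sketch of task_seq's task-id arithmetic (illustrative only).
def task_ids(block_idx, slot, grid_dim,
             proc_begin, proc_end, proc_step, proc_current,
             num_slots, task_begin, task_end, task_step, task_granularity):
    """Return the task ids that one (block, slot) pair would execute."""
    if task_begin == task_end:
        # Mirrors the `if constexpr (TaskBegin != TaskEnd)` guard.
        return []
    if not (proc_begin <= block_idx < proc_end):
        # Processor range treated as half-open, as assumed above.
        return []
    if (block_idx - proc_begin) % proc_step != 0:
        return []
    num_procs = (proc_end - proc_begin + proc_step - 1) // proc_step
    # Processor index relative to proc_current, as in the kernel.
    p = ((block_idx + grid_dim - proc_current) % grid_dim) // proc_step
    ids = []
    t = slot
    while True:
        task_id = task_begin + p * task_step * task_granularity + task_step * (
            t % task_granularity
            + t // task_granularity * task_granularity * num_procs
        )
        if task_id >= task_end:
            break
        ids.append(task_id)
        t += num_slots
    return ids

# Assumed toy schedule: 4 processors, 2 slots each, granularity 2, 16 tasks.
# Each processor first works through a granule of consecutive task ids, then
# jumps ahead by (granularity * num_procs), so granules interleave evenly.
for b in range(4):
    for s in range(2):
        print(f"block {b}, slot {s}:",
              task_ids(b, s, 4, 0, 4, 1, 0, 2, 0, 16, 1, 2))

With these assumed values, blocks 0 through 3 together cover task ids 0 through 15 exactly once, two consecutive ids per granule per block, which is the invariant the kernel's loop relies on.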