Skip to content

Commit e69a5f6

Browse files
authored
Fix perf bugs (#167)
* Tackle perf degradation
* Revert a wrong fix from #166
* Add `ARK_ENFORCE_KERNEL_CODE_PATH` feature for debugging
1 parent 7f61f7b commit e69a5f6

File tree

8 files changed

+57
-83
lines changed

8 files changed

+57
-83
lines changed

ark/env.cc

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#define DEFAULT_ARK_DISABLE_GRAPH_OPT false
2020
#define DEFAULT_ARK_IGNORE_BINARY_CACHE false
2121
#define DEFAULT_ARK_SHM_NAME_PREFIX "ark."
22+
#define DEFAULT_ARK_ENFORCE_KERNEL_CODE_PATH ""
2223
#define DEFAULT_ARK_USE_MSLL false
2324
#define DEFAULT_ARK_MSLL_INCLUDE_DIR "/usr/local/msll/include"
2425
#define DEFAULT_ARK_MSLL_PORT 50051
@@ -75,6 +76,9 @@ Env::Env() {
7576
//
7677
this->shm_name_prefix =
7778
env<std::string>("ARK_SHM_NAME_PREFIX", DEFAULT_ARK_SHM_NAME_PREFIX);
79+
//
80+
this->enforce_kernel_code_path = env<std::string>(
81+
"ARK_ENFORCE_KERNEL_CODE_PATH", DEFAULT_ARK_ENFORCE_KERNEL_CODE_PATH);
7882
// If `ARK_USE_MSLL=1`, we use MSLL.
7983
this->use_msll = env<bool>("ARK_USE_MSLL", DEFAULT_ARK_USE_MSLL);
8084
// Get the MSLL include directory path.

ark/env.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ struct Env {
3535
bool ignore_binary_cache;
3636
// Prefix of shared memory file names.
3737
std::string shm_name_prefix;
38+
// Enforce to compile a specific kernel code file.
39+
std::string enforce_kernel_code_path;
3840
// Use MSLL.
3941
bool use_msll;
4042
// MSLL include directory path.

ark/gpu/gpu_kernel.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "cpu_timer.h"
1313
#include "env.h"
14+
#include "file_io.h"
1415
#include "gpu/gpu_compile.h"
1516
#include "gpu/gpu_logging.h"
1617
#include "include/ark.h"
@@ -158,7 +159,11 @@ GpuLoopKernel::GpuLoopKernel(const string &name_,
158159

159160
*(GpuPtr *)this->params[0] = this->flag->ref(0);
160161

161-
if (codes_body.size() > 0) {
162+
auto &code_path = get_env().enforce_kernel_code_path;
163+
if (!code_path.empty()) {
164+
LOG(INFO, "Enforce kernel code path: ", code_path);
165+
this->codes.emplace_back(read_file(code_path));
166+
} else if (codes_body.size() > 0) {
162167
const string *ark_loop_body_code = nullptr;
163168
for (auto &code : codes_body) {
164169
if (code.find("ark_loop_body") == string::npos) {

ark/ops/ops_add.cc

Lines changed: 1 addition & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -68,41 +68,7 @@ Tensor *Model::add(Tensor *input, Tensor *other, Tensor *output,
6868
}
6969

7070
const OpConfigMap ArithmeticConfigMap = {
71-
{{OP_ARCH_ANY, "fp32"},
72-
{
73-
// NumWarps, SmemBytes, InDepsTiles, OutDepsTiles, SyncPre, SyncPost
74-
{8, 0, {{128, 256}, {128, 256}}, {{128, 256}}, false, false},
75-
{8, 0, {{256, 128}, {256, 128}}, {{256, 128}}, false, false},
76-
{8, 0, {{128, 128}, {128, 128}}, {{128, 128}}, false, false},
77-
{4, 0, {{64, 64}, {64, 64}}, {{64, 64}}, false, false},
78-
{2, 0, {{32, 64}, {32, 64}}, {{32, 64}}, false, false},
79-
{1, 0, {{16, 64}, {16, 64}}, {{16, 64}}, false, false},
80-
{1, 0, {{8, 64}, {8, 64}}, {{8, 64}}, false, false},
81-
{1, 0, {{2, 128}, {2, 128}}, {{2, 128}}, false, false},
82-
{1, 0, {{4, 64}, {4, 64}}, {{4, 64}}, false, false},
83-
{1, 0, {{2, 64}, {2, 64}}, {{2, 64}}, false, false},
84-
{1, 0, {{1, 128}, {1, 128}}, {{1, 128}}, false, false},
85-
{1, 0, {{1, 64}, {1, 64}}, {{1, 64}}, false, false},
86-
{1, 0, {{1, 32}, {1, 32}}, {{1, 32}}, false, false},
87-
}},
88-
{{OP_ARCH_ANY, "fp16"},
89-
{
90-
// NumWarps, SmemBytes, InDepsTiles, OutDepsTiles, SyncPre, SyncPost
91-
{8, 0, {{128, 256}, {128, 256}}, {{128, 256}}, false, false},
92-
{8, 0, {{256, 128}, {256, 128}}, {{256, 128}}, false, false},
93-
{8, 0, {{128, 128}, {128, 128}}, {{128, 128}}, false, false},
94-
{4, 0, {{64, 64}, {64, 64}}, {{64, 64}}, false, false},
95-
{2, 0, {{32, 64}, {32, 64}}, {{32, 64}}, false, false},
96-
{1, 0, {{16, 64}, {16, 64}}, {{16, 64}}, false, false},
97-
{1, 0, {{8, 64}, {8, 64}}, {{8, 64}}, false, false},
98-
{1, 0, {{2, 128}, {2, 128}}, {{2, 128}}, false, false},
99-
{1, 0, {{4, 64}, {4, 64}}, {{4, 64}}, false, false},
100-
{1, 0, {{2, 64}, {2, 64}}, {{2, 64}}, false, false},
101-
{1, 0, {{1, 256}, {1, 256}}, {{1, 256}}, false, false},
102-
{1, 0, {{1, 128}, {1, 128}}, {{1, 128}}, false, false},
103-
{1, 0, {{1, 64}, {1, 64}}, {{1, 64}}, false, false},
104-
}},
105-
{{OP_ARCH_ANY, "bf16"},
71+
{{OP_ARCH_ANY, "any"},
10672
{
10773
// NumWarps, SmemBytes, InDepsTiles, OutDepsTiles, SyncPre, SyncPost
10874
{8, 0, {{128, 256}, {128, 256}}, {{128, 256}}, false, false},

ark/ops/ops_cast.cc

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -159,19 +159,9 @@ const OpConfigMap CastConfigMap = {
159159
{{OP_ARCH_ANY, "none"},
160160
{
161161
// NumWarps, SmemBytes, InDepsTiles, OutDepsTiles, SyncPre, SyncPost
162-
{8, 0, {{128, 256}}, {{128, 256}}, false, false},
163-
{8, 0, {{256, 128}}, {{256, 128}}, false, false},
164-
{8, 0, {{128, 128}}, {{128, 128}}, false, false},
165-
{4, 0, {{64, 64}}, {{64, 64}}, false, false},
166-
{2, 0, {{32, 64}}, {{32, 64}}, false, false},
167-
{1, 0, {{16, 64}}, {{16, 64}}, false, false},
168-
{1, 0, {{8, 64}}, {{8, 64}}, false, false},
169-
{1, 0, {{2, 128}}, {{2, 128}}, false, false},
170-
{1, 0, {{4, 64}}, {{4, 64}}, false, false},
171-
{1, 0, {{2, 64}}, {{2, 64}}, false, false},
162+
{1, 0, {{1, 256}}, {{1, 256}}, false, false},
172163
{1, 0, {{1, 128}}, {{1, 128}}, false, false},
173164
{1, 0, {{1, 64}}, {{1, 64}}, false, false},
174-
{1, 0, {{1, 32}}, {{1, 32}}, false, false},
175165
}},
176166
};
177167

ark/sched/sched.cc

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -102,20 +102,16 @@ const OpConfig *BaseScheduler::sched_op_config(const Op *op) {
102102
}
103103
}
104104
// Heuristic auto-selection of granularity level
105-
unsigned int min_wps =
106-
gpu_info.min_threads_per_block / gpu_info.threads_per_warp;
107105
Dims shape4 = output->shape.dims4();
108106
Dims ldims4 = output->ldims.dims4();
107+
DimType shape_x = shape4[2];
108+
DimType shape_y = shape4[3];
109109
std::vector<std::tuple<const OpConfig *, Dims, int>> config_candidates;
110-
std::vector<std::tuple<const OpConfig *, Dims, int>>
111-
high_priority_candidates;
112110
for (auto &cfg : feasible_configs) {
113111
assert(cfg->output_tiles.size() > 0);
114112
const OpTile &ot = cfg->output_tiles[0];
115113
DimType ot_x = (ot.x == -1) ? ldims4[2] : ot.x;
116114
DimType ot_y = (ot.y == -1) ? ldims4[3] : ot.y;
117-
DimType shape_x = shape4[2];
118-
DimType shape_y = shape4[3];
119115
if (output->shape.ndims() == 1 && ot_x != 1) {
120116
// Output is 1D, but tile is 2D. Cannot use this tile shape.
121117
continue;
@@ -126,13 +122,6 @@ const OpConfig *BaseScheduler::sched_op_config(const Op *op) {
126122

127123
// This config is OK to use
128124
config_candidates.emplace_back(cfg, Dims(ot_x, ot_y), num_tiles);
129-
130-
// magic condition
131-
if ((shape_y * 2 > ot_y) && (shape_x * 2 > ot_x) &&
132-
((num_tiles * cfg->num_warps) >= (min_wps * gpu_info.num_sm / 2))) {
133-
high_priority_candidates.emplace_back(cfg, Dims(ot_x, ot_y),
134-
num_tiles);
135-
}
136125
}
137126
if (config_candidates.empty()) {
138127
stringstream configs_str;
@@ -152,14 +141,33 @@ const OpConfig *BaseScheduler::sched_op_config(const Op *op) {
152141
ERR(SchedulerError, "no valid tile configuration found. Output shape ",
153142
output->shape, ", available tiles: ", configs_str.str());
154143
}
144+
std::vector<std::tuple<const OpConfig *, Dims, int>>
145+
high_priority_candidates;
146+
int min_wps = gpu_info.min_threads_per_block / gpu_info.threads_per_warp;
147+
int target_concurrent_num_warps = min_wps * gpu_info.num_sm;
148+
for (auto &c : config_candidates) {
149+
auto &cfg = std::get<0>(c);
150+
auto &tile = std::get<1>(c);
151+
auto &num_tiles = std::get<2>(c);
152+
153+
if ((shape_x < tile[0]) || (shape_y < tile[1])) {
154+
// too large tile.
155+
continue;
156+
}
157+
auto num_total_warps = num_tiles * cfg->num_warps;
158+
if (num_total_warps >= target_concurrent_num_warps / 2) {
159+
high_priority_candidates.push_back(c);
160+
}
161+
}
155162
auto &candidates = high_priority_candidates.empty()
156163
? config_candidates
157164
: high_priority_candidates;
158-
// prefer smaller tiles here to minimize paddings
165+
159166
std::sort(candidates.begin(), candidates.end(),
160167
[](const std::tuple<const OpConfig *, Dims, int> &a,
161168
const std::tuple<const OpConfig *, Dims, int> &b) {
162-
return std::get<1>(a).size() < std::get<1>(b).size();
169+
return std::get<2>(a) * std::get<0>(a)->num_warps <
170+
std::get<2>(b) * std::get<0>(b)->num_warps;
163171
});
164172
return std::get<0>(candidates[0]);
165173
}

ark/sched/sched/sched_default.cc

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -621,10 +621,8 @@ std::vector<std::string> DefaultScheduler::gen_code() {
621621
auto comp_streams = this->comp_stream[i]->get_streams();
622622
for (size_t j = 0; j < comp_streams.size(); ++j) {
623623
auto &stream = comp_streams[j];
624-
int prev_sm_id_end = -1;
625624
for (auto &branch : stream.branches) {
626-
this->codegen->branch(code, branch, prev_sm_id_end);
627-
prev_sm_id_end = branch.sm_id_end;
625+
this->codegen->branch(code, branch);
628626
}
629627
if (!stream.branches.empty() && j != comp_streams.size() - 1) {
630628
code << " ";
@@ -634,10 +632,8 @@ std::vector<std::string> DefaultScheduler::gen_code() {
634632
auto comm_streams = this->comm_stream[i]->get_streams();
635633
for (size_t j = 0; j < comm_streams.size(); ++j) {
636634
auto &stream = comm_streams[j];
637-
int prev_sm_id_end = -1;
638635
for (auto &branch : stream.branches) {
639-
this->codegen->branch(code, branch, prev_sm_id_end);
640-
prev_sm_id_end = branch.sm_id_end;
636+
this->codegen->branch(code, branch);
641637
}
642638
if (!stream.branches.empty() && j != comm_streams.size() - 1) {
643639
code << " ";

examples/llama/model_test.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ def test_module(
128128
module_name_prefix: str = "",
129129
test_thru: bool = False,
130130
test_thru_iterations: int = 100,
131+
test_thru_ark_only: bool = False,
131132
):
132133
if test_thru:
133134
print(f"Throughput test (iterations: {test_thru_iterations})")
@@ -164,21 +165,25 @@ def test_module(
164165
iterations=test_thru_iterations if test_thru else 1,
165166
)
166167

167-
# PyTorch module
168-
module_pt: torch.nn.Module = module_class_pt(*module_args_pt)
168+
if not test_thru_ark_only:
169+
# PyTorch module
170+
module_pt: torch.nn.Module = module_class_pt(*module_args_pt)
169171

170-
# Run the PyTorch module
171-
res_pt = run_pt(
172-
module_pt,
173-
state_dict_pt,
174-
inputs_pt,
175-
iterations=test_thru_iterations if test_thru else 1,
176-
)
177-
178-
if test_thru:
179-
print(
180-
f" PyTorch: {res_pt.runtime:.4f} seconds, ARK: {res_ark.runtime:.4f} seconds"
172+
# Run the PyTorch module
173+
res_pt = run_pt(
174+
module_pt,
175+
state_dict_pt,
176+
inputs_pt,
177+
iterations=test_thru_iterations if test_thru else 1,
181178
)
179+
180+
if test_thru:
181+
print(
182+
f" PyTorch: {res_pt.runtime:.4f} seconds, ARK: {res_ark.runtime:.4f} seconds"
183+
)
184+
return
185+
elif test_thru:
186+
print(f" ARK: {res_ark.runtime:.4f} seconds")
182187
return
183188

184189
# Compare the outputs
@@ -454,8 +459,6 @@ def test_transformer(
454459
module_class_pt=model_pt.Transformer,
455460
module_args_pt=[args],
456461
inputs_pt=[tokens, start_pos],
457-
test_thru=True,
458-
test_thru_iterations=10,
459462
)
460463

461464

0 commit comments

Comments
 (0)