Commit fc11bba

Reverting NDS related changes
Signed-off-by: Sayed Bilal Bari <[email protected]>
Parent: 4ab62ba

File tree: 7 files changed, +298 −9 lines changed


nds/PysparkBenchReport.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

nds/PysparkBenchReport.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----
#
# Certain portions of the contents of this file are derived from TPC-DS version 3.2.0
# (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp).
# Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”)
# and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also
# available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”).
#
# You may not use this file except in compliance with the TPC EULA.
# DISCLAIMER: Portions of this file are derived from the TPC-DS Benchmark and as such any results
# obtained using this file are not comparable to published TPC-DS Benchmark results, as the results
# obtained from using this file do not comply with the TPC-DS Benchmark.
#

import json
import os
import time
import traceback
from typing import Callable
from pyspark.sql import SparkSession

import python_listener


class PysparkBenchReport:
    """Class to generate a JSON summary report for a benchmark."""

    def __init__(self, spark_session: SparkSession, query_name) -> None:
        self.spark_session = spark_session
        self.summary = {
            'env': {
                'envVars': {},
                'sparkConf': {},
                'sparkVersion': None
            },
            'queryStatus': [],
            'exceptions': [],
            'startTime': None,
            'queryTimes': [],
            'query': query_name,
        }

    def report_on(self, fn: Callable, warmup_iterations=0, iterations=1, *args):
        """Record a function's running environment and running status, excluding sensitive
        information like tokens, secrets and passwords, and generate a summary for it in
        dict format.

        Args:
            fn (Callable): a function to be recorded

        Returns:
            dict: summary of the fn
        """
        spark_conf = dict(self.spark_session.sparkContext._conf.getAll())
        env_vars = dict(os.environ)
        redacted = ["TOKEN", "SECRET", "PASSWORD"]
        filtered_env_vars = {k: v for k, v in env_vars.items() if k not in redacted}
        self.summary['env']['envVars'] = filtered_env_vars
        self.summary['env']['sparkConf'] = spark_conf
        self.summary['env']['sparkVersion'] = self.spark_session.version
        listener = None
        try:
            listener = python_listener.PythonListener()
            listener.register()
        except TypeError as e:
            print("Not found com.nvidia.spark.rapids.listener.Manager", str(e))
            listener = None
        if listener is not None:
            print("TaskFailureListener is registered.")
        try:
            # warmup runs are neither timed nor recorded in queryStatus
            for i in range(0, warmup_iterations):
                fn(*args)
        except Exception as e:
            print('ERROR WHILE WARMUP BEGIN')
            print(e)
            traceback.print_tb(e.__traceback__)
            print('ERROR WHILE WARMUP END')

        start_time = int(time.time() * 1000)
        self.summary['startTime'] = start_time
        # run the query
        for i in range(0, iterations):
            try:
                start_time = int(time.time() * 1000)
                fn(*args)
                end_time = int(time.time() * 1000)
                if listener and len(listener.failures) != 0:
                    self.summary['queryStatus'].append("CompletedWithTaskFailures")
                else:
                    self.summary['queryStatus'].append("Completed")
            except Exception as e:
                # print the exception to ease debugging
                print('ERROR BEGIN')
                print(e)
                traceback.print_tb(e.__traceback__)
                print('ERROR END')
                end_time = int(time.time() * 1000)
                self.summary['queryStatus'].append("Failed")
                self.summary['exceptions'].append(str(e))
            finally:
                self.summary['queryTimes'].append(end_time - start_time)
        if listener is not None:
            listener.unregister()
        return self.summary

    def write_summary(self, prefix=""):
        """Write the summary to a JSON file whose name encodes the prefix, query name and
        start time.

        Args:
            prefix (str, optional): prefix for the output json summary file. Defaults to "".
        """
        # Power BI side is retrieving some information from the summary file name, so keep this file
        # name format for pipeline compatibility
        filename = prefix + '-' + self.summary['query'] + '-' + str(self.summary['startTime']) + '.json'
        self.summary['filename'] = filename
        with open(filename, "w") as f:
            json.dump(self.summary, f, indent=2)

    def is_success(self):
        """Check if the query succeeded, i.e. queryStatus == Completed."""
        return self.summary['queryStatus'][0] == 'Completed'

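For orientation, here is a minimal usage sketch of the restored class. It assumes the `nds/` directory is on `PYTHONPATH` (so `PysparkBenchReport` and its `python_listener` dependency import cleanly) and a Spark installation is available; the query function and names are illustrative, not taken from the repository.

```python
from pyspark.sql import SparkSession

from PysparkBenchReport import PysparkBenchReport

spark = SparkSession.builder.appName("nds-report-demo").getOrCreate()

def run_query():
    # stand-in for an NDS query; any callable that triggers a Spark action works
    spark.sql("SELECT 1 AS answer").collect()

report = PysparkBenchReport(spark, "query_demo")
# 1 untimed warmup iteration, then 3 timed iterations
summary = report.report_on(run_query, 1, 3)
report.write_summary(prefix="demo")       # writes demo-query_demo-<startTime>.json
print("succeeded:", report.is_success())  # True only if the first run Completed
```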
nds/README.md

Lines changed: 3 additions & 3 deletions
@@ -301,7 +301,7 @@ finished. This is often used for test or query monitoring purpose.
 To build:
 
 ```bash
-cd utils/jvm_listener
+cd jvm_listener
 mvn package
 ```
 
@@ -353,7 +353,7 @@ nds_power.py \
 parquet_sf3k \
 ./nds_query_streams/query_0.sql \
 time.csv \
---property_file ../utils/properties/aqe-on.properties
+--property_file properties/aqe-on.properties
 ```
 
 User can also use `spark-submit` to submit `nds_power.py` directly.
@@ -364,7 +364,7 @@ Note the template file must follow the `spark-submit-template` utility as the _f
 All Spark configuration words (such as `--conf` and corresponding `k=v` values) are quoted by
 double quotes in the template file. Please follow the format in [power_run_gpu.template](./power_run_gpu.template).
 
-User can define the `properties` file like [aqe-on.properties](../utils/properties/aqe-on.properties). The properties will be passed to the submitted Spark job along with the configurations defined in the template file. User can define some common properties in the template file and put some other properties that usually varies in the property file.
+User can define the `properties` file like [aqe-on.properties](./properties/aqe-on.properties). The properties will be passed to the submitted Spark job along with the configurations defined in the template file. User can define some common properties in the template file and put some other properties that usually varies in the property file.
 
 The command above will use `collect()` action to trigger Spark job for each query. It is also supported to save query output to some place for further verification. User can also specify output format e.g. csv, parquet or orc:

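As a hedged illustration of the properties mechanism described in the README hunk above: a properties file holds plain `key=value` Spark settings that get layered on top of the configuration from the template file. The sketch below is not the repository's actual loader, and the property shown is an assumption about what `aqe-on.properties` contains.

```python
from pyspark.sql import SparkSession

def load_properties(path):
    """Parse simple key=value lines, skipping blanks and # comments."""
    props = {}
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line and not line.startswith('#') and '=' in line:
                key, value = line.split('=', 1)
                props[key.strip()] = value.strip()
    return props

builder = SparkSession.builder
# e.g. spark.sql.adaptive.enabled=true (assumed content of aqe-on.properties)
for key, value in load_properties('properties/aqe-on.properties').items():
    builder = builder.config(key, value)
spark = builder.getOrCreate()
```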
nds/base.template

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ export NUM_EXECUTORS=${NUM_EXECUTORS:-8}
 export EXECUTOR_MEMORY=${EXECUTOR_MEMORY:-16G}
 
 # The NDS listener jar which is built in jvm_listener directory.
-export NDS_LISTENER_JAR=${NDS_LISTENER_JAR:-../utils/jvm_listener/target/benchmark-listener-1.0-SNAPSHOT.jar}
+export NDS_LISTENER_JAR=${NDS_LISTENER_JAR:-./jvm_listener/target/nds-benchmark-listener-1.0-SNAPSHOT.jar}
 # The spark-rapids jar which is required when running on GPU
 export SPARK_RAPIDS_PLUGIN_JAR=${SPARK_RAPIDS_PLUGIN_JAR:-rapids-4-spark_2.12-22.06.0.jar}
 export PYTHONPATH=$SPARK_HOME/python:`echo $SPARK_HOME/python/lib/py4j-*.zip`

nds/check.py

Lines changed: 0 additions & 1 deletion
This file was deleted.

nds/check.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@

#!/usr/bin/env python3
#
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# -----
#
# Certain portions of the contents of this file are derived from TPC-DS version 3.2.0
# (retrieved from www.tpc.org/tpc_documents_current_versions/current_specifications5.asp).
# Such portions are subject to copyrights held by Transaction Processing Performance Council (“TPC”)
# and licensed under the TPC EULA (a copy of which accompanies this file as “TPC EULA” and is also
# available at http://www.tpc.org/tpc_documents_current_versions/current_specifications5.asp) (the “TPC EULA”).
#
# You may not use this file except in compliance with the TPC EULA.
# DISCLAIMER: Portions of this file are derived from the TPC-DS Benchmark and as such any results
# obtained using this file are not comparable to published TPC-DS Benchmark results, as the results
# obtained from using this file do not comply with the TPC-DS Benchmark.
#

import argparse
import os
import sys
from pathlib import Path


def check_version():
    req_ver = (3, 6)
    cur_ver = sys.version_info
    if cur_ver < req_ver:
        raise Exception('Minimum required Python version is 3.6, but current Python version is {}.'
                        .format(str(cur_ver.major) + '.' + str(cur_ver.minor)) +
                        ' Please use a proper Python version.')


def check_build():
    """check the jar and the tpcds executable

    Raises:
        Exception: the build is not done or broken

    Returns:
        PosixPath, PosixPath: paths of the jar and the dsdgen executable
    """
    # Check if the necessary executable or jars are built.
    # We assume the user won't move this script.
    src_dir = Path(__file__).parent.absolute()
    jar_path = list(
        Path(src_dir / 'tpcds-gen/target').rglob("tpcds-gen-*.jar"))
    tool_path = list(Path(src_dir / 'tpcds-gen/target/tools').rglob("dsdgen"))
    if jar_path == [] or tool_path == []:
        raise Exception('Target jar file is not found in `target` folder or dsdgen executable is ' +
                        'not found in `target/tools` folder. ' +
                        'Please refer to the README document and build this project first.')
    return jar_path[0], tool_path[0]


def get_abs_path(input_path):
    """receive a user input path and return its absolute path.

    Args:
        input_path (str): user's input path

    Returns:
        str: if the input is an absolute path, return it as-is; if it's a relative path,
        return its absolute path.
    """
    if Path(input_path).is_absolute():
        # it's an absolute path
        output_path = input_path
    else:
        # it's a path relative to where this script is executed
        output_path = os.getcwd() + '/' + input_path
    return output_path


def valid_range(range, parallel):
    """validate the range input

    Args:
        range (str): a range specified for range data generation, e.g. "1,10"
        parallel (str): string type number for parallelism in TPC-DS data generation, e.g. "20"

    Raises:
        Exception: error message for invalid range input.

    Returns:
        int, int: the parsed range start and range end
    """
    if len(range.split(',')) != 2:
        msg = 'Invalid range: please specify a range with a comma between start and end. e.g., "1,10".'
        raise Exception(msg)
    range_start = int(range.split(',')[0])
    range_end = int(range.split(',')[1])
    if range_start < 1 or range_start > range_end or range_end > int(parallel):
        msg = 'Please provide correct child range: 1 <= range_start <= range_end <= parallel'
        raise Exception(msg)
    return range_start, range_end


def parallel_value_type(p):
    """helper function to check the parallel value

    Args:
        p (str): parallel value

    Raises:
        argparse.ArgumentTypeError: ArgumentTypeError exception

    Returns:
        str: parallel in string
    """
    if int(p) < 2:
        raise argparse.ArgumentTypeError("PARALLEL must be >= 2")
    return p


def get_dir_size(start_path):
    """return the total size in bytes of all files under start_path, skipping symlinks"""
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is a symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)
    return total_size


def check_json_summary_folder(json_summary_folder):
    if json_summary_folder:
        # prepare a folder to save json summaries of query results
        if not os.path.exists(json_summary_folder):
            os.makedirs(json_summary_folder)
        else:
            if os.listdir(json_summary_folder):
                raise Exception(f"json_summary_folder {json_summary_folder} is not empty. " +
                                "There may already be some json files there. Please clean the folder " +
                                "or specify another one.")


def check_query_subset_exists(query_dict, subset_list):
    """check if the query subset exists in the query dictionary"""
    for q in subset_list:
        if q not in query_dict.keys():
            raise Exception(f"Query {q} is not in the query dictionary. Please check the query subset.")
    return True

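A short sketch of how these helpers compose in an argparse-based driver, mirroring the `from check import ...` lines in the files below; the flag names and defaults are illustrative, not from the repository.

```python
import argparse

from check import check_version, parallel_value_type, valid_range, check_json_summary_folder

check_version()  # refuses to run on Python < 3.6

parser = argparse.ArgumentParser()
parser.add_argument('--parallel', type=parallel_value_type, default='20')
parser.add_argument('--range', default='1,20')
parser.add_argument('--json_summary_folder', default=None)
args = parser.parse_args()

# raises with a clear message on a malformed or out-of-bounds range
start, end = valid_range(args.range, args.parallel)
# creates the folder if missing, or fails fast if it already holds json files
check_json_summary_folder(args.json_summary_folder)
```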
nds/nds_gen_data.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 import shutil
 import subprocess
 
-from check import check_build_nds, check_version, get_abs_path, get_dir_size, parallel_value_type, valid_range
+from check import check_build, check_version, get_abs_path, get_dir_size, parallel_value_type, valid_range
 
 check_version()

nds/nds_gen_query_stream.py

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 import subprocess
 import sys
 
-from check import check_build_nds, check_version, get_abs_path
+from check import check_build, check_version, get_abs_path
 
 check_version()

nds/nds_maintenance.py

Lines changed: 0 additions & 1 deletion
@@ -36,7 +36,6 @@
 import os
 
 from pyspark.sql import SparkSession
-
 from PysparkBenchReport import PysparkBenchReport
 
 from check import check_json_summary_folder, get_abs_path
