
Commit 8dc7396

Add sub_query_patterns for regex support in query filtering
1 parent c185ea9 commit 8dc7396

1 file changed (+32 -11)

nds/nds_power.py

Lines changed: 32 additions & 11 deletions
@@ -363,17 +363,26 @@ def deduplicate(column_names):
     dedup_col_names = deduplicate(valid_col_names)
     return df.toDF(*dedup_col_names)
 
+
 def get_query_subset(query_dict, subset):
+    """Get a subset of queries from query_dict.
+    The subset is specified by a list of query names.
+    """
+    check_query_subset_exists(query_dict, subset)
+    return dict((k, query_dict[k]) for k in subset)
+
+
+def get_query_subset_by_pattern(query_dict, patterns):
     """Get a subset of queries from query_dict.
     The subset is specified by a list of regex patterns for the query name.
     """
     selected_queries = OrderedDict()
-    for pattern in subset:
+    for pattern in patterns:
         for query_name in query_dict.keys():
             if re.match(pattern, query_name):
                 selected_queries[query_name] = query_dict[query_name]
     if not selected_queries:
-        msg = f"No query matched the specified subset patterns: {subset}"
+        msg = f"No query matched the specified subset patterns: {patterns}"
         raise Exception(msg)
     return selected_queries

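Note on the matching semantics above: get_query_subset_by_pattern uses re.match, which anchors a pattern only at the start of the name, so a bare "query1" also selects query14_part1; an exact hit needs "^query1$". A minimal, self-contained sketch of the same selection logic (the query names and dummy SQL strings here are hypothetical stand-ins for a real stream file):

import re
from collections import OrderedDict

def get_query_subset_by_pattern(query_dict, patterns):
    """Select queries whose names match any of the given regex patterns."""
    selected_queries = OrderedDict()
    for pattern in patterns:
        for query_name in query_dict.keys():
            # re.match anchors at the start only, so "query1" also
            # matches "query14_part1"; use "^query1$" for an exact match.
            if re.match(pattern, query_name):
                selected_queries[query_name] = query_dict[query_name]
    if not selected_queries:
        raise Exception(f"No query matched the specified subset patterns: {patterns}")
    return selected_queries

# hypothetical query names standing in for a real stream file
queries = OrderedDict((name, "...") for name in
                      ["query1", "query14_part1", "query2_part1", "query3_part1"])
print(list(get_query_subset_by_pattern(queries, ["query1"])))
# -> ['query1', 'query14_part1']
print(list(get_query_subset_by_pattern(queries, ["^query1$", "query(2|3)_part1"])))
# -> ['query1', 'query2_part1', 'query3_part1']
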
@@ -383,6 +392,7 @@ def run_query_stream(input_prefix,
                      time_log_output_path,
                      extra_time_log_output_path,
                      sub_queries,
+                     sub_query_patterns,
                      warmup_iterations,
                      iterations,
                      plan_types,
@@ -452,7 +462,9 @@ def run_query_stream(input_prefix,
     check_json_summary_folder(json_summary_folder)
     if sub_queries:
         query_dict = get_query_subset(query_dict, sub_queries)
-
+    if sub_query_patterns:
+        query_dict = get_query_subset_by_pattern(query_dict, sub_query_patterns)
+
     # Setup profiler
     profiler = Profiler(profiling_hook=profiling_hook, output_root=json_summary_folder)

@@ -580,7 +592,9 @@ def load_properties(filename):
     return myvars
 
 if __name__ == "__main__":
-    parser = parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser()
+    # argument group for query filtering
+    query_filter_group = parser.add_mutually_exclusive_group(required=False)
     parser.add_argument('input_prefix',
                         help='text to prepend to every input file path (e.g., "hdfs:///ds-generated-data"). ' +
                         'If --hive or if input_format is "iceberg", this argument will be regarded as the value of property ' +
@@ -632,13 +646,6 @@ def load_properties(filename):
                         'driver node/pod cannot be accessed easily. User needs to add essential extra ' +
                         'jars and configurations to access different cloud storage systems. ' +
                         'e.g. s3, gs etc.')
-    parser.add_argument('--sub_queries',
-                        type=lambda s: [x.strip() for x in s.split(',')],
-                        help='comma separated list of queries to run. If not specified, all queries ' +
-                        'in the stream file will be run. e.g. "query1,query2,query3". Note, use ' +
-                        '"_part1" and "_part2" suffix for the following query names: ' +
-                        'query14, query23, query24, query39. e.g. query14_part1, query39_part2. '
-                        'Regex patterns are also supported to select multiple queries. e.g. "query1,query2,query14*"')
     parser.add_argument('--allow_failure',
                         action='store_true',
                         help='Do not exit with non zero when any query failed or any task failed')
@@ -667,6 +674,19 @@ def load_properties(filename):
                         help='Skip the execution of the queries. This can be used in conjunction with ' +
                         '--save_plan_path to only save the execution plans without running the queries.' +
                         'Note that "spark.sql.adaptive.enabled" should be set to false to get GPU physical plans.')
+    query_filter_group.add_argument('--sub_queries',
+                                    type=lambda s: [x.strip() for x in s.split(',')],
+                                    help='comma separated list of queries to run. If this is specified, sub_query_patterns should be empty. ' +
+                                    'If both sub_queries and sub_query_patterns are not specified, all queries ' +
+                                    'in the stream file will be run. Note, use "_part1" and "_part2" suffix for the following query names: ' +
+                                    'query14, query23, query24, and query39. Ex) "query1,query2,query14_part1,query39_part2"')
+    query_filter_group.add_argument('--sub_query_patterns',
+                                    type=lambda s: [x.strip() for x in s.split(',')],
+                                    help='comma separated list of query patterns to run in regex. If this is specified, sub_queries should be empty. ' +
+                                    'If both sub_queries and sub_query_patterns are not specified, all queries ' +
+                                    'in the stream file will be run. ' +
+                                    'For example, query1 will run all queries starting with "query1", ' +
+                                    'and "^query1$,query(2|3)_part1" will run query1, query2_part1, and query3_part1.')
     args = parser.parse_args()
     query_dict = gen_sql_from_stream(args.query_stream_file)
     run_query_stream(args.input_prefix,
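Since --sub_queries and --sub_query_patterns live in a mutually exclusive group, argparse itself rejects a command line that supplies both, before any query filtering runs. A stripped-down sketch of that behavior (this stand-in parser carries only the two filter options, not the full nds_power.py CLI):

import argparse

parser = argparse.ArgumentParser(prog="nds_power.py")
query_filter_group = parser.add_mutually_exclusive_group(required=False)
query_filter_group.add_argument('--sub_queries',
                                type=lambda s: [x.strip() for x in s.split(',')])
query_filter_group.add_argument('--sub_query_patterns',
                                type=lambda s: [x.strip() for x in s.split(',')])

args = parser.parse_args(['--sub_queries', 'query1, query2'])
print(args.sub_queries)         # ['query1', 'query2'] -- whitespace stripped
print(args.sub_query_patterns)  # None (an unset filter stays None)

try:
    parser.parse_args(['--sub_queries', 'query1',
                       '--sub_query_patterns', '^query1$'])
except SystemExit:
    # argparse reports "not allowed with argument --sub_queries" and exits
    print("rejected: the two filters are mutually exclusive")
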
@@ -675,6 +695,7 @@ def load_properties(filename):
                      args.time_log,
                      args.extra_time_log,
                      args.sub_queries,
+                     args.sub_query_patterns,
                      args.warmup_iterations,
                      args.iterations,
                      args.plan_types,
