@@ -363,17 +363,26 @@ def deduplicate(column_names):
     dedup_col_names = deduplicate(valid_col_names)
     return df.toDF(*dedup_col_names)
 
+
 def get_query_subset(query_dict, subset):
+    """Get a subset of queries from query_dict.
+    The subset is specified by a list of query names.
+    """
+    check_query_subset_exists(query_dict, subset)
+    return dict((k, query_dict[k]) for k in subset)
+
+
+def get_query_subset_by_pattern(query_dict, patterns):
     """Get a subset of queries from query_dict.
     The subset is specified by a list of regex patterns for the query name.
     """
     selected_queries = OrderedDict()
-    for pattern in subset:
+    for pattern in patterns:
         for query_name in query_dict.keys():
             if re.match(pattern, query_name):
                 selected_queries[query_name] = query_dict[query_name]
     if not selected_queries:
-        msg = f"No query matched the specified subset patterns: {subset}"
+        msg = f"No query matched the specified subset patterns: {patterns}"
         raise Exception(msg)
     return selected_queries
 
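A quick sketch of the pattern semantics, using illustrative query names that are not part of the diff (only re and OrderedDict are taken from the code above): re.match anchors at the start of the name only, so a bare "query1" pattern also selects the query14 parts, and an exact match needs "^...$" anchors.

import re
from collections import OrderedDict

# Hypothetical stream contents; the real keys come from gen_sql_from_stream.
query_dict = OrderedDict.fromkeys(
    ["query1", "query14_part1", "query14_part2", "query2"], "SELECT ...")

def names_matching(patterns):
    # Same matching rule as get_query_subset_by_pattern above.
    return [n for p in patterns for n in query_dict if re.match(p, n)]

print(names_matching(["query1"]))    # ['query1', 'query14_part1', 'query14_part2']
print(names_matching(["^query1$"]))  # ['query1']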
@@ -383,6 +392,7 @@ def run_query_stream(input_prefix,
                      time_log_output_path,
                      extra_time_log_output_path,
                      sub_queries,
+                     sub_query_patterns,
                      warmup_iterations,
                      iterations,
                      plan_types,
@@ -452,7 +462,9 @@ def run_query_stream(input_prefix,
     check_json_summary_folder(json_summary_folder)
     if sub_queries:
         query_dict = get_query_subset(query_dict, sub_queries)
-
+    if sub_query_patterns:
+        query_dict = get_query_subset_by_pattern(query_dict, sub_query_patterns)
+
     # Setup profiler
     profiler = Profiler(profiling_hook=profiling_hook, output_root=json_summary_folder)
 
@@ -580,7 +592,9 @@ def load_properties(filename):
     return myvars
 
 if __name__ == "__main__":
-    parser = parser = argparse.ArgumentParser()
+    parser = argparse.ArgumentParser()
+    # argument group for query filtering
+    query_filter_group = parser.add_mutually_exclusive_group(required=False)
     parser.add_argument('input_prefix',
                         help='text to prepend to every input file path (e.g., "hdfs:///ds-generated-data"). ' +
                         'If --hive or if input_format is "iceberg", this argument will be regarded as the value of property ' +
@@ -632,13 +646,6 @@ def load_properties(filename):
                         'driver node/pod cannot be accessed easily. User needs to add essential extra ' +
                         'jars and configurations to access different cloud storage systems. ' +
                         'e.g. s3, gs etc.')
-    parser.add_argument('--sub_queries',
-                        type=lambda s: [x.strip() for x in s.split(',')],
-                        help='comma separated list of queries to run. If not specified, all queries ' +
-                        'in the stream file will be run. e.g. "query1,query2,query3". Note, use ' +
-                        '"_part1" and "_part2" suffix for the following query names: ' +
-                        'query14, query23, query24, query39. e.g. query14_part1, query39_part2. '
-                        'Regex patterns are also supported to select multiple queries. e.g. "query1,query2,query14*"')
     parser.add_argument('--allow_failure',
                         action='store_true',
                         help='Do not exit with non zero when any query failed or any task failed')
@@ -667,6 +674,19 @@ def load_properties(filename):
                         help='Skip the execution of the queries. This can be used in conjunction with ' +
                         '--save_plan_path to only save the execution plans without running the queries. ' +
                         'Note that "spark.sql.adaptive.enabled" should be set to false to get GPU physical plans.')
+    query_filter_group.add_argument('--sub_queries',
+                                    type=lambda s: [x.strip() for x in s.split(',')],
+                                    help='comma separated list of queries to run. If this is specified, sub_query_patterns must be empty. ' +
+                                    'If neither sub_queries nor sub_query_patterns is specified, all queries ' +
+                                    'in the stream file will be run. Note, use the "_part1" and "_part2" suffixes for the following query names: ' +
+                                    'query14, query23, query24, and query39, e.g. "query1,query2,query14_part1,query39_part2".')
+    query_filter_group.add_argument('--sub_query_patterns',
+                                    type=lambda s: [x.strip() for x in s.split(',')],
+                                    help='comma separated list of regex patterns for the queries to run. If this is specified, sub_queries must be empty. ' +
+                                    'If neither sub_queries nor sub_query_patterns is specified, all queries ' +
+                                    'in the stream file will be run. ' +
+                                    'For example, "query1" will run all queries whose names start with "query1", ' +
+                                    'and "^query1$,query(2|3)_part1" will run query1, query2_part1, and query3_part1.')
     args = parser.parse_args()
     query_dict = gen_sql_from_stream(args.query_stream_file)
     run_query_stream(args.input_prefix,
@@ -675,6 +695,7 @@ def load_properties(filename):
                      args.time_log,
                      args.extra_time_log,
                      args.sub_queries,
+                     args.sub_query_patterns,
                      args.warmup_iterations,
                      args.iterations,
                      args.plan_types,
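For reference, a minimal sketch of the mutually exclusive group behavior; the argument names come from the diff, everything else is illustrative:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group(required=False)
group.add_argument('--sub_queries', type=lambda s: [x.strip() for x in s.split(',')])
group.add_argument('--sub_query_patterns', type=lambda s: [x.strip() for x in s.split(',')])

# One of the two flags parses normally, and whitespace around commas is stripped:
args = parser.parse_args(['--sub_queries', 'query1, query2'])
print(args.sub_queries)  # ['query1', 'query2']

# Passing both makes argparse exit with an error:
# "argument --sub_query_patterns: not allowed with argument --sub_queries"
# parser.parse_args(['--sub_queries', 'q1', '--sub_query_patterns', 'q1'])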