From bdfe7a28ca95569b3be60b8ad59346a9e59d9a1c Mon Sep 17 00:00:00 2001 From: chapman73 Date: Tue, 14 Oct 2025 17:10:07 +1100 Subject: [PATCH] Enhance OrdinalEncoder conversion to exclude infrequent categories and add tests for min_frequency support in multi-column scenarios. --- .../operator_converters/ordinal_encoder.py | 7 +- tests/test_sklearn_ordinal_encoder.py | 69 +++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/skl2onnx/operator_converters/ordinal_encoder.py b/skl2onnx/operator_converters/ordinal_encoder.py index 1a6021e6e..326a4f3ab 100644 --- a/skl2onnx/operator_converters/ordinal_encoder.py +++ b/skl2onnx/operator_converters/ordinal_encoder.py @@ -42,7 +42,12 @@ def convert_sklearn_ordinal_encoder( ) ) - for categories in ordinal_op.categories_: + ordinal_categories = ordinal_op.categories_ + if hasattr(ordinal_op, "infrequent_categories_") and ordinal_op.infrequent_categories_: + # exclude infrequent categories if present + ordinal_categories = [categories for categories in ordinal_categories if categories not in ordinal_op.infrequent_categories_] + + for categories in ordinal_categories: if len(categories) == 0: continue diff --git a/tests/test_sklearn_ordinal_encoder.py b/tests/test_sklearn_ordinal_encoder.py index 1bad20648..dd5aec7cc 100644 --- a/tests/test_sklearn_ordinal_encoder.py +++ b/tests/test_sklearn_ordinal_encoder.py @@ -504,6 +504,75 @@ def test_model_ordinal_encoder_unknown_value_nan(self): assert_almost_equal(expected.reshape(-1), got[0].reshape(-1)) + @unittest.skipIf( + not max_categories_support(), + reason="OrdinalEncoder supports max_categories and min_frequencey since 1.3", + ) + def test_model_ordinal_encoder_min_frequency_multi_column(self): + from onnxruntime import InferenceSession + + model = OrdinalEncoder(min_frequency=3, handle_unknown="use_encoded_value", unknown_value=-1) + # First column: 'a' appears 4 times (frequent), 'b' 2 times (infrequent), 'c' 1 time (infrequent) + # Second column: 'x' appears 4 times (frequent), 'y' 2 times (infrequent), 'z' 1 time (infrequent) + data = np.array( + [ + ["a", "x"], + ["a", "x"], + ["a", "x"], + ["a", "x"], + ["b", "y"], + ["b", "y"], + ["c", "z"], + ], + dtype=np.object_, + ) + test_data = np.array( + [ + ["a", "x"], # frequent in both columns + ["b", "y"], # infrequent in both columns + ["c", "z"], # infrequent in both columns + ], + dtype=np.object_, + ) + + expected = model.fit_transform(data) + expected_test = model.transform(test_data) + + model_onnx = convert_sklearn( + model, + "scikit-learn ordinal encoder", + [("input", StringTensorType([None, 2]))], + target_opset=TARGET_OPSET, + ) + self.assertIsNotNone(model_onnx) + dump_data_and_model( + data, + model, + model_onnx, + basename="SklearnOrdinalEncoderMinFrequencyMultiCol", + ) + + sess = InferenceSession( + model_onnx.SerializeToString(), providers=["CPUExecutionProvider"] + ) + got = sess.run( + None, + { + "input": data, + }, + ) + assert_almost_equal(expected.reshape(-1), got[0].reshape(-1)) + + # Test with test data + got_test = sess.run( + None, + { + "input": test_data, + }, + ) + assert_almost_equal(expected_test.reshape(-1), got_test[0].reshape(-1)) + + if __name__ == "__main__": unittest.main(verbosity=2)