Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion skl2onnx/operator_converters/ordinal_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@ def convert_sklearn_ordinal_encoder(
)
)

for categories in ordinal_op.categories_:
ordinal_categories = ordinal_op.categories_
if hasattr(ordinal_op, "infrequent_categories_") and ordinal_op.infrequent_categories_:
# exclude infrequent categories if present
ordinal_categories = [categories for categories in ordinal_categories if categories not in ordinal_op.infrequent_categories_]

for categories in ordinal_categories:
if len(categories) == 0:
continue

Expand Down
69 changes: 69 additions & 0 deletions tests/test_sklearn_ordinal_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,75 @@ def test_model_ordinal_encoder_unknown_value_nan(self):

assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))

@unittest.skipIf(
    not max_categories_support(),
    reason="OrdinalEncoder supports max_categories and min_frequency since 1.3",
)
def test_model_ordinal_encoder_min_frequency_multi_column(self):
    """Compare ONNX and scikit-learn output for ``min_frequency`` on two columns.

    An ``OrdinalEncoder(min_frequency=3)`` is fitted on a two-column string
    array, converted with ``convert_sklearn`` and run through onnxruntime.
    The ONNX output must match scikit-learn both on the training data and
    on a smaller sample containing one row per distinct category pair.
    """
    from onnxruntime import InferenceSession

    model = OrdinalEncoder(
        min_frequency=3, handle_unknown="use_encoded_value", unknown_value=-1
    )
    # Column 0: 'a' occurs 4 times, 'b' twice, 'c' once.
    # Column 1: 'x' occurs 4 times, 'y' twice, 'z' once.
    # Only 'a' and 'x' reach the min_frequency=3 threshold.
    data = np.array(
        [
            ["a", "x"],
            ["a", "x"],
            ["a", "x"],
            ["a", "x"],
            ["b", "y"],
            ["b", "y"],
            ["c", "z"],
        ],
        dtype=np.object_,
    )
    test_data = np.array(
        [
            ["a", "x"],  # at or above the threshold in both columns
            ["b", "y"],  # below the threshold in both columns
            ["c", "z"],  # below the threshold in both columns
        ],
        dtype=np.object_,
    )

    expected = model.fit_transform(data)
    expected_test = model.transform(test_data)

    model_onnx = convert_sklearn(
        model,
        "scikit-learn ordinal encoder",
        [("input", StringTensorType([None, 2]))],
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        data,
        model,
        model_onnx,
        basename="SklearnOrdinalEncoderMinFrequencyMultiCol",
    )

    sess = InferenceSession(
        model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    # Same check for the training data and for the smaller sample: flatten
    # both sides so only the values, not the array shapes, are compared.
    for inputs, reference in ((data, expected), (test_data, expected_test)):
        got = sess.run(None, {"input": inputs})
        assert_almost_equal(reference.reshape(-1), got[0].reshape(-1))



# Run this module's tests with verbose output when executed as a script.
if __name__ == "__main__":
    unittest.main(verbosity=2)
Loading