Skip to content

Commit 2462f35

Browse files
dpetrovb and xadupre authored
Enhance OrdinalEncoder conversion to handle infrequent categories (#1195)
* Enhance OrdinalEncoder conversion to handle infrequent categories
  - Added logic to check if infrequent categories are enabled in the OrdinalEncoder.
  - Introduced handling for `infrequent_categories_` to adjust `values_int64s` accordingly.
  - Updated conversion process to account for `max_categories` or `min_frequency` by modifying the attribute values for infrequent categories.
  Signed-off-by: Danil Petrov <[email protected]>
* Refactor handling of infrequent categories in OrdinalEncoder conversion
  - Replaced `current_infrequent_categories_` with `default_to_infrequent_mappings` for clarity.
  - Updated logic to handle `default_to_infrequent_mappings` when encoding missing values.
  - Simplified the assignment of `attrs["values_int64s"]` by using `default_to_infrequent_mappings` where applicable.
  - Ensured consistent handling of `max_categories` or `min_frequency` scenarios.
  Signed-off-by: Danil Petrov <[email protected]>
* Fix linting
  Signed-off-by: Danil Petrov <[email protected]>
* Enable conversion of OrdinalEncoder with max_categories and min_frequency
  - Added `max_categories_support` function to check scikit-learn version >= 1.3 for `max_categories` and `min_frequency` support in `OrdinalEncoder`.
  - Updated tests to skip tests if `max_categories` and `min_frequency` are not supported.
  - Added a check for `_infrequent_enabled` attribute before accessing it to ensure compatibility with older versions of scikit-learn.
  Signed-off-by: Danil Petrov <[email protected]>
* Improve infrequent category handling and missing value encoding in OrdinalEncoder conversion
  - Modified the condition for checking `_infrequent_enabled` to improve readability.
  - Ensured correct concatenation of `encoded_missing_value` with `values_int64s` when `default_to_infrequent_mappings` is not None.
  - Added a test case `SklearnOrdinalEncoderCatList` to verify the conversion of `OrdinalEncoder` with a list of categories.
  - Updated `dump_data_and_model` call in `SklearnOrdinalEncoderCatList` test for better readability.
  Signed-off-by: Danil Petrov <[email protected]>
---------
Signed-off-by: Danil Petrov <[email protected]>
Co-authored-by: Xavier Dupré <[email protected]>
1 parent 3b0b025 commit 2462f35

File tree

2 files changed

+118
-4
lines changed

2 files changed

+118
-4
lines changed

skl2onnx/operator_converters/ordinal_encoder.py

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,16 @@ def convert_sklearn_ordinal_encoder(
3434
if len(categories) == 0:
3535
continue
3636

37+
if (
38+
hasattr(ordinal_op, "_infrequent_enabled")
39+
and ordinal_op._infrequent_enabled
40+
):
41+
default_to_infrequent_mappings = ordinal_op._default_to_infrequent_mappings[
42+
input_idx
43+
]
44+
else:
45+
default_to_infrequent_mappings = None
46+
3747
current_input = operator.inputs[input_idx]
3848
if current_input.get_second_dimension() == 1:
3949
feature_column = current_input
@@ -113,11 +123,30 @@ def convert_sklearn_ordinal_encoder(
113123
encoded_missing_value = np.array(
114124
[int(ordinal_op.encoded_missing_value)]
115125
).astype(np.int64)
116-
attrs["values_int64s"] = np.concatenate(
117-
(np.arange(len(categories) - 1).astype(np.int64), encoded_missing_value)
118-
)
126+
127+
# handle max_categories or min_frequency
128+
if default_to_infrequent_mappings is not None:
129+
attrs["values_int64s"] = np.concatenate(
130+
(
131+
np.array(default_to_infrequent_mappings, dtype=np.int64),
132+
encoded_missing_value,
133+
)
134+
)
135+
else:
136+
attrs["values_int64s"] = np.concatenate(
137+
(
138+
np.arange(len(categories) - 1).astype(np.int64),
139+
encoded_missing_value,
140+
)
141+
)
119142
else:
120-
attrs["values_int64s"] = np.arange(len(categories)).astype(np.int64)
143+
# handle max_categories or min_frequency
144+
if default_to_infrequent_mappings is not None:
145+
attrs["values_int64s"] = np.array(
146+
default_to_infrequent_mappings, dtype=np.int64
147+
)
148+
else:
149+
attrs["values_int64s"] = np.arange(len(categories)).astype(np.int64)
121150

122151
if default_value:
123152
attrs["default_int64"] = default_value

tests/test_sklearn_ordinal_encoder.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,11 @@ def set_output_support():
4040
return pv.Version(vers) >= pv.Version("1.2")
4141

4242

43+
def max_categories_support():
    """Tell whether the installed scikit-learn is at least version 1.3.

    Only the first two dot-separated components of ``sklearn_version``
    are kept before the comparison; anything after them is ignored.
    The tests use the result to skip the ``max_categories`` and
    ``min_frequency`` cases of ``OrdinalEncoder`` on older releases.
    """
    major_minor = sklearn_version.split(".")[:2]
    installed = pv.Version(".".join(major_minor))
    return installed >= pv.Version("1.3")
46+
47+
4348
class TestSklearnOrdinalEncoderConverter(unittest.TestCase):
4449
@unittest.skipIf(
4550
not ordinal_encoder_support(),
@@ -379,6 +384,86 @@ def test_ordinal_encoder_pipeline_string_int64(self):
379384
)
380385
assert_almost_equal(expected, got[0].ravel())
381386

387+
@unittest.skipIf(
    not max_categories_support(),
    reason="OrdinalEncoder supports max_categories and min_frequency since 1.3",
)
def test_model_ordinal_encoder_max_categories(self):
    """Convert an OrdinalEncoder built with ``max_categories`` and compare outputs.

    The encoder is fitted on a single string column, converted to ONNX,
    and the onnxruntime result is compared with the values returned by
    ``fit_transform`` on the same data.
    """
    from onnxruntime import InferenceSession

    model = OrdinalEncoder(max_categories=4)
    # Five distinct values for max_categories=4: "a", "b" and "c" appear
    # twice, "d" and "e" only once.
    data = np.array(
        [["a"], ["b"], ["c"], ["d"], ["a"], ["b"], ["c"], ["e"]], dtype=np.object_
    )

    # Reference output computed by scikit-learn itself.
    expected = model.fit_transform(data)

    model_onnx = convert_sklearn(
        model,
        "scikit-learn ordinal encoder",
        [("input", StringTensorType([None, 1]))],
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        data,
        model,
        model_onnx,
        basename="SklearnOrdinalEncoderMaxCategories",
    )

    sess = InferenceSession(
        model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    got = sess.run(
        None,
        {
            "input": data,
        },
    )

    # Both sides are flattened so only the encoded values are compared.
    assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
426+
427+
@unittest.skipIf(
    not max_categories_support(),
    reason="OrdinalEncoder supports max_categories and min_frequency since 1.3",
)
def test_model_ordinal_encoder_min_frequency(self):
    """Convert an OrdinalEncoder built with ``min_frequency`` and compare outputs.

    The encoder is fitted on a single string column, converted to ONNX,
    and the onnxruntime result is compared with the values returned by
    ``fit_transform`` on the same data.
    """
    from onnxruntime import InferenceSession

    model = OrdinalEncoder(min_frequency=2)
    # "a", "b" and "c" appear twice; "d" and "e" appear once, which is
    # below min_frequency=2.
    data = np.array(
        [["a"], ["b"], ["c"], ["d"], ["a"], ["b"], ["c"], ["e"]], dtype=np.object_
    )

    # Reference output computed by scikit-learn itself.
    expected = model.fit_transform(data)

    model_onnx = convert_sklearn(
        model,
        "scikit-learn ordinal encoder",
        [("input", StringTensorType([None, 1]))],
        target_opset=TARGET_OPSET,
    )
    self.assertIsNotNone(model_onnx)
    dump_data_and_model(
        data,
        model,
        model_onnx,
        basename="SklearnOrdinalEncoderMinFrequency",
    )

    sess = InferenceSession(
        model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
    )
    got = sess.run(
        None,
        {
            "input": data,
        },
    )

    # Both sides are flattened so only the encoded values are compared.
    assert_almost_equal(expected.reshape(-1), got[0].reshape(-1))
466+
382467

383468
if __name__ == "__main__":
384469
unittest.main(verbosity=2)

0 commit comments

Comments
 (0)