Skip to content

OrdinalEncoder is not converting correctly when min_frequency is set #1215

@chapman73

Description

@chapman73

As per the title. Code to reproduce is below:

import random

import numpy as np
import pandas as pd
from onnxruntime import InferenceSession
from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder

# Number of rows in the synthetic dataset.
k = 100
# NOTE: not referenced anywhere else in this snippet; kept so the module-level
# names stay the same. Sized from ``k`` instead of repeating the literal 100.
sample_data = random.sample((list(range(1, 10)) + [np.nan]) * 1000, k=k)


# One feature column with values drawn from 0..99 and a binary target column.
samples = pd.DataFrame(
    {
        "sample_data": np.random.choice(range(100), size=k).tolist(),
        "target": np.random.choice([0, 1], size=k).tolist(),
    }
)

# Store the feature as float32, the dtype fed to the encoder and the ONNX session.
samples["sample_data"] = samples["sample_data"].astype("float32")


col = "sample_data"
X = samples[[col]]
y = samples["target"]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.7, random_state=42, shuffle=True
)
# Rebind rather than dropna(inplace=True): X_val comes out of train_test_split
# and mutating it in place can raise pandas' SettingWithCopyWarning.
X_val = X_val.dropna(subset=[col])

# Two encoder configurations that differ only in ``min_frequency``.
params = [
    {
        "handle_unknown": "use_encoded_value",
        "unknown_value": -1,
        "min_frequency": 5,
    },
    {
        "handle_unknown": "use_encoded_value",
        "unknown_value": -1,
    },
]


def test_model(params):
    """Fit an OrdinalEncoder with ``params``, convert it to ONNX and compare.

    Prints the flattened ONNX Runtime output for every row of
    ``samples[[col]]``, then the number of rows whose value differs from
    scikit-learn's ``transform`` result by more than 1e-4.
    """
    encoder = OrdinalEncoder(**params)
    encoder.fit(X_train.values.astype(np.float32), y_train)

    onnx_model = to_onnx(
        encoder,
        initial_types=[("X", FloatTensorType([None, 1]))],
        target_opset=17,
    )
    runtime = InferenceSession(onnx_model.SerializeToString())

    features = samples[[col]].values
    output = runtime.run(None, {"X": features.astype(np.float32)})[0]
    print(output.flatten())

    # show which rows differ
    expected = encoder.transform(features)
    mismatches = np.abs(output - expected) > 0.0001
    print(
        "With params", params,
        "There are",
        int(np.count_nonzero(mismatches)),
        "rows that differ from",
        len(features),
    )


# Run the comparison twice: first with min_frequency set, then without it.
with_min_frequency, without_min_frequency = params
test_model(with_min_frequency)
test_model(without_min_frequency)

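Sample output -- as you can see, the ONNX output is identical in both runs even though min_frequency is set in the first one, and in that run 86 of the 100 rows differ from scikit-learn's result (0 differ without min_frequency). This suggests that the parameter is not being respected by the converted model.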

$ python -i ordinal_encoder_only.py
[21. 17. 24. 11. -1. 21. 32. 47. 14. -1. 47. 44. 46. 47.  2. 33. 45. 20.
 -1.  4. 39. 30. 45. 23. 15. 43. -1.  3. 44. 21. 15. 45. 29. 35. 33. 42.
  7.  6. 35. 18. 29. 33.  6. 26.  4. -1. 20.  8.  1. 25.  0. 37. 39. 44.
 20. -1. 25. 12. 40. 35. 27. 19. 44. 21. 22.  5. 18. 43. 33. -1. 40. 31.
 -1. -1. 22. 29. -1.  3. 36. 38. 42. 34. 28. -1. 41. 46. 13. 28. -1. 31.
  0.  9. 10. 45. 16. 13.  9. 11. 26. 26.]
With params {'handle_unknown': 'use_encoded_value', 'unknown_value': -1, 'min_frequency': 5} There are 86 rows that differ from 100
[21. 17. 24. 11. -1. 21. 32. 47. 14. -1. 47. 44. 46. 47.  2. 33. 45. 20.
 -1.  4. 39. 30. 45. 23. 15. 43. -1.  3. 44. 21. 15. 45. 29. 35. 33. 42.
  7.  6. 35. 18. 29. 33.  6. 26.  4. -1. 20.  8.  1. 25.  0. 37. 39. 44.
 20. -1. 25. 12. 40. 35. 27. 19. 44. 21. 22.  5. 18. 43. 33. -1. 40. 31.
 -1. -1. 22. 29. -1.  3. 36. 38. 42. 34. 28. -1. 41. 46. 13. 28. -1. 31.
  0.  9. 10. 45. 16. 13.  9. 11. 26. 26.]
With params {'handle_unknown': 'use_encoded_value', 'unknown_value': -1} There are 0 rows that differ from 100

Any pointers to the best way to address this?

Thanks in advance.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions