-
Notifications
You must be signed in to change notification settings - Fork 123
Open
Description
As per the title. Code to reproduce is below:
import random
import numpy as np
import pandas as pd
from onnxruntime import InferenceSession
from skl2onnx import to_onnx
from skl2onnx.common.data_types import FloatTensorType
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
# Number of rows in the synthetic data set.
k = 100
# NOTE(review): this list (digits 1-9 plus NaN) is not referenced again in
# the visible script; the DataFrame column below is generated independently.
sample_data = random.sample((list(range(1, 10)) + [np.nan]) * 1000, k=k)
samples = pd.DataFrame(
    {
        "sample_data": np.random.choice(range(100), size=k).tolist(),
        "target": np.random.choice([0, 1], size=k).tolist(),
    }
)
# The ONNX graph is declared with a float tensor input, so keep the feature
# column as float32 from the start.
samples["sample_data"] = samples["sample_data"].astype("float32")
col = "sample_data"
X = samples[[col]]
y = samples["target"]
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size=0.7, random_state=42, shuffle=True
)
# Reassign instead of dropping in place: X_val is derived from X, and an
# in-place mutation of such a subset can trigger SettingWithCopyWarning.
X_val = X_val.dropna(subset=[col])
# Two encoder configurations that differ only in ``min_frequency``.
params = [
    {
        "handle_unknown": "use_encoded_value",
        "unknown_value": -1,
        "min_frequency": 5,
    },
    {
        "handle_unknown": "use_encoded_value",
        "unknown_value": -1,
    },
]
def test_model(params):
    """Compare an ONNX-converted OrdinalEncoder against the sklearn original.

    An ``OrdinalEncoder`` built from ``params`` is fitted on the module-level
    training split, converted with ``to_onnx`` and run through an
    ``InferenceSession`` on every row of ``samples[[col]]``. The flattened
    ONNX output is printed, followed by a summary line with the number of
    entries whose absolute difference from ``transform`` exceeds 1e-4.

    Args:
        params: Keyword arguments forwarded to ``OrdinalEncoder``.
    """
    encoder = OrdinalEncoder(**params)
    encoder.fit(X_train.values.astype(np.float32), y_train)

    input_spec = [("X", FloatTensorType([None, 1]))]
    onnx_model = to_onnx(encoder, initial_types=input_spec, target_opset=17)
    runtime = InferenceSession(onnx_model.SerializeToString())

    features = samples[[col]].values
    onnx_result = runtime.run(None, {"X": features.astype(np.float32)})[0]
    print(onnx_result.flatten())

    # Count the entries where the two implementations disagree.
    sklearn_result = encoder.transform(features)
    abs_diff = np.abs(onnx_result - sklearn_result)
    mismatches = int(np.count_nonzero(abs_diff > 0.0001))
    print(
        "With params", params,
        "There are",
        mismatches,
        "rows that differ from",
        features.shape[0],
    )
# First run: the configuration that sets ``min_frequency``.
test_model(params[0])
test_model(params[1])

Sample output -- as you can see, the output is the same even though min_frequency is set in one of the tests. This suggests that the parameter is not being respected.
$ python -i ordinal_encoder_only.py
[21. 17. 24. 11. -1. 21. 32. 47. 14. -1. 47. 44. 46. 47. 2. 33. 45. 20.
-1. 4. 39. 30. 45. 23. 15. 43. -1. 3. 44. 21. 15. 45. 29. 35. 33. 42.
7. 6. 35. 18. 29. 33. 6. 26. 4. -1. 20. 8. 1. 25. 0. 37. 39. 44.
20. -1. 25. 12. 40. 35. 27. 19. 44. 21. 22. 5. 18. 43. 33. -1. 40. 31.
-1. -1. 22. 29. -1. 3. 36. 38. 42. 34. 28. -1. 41. 46. 13. 28. -1. 31.
0. 9. 10. 45. 16. 13. 9. 11. 26. 26.]
With params {'handle_unknown': 'use_encoded_value', 'unknown_value': -1, 'min_frequency': 5} There are 86 rows that differ from 100
[21. 17. 24. 11. -1. 21. 32. 47. 14. -1. 47. 44. 46. 47. 2. 33. 45. 20.
-1. 4. 39. 30. 45. 23. 15. 43. -1. 3. 44. 21. 15. 45. 29. 35. 33. 42.
7. 6. 35. 18. 29. 33. 6. 26. 4. -1. 20. 8. 1. 25. 0. 37. 39. 44.
20. -1. 25. 12. 40. 35. 27. 19. 44. 21. 22. 5. 18. 43. 33. -1. 40. 31.
-1. -1. 22. 29. -1. 3. 36. 38. 42. 34. 28. -1. 41. 46. 13. 28. -1. 31.
0. 9. 10. 45. 16. 13. 9. 11. 26. 26.]
With params {'handle_unknown': 'use_encoded_value', 'unknown_value': -1} There are 0 rows that differ from 100

Any pointers to the best way to address this?
Thanks in advance.
kdu-cash
Metadata
Metadata
Assignees
Labels
No labels