@@ -26,8 +26,20 @@ def convert_sklearn_ordinal_encoder(
2626 dimension_idx = 0
2727
2828 # handle the 'handle_unknown=use_encoded_value' case
29+ use_float = (
30+ False
31+ if ordinal_op .unknown_value is None
32+ else isinstance (ordinal_op .unknown_value , float )
33+ or np .isnan (ordinal_op .unknown_value )
34+ )
2935 default_value = (
30- None if ordinal_op .handle_unknown == "error" else int (ordinal_op .unknown_value )
36+ None
37+ if ordinal_op .handle_unknown == "error"
38+ else (
39+ float (ordinal_op .unknown_value )
40+ if use_float
41+ else int (ordinal_op .unknown_value )
42+ )
3143 )
3244
3345 for categories in ordinal_op .categories_ :
@@ -113,43 +125,45 @@ def convert_sklearn_ordinal_encoder(
113125 )
114126
115127 # handle encoded_missing_value
128+ key = "values_floats" if use_float else "values_int64s"
129+ dtype = np .float32 if use_float else np .int64
116130 if not np .isnan (ordinal_op .encoded_missing_value ) and (
117131 isinstance (categories [- 1 ], float ) and np .isnan (categories [- 1 ])
118132 ):
119133 # sklearn always places np.nan as the last entry
120- # in its cathegories if it was in the training data
134+ # in its categories if it was in the training data
121135 # => we simply add the 'ordinal_op.encoded_missing_value'
122136 # as our last entry in 'values_int64s' / 'values_floats' if it was in the training data
123137 encoded_missing_value = np .array (
124138 [int (ordinal_op .encoded_missing_value )]
125- ).astype (np . int64 )
139+ ).astype (dtype )
126140
127141 # handle max_categories or min_frequency
128142 if default_to_infrequent_mappings is not None :
129- attrs ["values_int64s" ] = np .concatenate (
143+ attrs [key ] = np .concatenate (
130144 (
131- np .array (default_to_infrequent_mappings , dtype = np . int64 ),
145+ np .array (default_to_infrequent_mappings , dtype = dtype ),
132146 encoded_missing_value ,
133147 )
134148 )
135149 else :
136- attrs ["values_int64s" ] = np .concatenate (
150+ attrs [key ] = np .concatenate (
137151 (
138- np .arange (len (categories ) - 1 ).astype (np . int64 ),
152+ np .arange (len (categories ) - 1 ).astype (dtype ),
139153 encoded_missing_value ,
140154 )
141155 )
142156 else :
143157 # handle max_categories or min_frequency
144158 if default_to_infrequent_mappings is not None :
145- attrs ["values_int64s" ] = np .array (
146- default_to_infrequent_mappings , dtype = np .int64
147- )
159+ attrs [key ] = np .array (default_to_infrequent_mappings , dtype = dtype )
148160 else :
149- attrs ["values_int64s" ] = np .arange (len (categories )).astype (np . int64 )
161+ attrs [key ] = np .arange (len (categories )).astype (dtype )
150162
151- if default_value :
152- attrs ["default_int64" ] = default_value
163+ if default_value or (
164+ isinstance (default_value , float ) and np .isnan (default_value )
165+ ):
166+ attrs ["default_float" if use_float else "default_int64" ] = default_value
153167
154168 result .append (scope .get_unique_variable_name ("ordinal_output" ))
155169 label_encoder_output = scope .get_unique_variable_name ("label_encoder" )
0 commit comments