Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 2 additions & 5 deletions src/fklearn/training/transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,11 +354,8 @@ def apply_replacements(df: pd.DataFrame,
Default value to replace when original value is not present in the `vec` dict for the feature

"""
def column_categorizer(col: str) -> np.ndarray:
replaced = df[col].map(vec[col])
unseen = df[col].notnull() & replaced.isnull()
replaced[unseen] = replace_unseen
return replaced
def column_categorizer(col: str) -> pd.Series:
return df[col].map(lambda x: vec[col].get(x, replace_unseen), na_action='ignore')
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this faster than .apply? I don't know how map works under the hood, but if it implements a for loop in the backend for the lambda function, then it's just as bad as apply, no?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right; I wasn't benchmarking against the original implementation. This takes about the same time (4.5s for the original, 3.8s for this one). Do you have an example where the original is very slow? That may be a better case to benchmark.


categ_columns = {col: column_categorizer(col) for col in columns}
return df.assign(**categ_columns)
Expand Down
10 changes: 5 additions & 5 deletions tests/training/test_transformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ def test_count_categorizer():
expected_output_test = pd.DataFrame(
{
"feat1_num": [2, 20, 200, 2000],
"feat2_cat": [3.0, 1.0, 1.0, 1.0], # replace unseen vars with constant (1)
"feat2_cat": [3, 1, 1, 1], # replace unseen vars with constant (1)
"feat3_cat": [nan, nan, 3, 3],
}
)
Expand Down Expand Up @@ -537,10 +537,10 @@ def test_label_categorizer():
{
"feat1_num": [2, 20, 200, 2000],
"feat2_cat": [
0.0,
1.0,
1.0,
-99.0,
0,
1,
1,
-99,
], # replace unseen vars with constant (1)
"feat3_cat": [nan, nan, 0, 0],
}
Expand Down