Commit 164985c

ndcg refactor (#1481)

Authored by FelipeAdachi and felipe207
## Description

This PR:

- Adds NDCG value expectations to the tests, not just counts.
- Changes `convert_non_numeric=True` to be used for string columns (integers, floats in scores, and bools should use the default `convert_non_numeric=False`).
- Adds `score_column` support: when a score is available, as in [this example](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html), one should use `score_column` and `target_column`, not `prediction_column` (see the last two tests on the sklearn examples, and the usage sketch below).
- Changes the logic that generates target column values to make it compatible with all scenarios.
- Fixes the prediction and ideal relevance calculation for the non-numeric case.
- Handles the division-by-zero edge case when IDCG = 0 (NDCG is set to 1 if no relevant documents exist).
- If `k` is not passed, metrics are calculated over the prediction column's full length (and named accordingly: `k` is no longer omitted from metric names when `k` is None).

For the numeric case:

- If predictions+target or scores+target columns are both provided, they need to be of the same length.
- If `prediction_column` is provided with `target_column`, the prediction column contains the rank of the suggested items, starting at 1.
- If only the prediction column is provided, the order is assumed to encode the ranks of the recommendations (the first item in the list is the first recommendation), and each value in the list encodes the relevance score.

- [x] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md) and the [Code of Conduct](CODE_OF_CONDUCT.md).

---------

Co-authored-by: felipe207 <[email protected]>
1 parent 0c72856 commit 164985c
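The two calling conventions this PR establishes, as exercised by the tests below — a minimal usage sketch, assuming `log_batch_ranking_metrics` is importable from `whylogs.experimental.api.logger` (the module changed in this commit):

```python
import pandas as pd

from whylogs.experimental.api.logger import log_batch_ranking_metrics

# Convention 1: prediction_column holds each item's rank (starting at 1) and
# target_column holds each item's relevance (data from the Wikipedia test below).
ranking_df = pd.DataFrame({"targets": [[1, 0, 2, 3, 3, 2, 2, 3]], "predictions": [[5, 4, 2, 1, 7, 8, 6, 3]]})
result = log_batch_ranking_metrics(data=ranking_df, prediction_column="predictions", target_column="targets", k=6)

# Convention 2: score_column + target_column; ranks are derived from the raw
# scores internally, and prediction_column is omitted (data from the sklearn test).
scores_df = pd.DataFrame({"scores": [[0.1, 0.2, 0.3, 4, 70]], "true_relevance": [[10, 0, 0, 1, 5]]})
result = log_batch_ranking_metrics(data=scores_df, score_column="scores", target_column="true_relevance")

# With k unset it defaults to the prediction list's length, and the metric
# names keep the suffix, e.g. "norm_dis_cumul_gain_k_5" for five-item lists.
print(result.view().to_pandas().loc["norm_dis_cumul_gain_k_5", "distribution/median"])  # ≈ 0.69569
```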

File tree

2 files changed: +88 −64 lines


python/tests/experimental/api/test_logger.py

Lines changed: 30 additions & 28 deletions
```diff
@@ -22,37 +22,37 @@ def test_log_batch_ranking_metrics_single_simple():
     pandas_summary = result.view().to_pandas()
 
     column_names = [
-        "mean_average_precision",
-        "accuracy",
+        "mean_average_precision_k_3",
+        "accuracy_k_3",
         "mean_reciprocal_rank",
-        "precision",
-        "recall",
+        "precision_k_3",
+        "recall_k_3",
         "top_rank",
-        "average_precision",
-        "norm_dis_cumul_gain",
+        "average_precision_k_3",
+        "norm_dis_cumul_gain_k_3",
     ]
     for col in column_names:
         assert col in pandas_summary.index
-    assert pandas_summary.loc["mean_average_precision", "counts/n"] == 1
-    assert pandas_summary.loc["accuracy", "counts/n"] == 1
+    assert pandas_summary.loc["mean_average_precision_k_3", "counts/n"] == 1
+    assert pandas_summary.loc["accuracy_k_3", "counts/n"] == 1
     assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
-    assert pandas_summary.loc["precision", "counts/n"] == 4
-    assert pandas_summary.loc["recall", "counts/n"] == 4
+    assert pandas_summary.loc["precision_k_3", "counts/n"] == 4
+    assert pandas_summary.loc["recall_k_3", "counts/n"] == 4
     assert pandas_summary.loc["top_rank", "counts/n"] == 4
-    assert pandas_summary.loc["average_precision", "counts/n"] == 4
-    assert pandas_summary.loc["norm_dis_cumul_gain", "counts/n"] == 1
-    assert pandas_summary.loc["average_precision", "counts/n"] == 4
-    assert pandas_summary.loc["norm_dis_cumul_gain", "counts/n"] == 1
+    assert pandas_summary.loc["average_precision_k_3", "counts/n"] == 4
+    assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 1
+    assert pandas_summary.loc["average_precision_k_3", "counts/n"] == 4
+    assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 1
+    # ndcg = [1, 0, 0.63, 0.5]
+    assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_3", "distribution/mean"], 0.53273, abs_tol=0.00001)
 
 
 def test_log_batch_ranking_metrics_binary_simple():
     binary_df = pd.DataFrame(
         {"raw_predictions": [[True, False, True], [False, False, False], [True, True, False], [False, True, False]]}
     )
 
-    result = log_batch_ranking_metrics(
-        data=binary_df, prediction_column="raw_predictions", k=2, convert_non_numeric=True
-    )
+    result = log_batch_ranking_metrics(data=binary_df, prediction_column="raw_predictions", k=2)
     pandas_summary = result.view().to_pandas()
 
     k = 2
```
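The per-row values in the new comment average to the asserted mean (the third value is presumably 1/log2(3) ≈ 0.63093, shown rounded in the comment); a quick arithmetic check:

```python
# Per-row NDCG@3 values from the "# ndcg = [1, 0, 0.63, 0.5]" comment above;
# the logged metric is their mean, hence counts/n == 1 in the profile.
ndcg_per_row = [1.0, 0.0, 0.63093, 0.5]
assert abs(sum(ndcg_per_row) / len(ndcg_per_row) - 0.53273) < 1e-5
```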
```diff
@@ -76,6 +76,8 @@ def test_log_batch_ranking_metrics_binary_simple():
     assert pandas_summary.loc["top_rank", "counts/n"] == 4
     assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 4
     assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
+    # ndcg@2 = [0.613147, 1.0, 1.0, 0.63093]
+    assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "distribution/mean"], 0.81101, abs_tol=0.00001)
 
 
 def test_log_batch_ranking_metrics_multiple_simple():
```
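Where 0.613147 for the first row comes from: with no target column, the new default-target logic treats the booleans as relevances [1, 0, 1] (cast to ints by `_convert_to_int_if_bool` below) and the list positions as ranks [1, 2, 3]. At k = 2, mirroring the committed formulas:

```python
import math

dcg = 1 / math.log2(1 + 1) + 0 / math.log2(2 + 1)   # relevances at ranks 1 and 2
idcg = 1 / math.log2(0 + 2) + 1 / math.log2(1 + 2)  # ideal order [1, 1, 0], truncated to k=2
assert abs(dcg / idcg - 0.613147) < 1e-5
```

The all-False second row has IDCG = 0, which the new guard in `__init__.py` maps to an NDCG of 1.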
```diff
@@ -121,16 +123,14 @@ def test_log_batch_ranking_metrics_multiple_simple():
     assert pandas_summary.loc["top_rank", "counts/n"] == 4
     assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 4
     assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
-
-    assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.76244, abs_tol=0.00001)
+    # ndcg@4 = [0.9197, 0.0, 1.0, 0.386853]
+    assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.57664, abs_tol=0.00001)
 
 
 def test_log_batch_ranking_metrics_default_target():
     multiple_df = pd.DataFrame({"raw_predictions": [[3, 2, 3, 0, 1, 2, 3, 2]]})
 
-    result = log_batch_ranking_metrics(
-        data=multiple_df, prediction_column="raw_predictions", k=3, convert_non_numeric=True
-    )
+    result = log_batch_ranking_metrics(data=multiple_df, prediction_column="raw_predictions", k=3)
     pandas_summary = result.view().to_pandas()
 
     k = 3
```
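The default-target test feeds its relevances through the same path: with no target column, [3, 2, 3, 0, 1, 2, 3, 2] becomes the targets and positions 1..8 become the ranks, which yields the 0.90130 asserted in the next hunk. A sketch of that computation, mirroring the committed formulas:

```python
import math

targets = [3, 2, 3, 0, 1, 2, 3, 2]  # relevances; implicit ranks are 1..8
k = 3
dcg = sum(rel / math.log2(pos + 1) for pos, rel in enumerate(targets[:k], start=1))
idcg = sum(rel / math.log2(pos + 2) for pos, rel in enumerate(sorted(targets, reverse=True)[:k]))
print(round(dcg / idcg, 5))  # 0.90131, within the test's 1e-5 tolerance of 0.90130
```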
```diff
@@ -154,11 +154,13 @@ def test_log_batch_ranking_metrics_default_target():
     assert pandas_summary.loc["top_rank", "counts/n"] == 1
     assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 1
     assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
+    # ndcg@3 = [0.9013]
+    assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.90130, abs_tol=0.00001)
 
 
 def test_log_batch_ranking_metrics_ranking_ndcg_wikipedia():
     # From https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Example
-    ranking_df = pd.DataFrame({"targets": [[3, 2, 3, 0, 1, 2, 3, 2]], "predictions": [[7, 6, 5, 4, 3, 2, 1, 0]]})
+    ranking_df = pd.DataFrame({"targets": [[1, 0, 2, 3, 3, 2, 2, 3]], "predictions": [[5, 4, 2, 1, 7, 8, 6, 3]]})
 
     result = log_batch_ranking_metrics(data=ranking_df, prediction_column="predictions", target_column="targets", k=6)
     pandas_summary = result.view().to_pandas()
```
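The replacement dataframe is the same Wikipedia walk-through, encoded per item: item i has rank predictions[i] and relevance targets[i], so reading relevance off ranks 1 through 6 recovers Wikipedia's ordering [3, 2, 3, 0, 1, 2]. A quick check mirroring the committed formulas:

```python
import math

k = 6
targets = [1, 0, 2, 3, 3, 2, 2, 3]  # relevance of item i
ranks = [5, 4, 2, 1, 7, 8, 6, 3]    # rank of item i

rel_by_rank = [rel for rel, pos in sorted(zip(targets, ranks), key=lambda t: t[1]) if pos <= k]
dcg = sum(rel / math.log2(pos + 1) for pos, rel in enumerate(rel_by_rank, start=1))
idcg = sum(rel / math.log2(pos + 2) for pos, rel in enumerate(sorted(targets, reverse=True)[:k]))
print(round(dcg / idcg, 3))  # 0.785, the NDCG@6 from the Wikipedia example
```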
```diff
@@ -168,19 +170,19 @@
 
 def test_log_batch_ranking_metrics_ranking_ndcg_sklearn():
     # From https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html
-    ranking_df = pd.DataFrame({"predictions": [[0.1, 0.2, 0.3, 4, 70]], "targets": [[10, 0, 0, 1, 5]]})
+    ranking_df = pd.DataFrame({"scores": [[0.1, 0.2, 0.3, 4, 70]], "true_relevance": [[10, 0, 0, 1, 5]]})
 
-    result = log_batch_ranking_metrics(data=ranking_df, prediction_column="predictions", target_column="targets")
+    result = log_batch_ranking_metrics(data=ranking_df, score_column="scores", target_column="true_relevance")
     pandas_summary = result.view().to_pandas()
 
-    assert isclose(pandas_summary.loc["norm_dis_cumul_gain", "distribution/median"], 0.69569, abs_tol=0.00001)
+    assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_5", "distribution/median"], 0.69569, abs_tol=0.00001)
 
 
 def test_log_batch_ranking_metrics_ranking_ndcg_withk_sklearn():
     # From https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html
-    ranking_df = pd.DataFrame({"predictions": [[0.05, 1.1, 1.0, 0.5, 0.0]], "targets": [[10, 0, 0, 1, 5]]})
+    ranking_df = pd.DataFrame({"scores": [[0.05, 1.1, 1.0, 0.5, 0.0]], "true_relevance": [[10, 0, 0, 1, 5]]})
 
-    result = log_batch_ranking_metrics(data=ranking_df, prediction_column="predictions", target_column="targets", k=4)
+    result = log_batch_ranking_metrics(data=ranking_df, score_column="scores", target_column="true_relevance", k=4)
     pandas_summary = result.view().to_pandas()
 
     assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_4", "distribution/median"], 0.35202, abs_tol=0.00001)
```
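The expected values in both sklearn-derived tests come straight from the linked documentation page; with scikit-learn installed they can be cross-checked directly:

```python
import numpy as np
from sklearn.metrics import ndcg_score

true_relevance = np.asarray([[10, 0, 0, 1, 5]])

print(ndcg_score(true_relevance, np.asarray([[0.1, 0.2, 0.3, 4, 70]])))           # ≈ 0.69569
print(ndcg_score(true_relevance, np.asarray([[0.05, 1.1, 1.0, 0.5, 0.0]]), k=4))  # ≈ 0.35202
```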

python/whylogs/experimental/api/logger/__init__.py

Lines changed: 58 additions & 36 deletions
```diff
@@ -10,9 +10,16 @@
 diagnostic_logger = logging.getLogger(__name__)
 
 
+def _convert_to_int_if_bool(data: pd.core.frame.DataFrame, *columns: str) -> pd.core.frame.DataFrame:
+    for col in columns:
+        if all(isinstance(x, bool) for x in data[col]):
+            data[col] = data[col].apply(lambda x: 1 if x else 0)
+    return data
+
+
 def log_batch_ranking_metrics(
     data: pd.core.frame.DataFrame,
-    prediction_column: str,
+    prediction_column: Optional[str] = None,
     target_column: Optional[str] = None,
     score_column: Optional[str] = None,
     k: Optional[int] = None,
```
```diff
@@ -22,26 +29,42 @@ def log_batch_ranking_metrics(
 ) -> ViewResultSet:
     formatted_data = data.copy(deep=True)  # TODO: does this have to be deep?
 
+    if prediction_column is None:
+        if score_column is not None and target_column is not None:
+            prediction_column = "__predictions"
+
+            # Ties are not being handled here
+            formatted_data[prediction_column] = formatted_data[score_column].apply(
+                lambda row: list(np.argsort(np.argsort(-np.array(row))) + 1)
+            )
+        else:
+            raise ValueError("Either prediction_column or score+target columns must be specified")
+
     relevant_cols = [prediction_column]
+
     if target_column is None:
+        formatted_data = _convert_to_int_if_bool(formatted_data, prediction_column)
         target_column = "__targets"
-        formatted_data[target_column] = formatted_data[prediction_column].apply(lambda x: list(range(len(x)))[::-1])
+        # the relevances in predictions are moved to targets, and predictions contains the indices to the target list
+        formatted_data[target_column] = formatted_data[prediction_column]
+        formatted_data[prediction_column] = formatted_data[target_column].apply(
+            lambda row: list(range(1, len(row) + 1))
+        )
+
     relevant_cols.append(target_column)
     if score_column is not None:
         relevant_cols.append(score_column)
-
     for col in relevant_cols:
         if not formatted_data[col].apply(lambda x: type(x) == list).all():
             # wrapping in lists because at least one isn't a list
             # TODO: more error checking
             formatted_data[col] = formatted_data[col].apply(lambda x: [x])
-
     _max_k = formatted_data[prediction_column].apply(len).max()
-
+    if not k:
+        k = _max_k
     formatted_data["count_at_k"] = formatted_data[relevant_cols].apply(
         lambda row: sum([1 if pred_val in row[target_column] else 0 for pred_val in row[prediction_column][:k]]), axis=1
     )
-
     formatted_data["count_all"] = formatted_data[relevant_cols].apply(
         lambda row: sum([1 if pred_val in row[target_column] else 0 for pred_val in row[prediction_column]]), axis=1
     )
```
```diff
@@ -54,12 +77,10 @@ def get_top_rank(row):
         return matches[0]
 
     formatted_data["top_rank"] = formatted_data[relevant_cols].apply(get_top_rank, axis=1)
-
     output_data = (formatted_data["count_at_k"] / (k if k else 1)).to_frame()
     output_data.columns = ["precision" + ("_k_" + str(k) if k else "")]
     output_data["recall" + ("_k_" + str(k) if k else "")] = formatted_data["count_at_k"] / formatted_data["count_all"]
     output_data["top_rank"] = formatted_data["top_rank"]
-
     ki_dict: pd.DataFrame = None
     for ki in range(1, (k if k else _max_k) + 1):
         ki_result = (
```
```diff
@@ -76,41 +97,44 @@ def get_top_rank(row):
             ki_dict.columns = ["p@" + str(ki)]
         else:
             ki_dict["p@" + str(ki)] = ki_result
-
     output_data["average_precision" + ("_k_" + str(k) if k else "")] = ki_dict.mean(axis=1)
 
-    def _convert_non_numeric(row_dict):
-        return (
-            [
-                row_dict[target_column].index(pred_val) if pred_val in row_dict[target_column] else -1
-                for pred_val in row_dict[prediction_column]
-            ],
-            list(range(len(row_dict[prediction_column])))[::-1],
-        )
-
-    if convert_non_numeric:
-        formatted_data[[prediction_column, target_column]] = formatted_data.apply(
-            _convert_non_numeric, result_type="expand", axis=1
-        )
+    def _calc_non_numeric_relevance(row_dict):
+        prediction_relevance = []
+        ideal_relevance = []
+        for target_val in row_dict[prediction_column]:
+            ideal_relevance.append(1 if target_val in row_dict[target_column] else 0)
+            prediction_relevance.append(1 if target_val in row_dict[target_column] else 0)
+        for target_val in row_dict[target_column]:
+            if target_val not in row_dict[prediction_column]:
+                ideal_relevance.append(1)
+        return (prediction_relevance, sorted(ideal_relevance, reverse=True))
 
     def _calculate_row_ndcg(row_dict, k):
-        predicted_order = np.array(row_dict[prediction_column]).argsort()[::-1]
-        target_order = np.array(row_dict[target_column]).argsort()[::-1]
-        dcg_vals = [
-            (rel / math.log(i + 2, 2)) for i, rel in enumerate(np.array(row_dict[target_column])[predicted_order][:k])
-        ]
-        idcg_vals = [
-            (rel / math.log(i + 2, 2)) for i, rel in enumerate(np.array(row_dict[target_column])[target_order][:k])
-        ]
+        if not convert_non_numeric:
+            dcg_vals = [
+                rel / math.log2(pos + 1)
+                for rel, pos in zip(row_dict[target_column], row_dict[prediction_column])
+                if pos <= k
+            ]
+            idcg_vals = [
+                rel / math.log2(pos + 2) for pos, rel in enumerate(sorted(row_dict[target_column], reverse=True)[:k])
+            ]
+        else:
+            predicted_relevances, ideal_relevances = _calc_non_numeric_relevance(row_dict)
+            dcg_vals = [(rel / math.log(i + 2, 2)) for i, rel in enumerate(predicted_relevances[:k])]
+            idcg_vals = [(rel / math.log(i + 2, 2)) for i, rel in enumerate(ideal_relevances[:k])]
+        if sum(idcg_vals) == 0:
+            return 1  # if there is no relevant data, not much the recommender can do
         return sum(dcg_vals) / sum(idcg_vals)
 
-    formatted_data["norm_dis_cumul_gain_k_" + str(k)] = formatted_data.apply(_calculate_row_ndcg, args=(k,), axis=1)
-
+    formatted_data["norm_dis_cumul_gain" + ("_k_" + str(k) if k else "")] = formatted_data.apply(
+        _calculate_row_ndcg, args=(k,), axis=1
+    )
     mAP_at_k = ki_dict.mean()
     hit_ratio = formatted_data["count_at_k"].apply(lambda x: bool(x)).sum() / len(formatted_data)
     mrr = (1 / formatted_data["top_rank"]).replace([np.inf], np.nan).mean()
-    ndcg = formatted_data["norm_dis_cumul_gain_k_" + str(k)].mean()
-
+    ndcg = formatted_data["norm_dis_cumul_gain" + ("_k_" + str(k) if k else "")].mean()
     result = log(pandas=output_data, schema=schema)
     result = result.merge(
         log(
```
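A hand trace of the new `_calc_non_numeric_relevance` on a hypothetical string-valued row (these item IDs are illustrative, not from the tests): prediction relevance marks the hits in prediction order, while the ideal list also counts targets that were never suggested before sorting hits first:

```python
predictions = ["a", "b", "c"]
targets = ["c", "d"]

prediction_relevance = [1 if p in targets else 0 for p in predictions]  # [0, 0, 1]
ideal_relevance = sorted(
    prediction_relevance + [1 for t in targets if t not in predictions],  # "d" was never suggested
    reverse=True,
)  # [1, 1, 0, 0]
```

Fed through the else branch above at k = 3, this gives NDCG = (1/log2(4)) / (1/log2(2) + 1/log2(3)) ≈ 0.3066.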
```diff
@@ -123,8 +147,6 @@ def _calculate_row_ndcg(row_dict, k):
             schema=schema,
         )
     )
-
     if log_full_data:
         result = result.merge(log(pandas=data, schema=schema))
-
     return result
```
