Skip to content

Commit 538eb05

Browse files
FelipeAdachi and felipe207 authored
Ranking Metrics - Better Precision/Recall/MRR calculation (#1492)
## Description This PR: - Makes the metrics calculation (other than NDCG and AP, which are already correct) correct for boolean and numeric scenarios - Changes ndcg from a non-mergeable batch metric to a mergeable row-wise metric - Corrects top_rank to return None instead of 0 when there is no relevant item (since lower is better for top_rank, assigning a 0 is not correct) - Changes MRR from a non-mergeable batch metric to a mergeable row-wise metric (and renamed from `Mean Reciprocal Rank` to `Reciprocal Rank`) - Refactors the code to group row-wise statistics calculations functions After these changes, the only metric that makes the ranking metrics non-mergeable is `accuracy_k` - [x] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md) and the [Code of Conduct](CODE_OF_CONDUCT.md). --------- Co-authored-by: felipe207 <[email protected]>
1 parent db15e49 commit 538eb05

File tree

3 files changed

+137
-94
lines changed

3 files changed

+137
-94
lines changed

python/examples/experimental/Writing_Ranking_Performance_Metrics_to_WhyLabs.ipynb

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -206,19 +206,14 @@
206206
" <td>1.000000</td>\n",
207207
" </tr>\n",
208208
" <tr>\n",
209-
" <th>mean_reciprocal_rank</th>\n",
210-
" <td>1</td>\n",
211-
" <td>0.333333</td>\n",
212-
" </tr>\n",
213-
" <tr>\n",
214209
" <th>norm_dis_cumul_gain_k_3</th>\n",
215210
" <td>1</td>\n",
216211
" <td>1.000000</td>\n",
217212
" </tr>\n",
218213
" <tr>\n",
219214
" <th>precision_k_3</th>\n",
220215
" <td>1</td>\n",
221-
" <td>0.333333</td>\n",
216+
" <td>0.666667</td>\n",
222217
" </tr>\n",
223218
" <tr>\n",
224219
" <th>predictions</th>\n",
@@ -231,14 +226,19 @@
231226
" <td>1.000000</td>\n",
232227
" </tr>\n",
233228
" <tr>\n",
229+
" <th>reciprocal_rank</th>\n",
230+
" <td>1</td>\n",
231+
" <td>1.000000</td>\n",
232+
" </tr>\n",
233+
" <tr>\n",
234234
" <th>targets</th>\n",
235235
" <td>1</td>\n",
236236
" <td>0.000000</td>\n",
237237
" </tr>\n",
238238
" <tr>\n",
239239
" <th>top_rank</th>\n",
240240
" <td>1</td>\n",
241-
" <td>3.000000</td>\n",
241+
" <td>1.000000</td>\n",
242242
" </tr>\n",
243243
" </tbody>\n",
244244
"</table>\n",
@@ -249,13 +249,13 @@
249249
"column \n",
250250
"accuracy_k_3 1 1.000000\n",
251251
"average_precision_k_3 1 1.000000\n",
252-
"mean_reciprocal_rank 1 0.333333\n",
253252
"norm_dis_cumul_gain_k_3 1 1.000000\n",
254-
"precision_k_3 1 0.333333\n",
253+
"precision_k_3 1 0.666667\n",
255254
"predictions 1 0.000000\n",
256255
"recall_k_3 1 1.000000\n",
256+
"reciprocal_rank 1 1.000000\n",
257257
"targets 1 0.000000\n",
258-
"top_rank 1 3.000000"
258+
"top_rank 1 1.000000"
259259
]
260260
},
261261
"execution_count": 4,

python/tests/experimental/api/test_logger.py

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def test_log_batch_ranking_metrics_single_simple():
2323

2424
column_names = [
2525
"accuracy_k_3",
26-
"mean_reciprocal_rank",
26+
"reciprocal_rank",
2727
"precision_k_3",
2828
"recall_k_3",
2929
"top_rank",
@@ -33,17 +33,22 @@ def test_log_batch_ranking_metrics_single_simple():
3333
for col in column_names:
3434
assert col in pandas_summary.index
3535
assert pandas_summary.loc["accuracy_k_3", "counts/n"] == 1
36-
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
36+
assert pandas_summary.loc["reciprocal_rank", "counts/n"] == 4
3737
assert pandas_summary.loc["precision_k_3", "counts/n"] == 4
3838
assert pandas_summary.loc["recall_k_3", "counts/n"] == 4
3939
assert pandas_summary.loc["top_rank", "counts/n"] == 4
4040
assert pandas_summary.loc["average_precision_k_3", "counts/n"] == 4
41-
assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 1
4241
assert pandas_summary.loc["average_precision_k_3", "counts/n"] == 4
43-
assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 1
42+
assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 4
4443
# ndcg = [1, 0, 0.63, 0.5]
4544
assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_3", "distribution/mean"], 0.53273, abs_tol=0.00001)
4645
assert isclose(pandas_summary.loc["average_precision_k_3", "distribution/mean"], 0.45833, abs_tol=0.00001)
46+
assert isclose(pandas_summary.loc["precision_k_3", "distribution/mean"], 0.25, abs_tol=0.00001)
47+
assert isclose(pandas_summary.loc["recall_k_3", "distribution/mean"], 1.0, abs_tol=0.00001)
48+
# rr = [1, 0, 0.5, 0.33333]
49+
assert isclose(pandas_summary.loc["reciprocal_rank", "distribution/mean"], 0.45833, abs_tol=0.00001)
50+
assert isclose(pandas_summary.loc["accuracy_k_3", "distribution/mean"], 0.75, abs_tol=0.00001)
51+
assert isclose(pandas_summary.loc["sum_gain_k_3", "distribution/mean"], 0.75, abs_tol=0.00001)
4752

4853

4954
def test_log_batch_ranking_metrics_binary_simple():
@@ -57,7 +62,7 @@ def test_log_batch_ranking_metrics_binary_simple():
5762
k = 2
5863
column_names = [
5964
"accuracy_k_" + str(k),
60-
"mean_reciprocal_rank",
65+
"reciprocal_rank",
6166
"precision_k_" + str(k),
6267
"recall_k_" + str(k),
6368
"top_rank",
@@ -67,16 +72,22 @@ def test_log_batch_ranking_metrics_binary_simple():
6772
for col in column_names:
6873
assert col in pandas_summary.index
6974
assert pandas_summary.loc["accuracy_k_" + str(k), "counts/n"] == 1
70-
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
75+
assert pandas_summary.loc["reciprocal_rank", "counts/n"] == 4
7176
assert pandas_summary.loc["precision_k_" + str(k), "counts/n"] == 4
7277
assert pandas_summary.loc["recall_k_" + str(k), "counts/n"] == 4
7378
assert pandas_summary.loc["top_rank", "counts/n"] == 4
7479
assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 4
75-
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
80+
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 4
7681
# ndcg@2 = [0.613147, 1.0, 1.0, 0.63093]
7782
# average_precision_k_2 = [1.0, 0.0, 1.0, 0.5]
7883
assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "distribution/mean"], 0.81101, abs_tol=0.00001)
7984
assert isclose(pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], 0.62500, abs_tol=0.00001)
85+
assert isclose(pandas_summary.loc["precision_k_" + str(k), "distribution/mean"], 0.5, abs_tol=0.00001)
86+
assert isclose(pandas_summary.loc["recall_k_" + str(k), "distribution/mean"], 0.83333, abs_tol=0.00001)
87+
# rr = [1, 0, 1, 0.5]
88+
assert isclose(pandas_summary.loc["reciprocal_rank", "distribution/mean"], 0.625, abs_tol=0.00001)
89+
assert isclose(pandas_summary.loc["accuracy_k_2", "distribution/mean"], 0.75, abs_tol=0.00001)
90+
assert isclose(pandas_summary.loc["sum_gain_k_2", "distribution/mean"], 1.0, abs_tol=0.00001)
8091

8192

8293
def test_log_batch_ranking_metrics_multiple_simple():
@@ -104,7 +115,7 @@ def test_log_batch_ranking_metrics_multiple_simple():
104115

105116
column_names = [
106117
"accuracy_k_" + str(k),
107-
"mean_reciprocal_rank",
118+
"reciprocal_rank",
108119
"precision_k_" + str(k),
109120
"recall_k_" + str(k),
110121
"top_rank",
@@ -114,16 +125,17 @@ def test_log_batch_ranking_metrics_multiple_simple():
114125
for col in column_names:
115126
assert col in pandas_summary.index
116127
assert pandas_summary.loc["accuracy_k_" + str(k), "counts/n"] == 1
117-
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
128+
assert pandas_summary.loc["reciprocal_rank", "counts/n"] == 4
118129
assert pandas_summary.loc["precision_k_" + str(k), "counts/n"] == 4
119130
assert pandas_summary.loc["recall_k_" + str(k), "counts/n"] == 4
120131
assert pandas_summary.loc["top_rank", "counts/n"] == 4
121132
assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 4
122-
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
133+
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 4
123134
# ndcg@3 = [0.9197, 0.0, 1.0, 0.386853]
124135
# average_precision_k_3 = [0.83, 0.0, 1.0, 0.5]
125-
assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.57664, abs_tol=0.00001)
136+
assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/mean"], 0.57664, abs_tol=0.00001)
126137
assert isclose(pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], 0.58333, abs_tol=0.00001)
138+
assert isclose(pandas_summary.loc["sum_gain_k_" + str(k), "distribution/mean"], 1.25, abs_tol=0.00001)
127139

128140

129141
def test_log_batch_ranking_metrics_default_target():
@@ -135,7 +147,7 @@ def test_log_batch_ranking_metrics_default_target():
135147
k = 3
136148
column_names = [
137149
"accuracy_k_" + str(k),
138-
"mean_reciprocal_rank",
150+
"reciprocal_rank",
139151
"precision_k_" + str(k),
140152
"recall_k_" + str(k),
141153
"top_rank",
@@ -145,7 +157,7 @@ def test_log_batch_ranking_metrics_default_target():
145157
for col in column_names:
146158
assert col in pandas_summary.index
147159
assert pandas_summary.loc["accuracy_k_" + str(k), "counts/n"] == 1
148-
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
160+
assert pandas_summary.loc["reciprocal_rank", "counts/n"] == 1
149161
assert pandas_summary.loc["precision_k_" + str(k), "counts/n"] == 1
150162
assert pandas_summary.loc["recall_k_" + str(k), "counts/n"] == 1
151163
assert pandas_summary.loc["top_rank", "counts/n"] == 1
@@ -155,6 +167,8 @@ def test_log_batch_ranking_metrics_default_target():
155167
assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.90130, abs_tol=0.00001)
156168
# AP assumes binary relevance - this case doesn't raise an error, just a warning, but the result is not meaningful
157169
assert isclose(pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], 1.00000, abs_tol=0.00001)
170+
assert isclose(pandas_summary.loc["accuracy_k_3", "distribution/mean"], 1.0, abs_tol=0.00001)
171+
assert isclose(pandas_summary.loc["sum_gain_k_3", "distribution/mean"], 8.0, abs_tol=0.00001)
158172

159173

160174
def test_log_batch_ranking_metrics_ranking_ndcg_wikipedia():
@@ -195,6 +209,10 @@ def test_log_batch_ranking_metrics_average_precision_sklearn_example():
195209
pandas_summary = result.view().to_pandas()
196210

197211
assert isclose(pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], 0.83333, abs_tol=0.00001)
212+
assert isclose(pandas_summary.loc["precision_k_" + str(k), "distribution/mean"], 0.5, abs_tol=0.00001)
213+
assert isclose(pandas_summary.loc["recall_k_" + str(k), "distribution/mean"], 1.0, abs_tol=0.00001)
214+
assert isclose(pandas_summary.loc["reciprocal_rank", "distribution/mean"], 1.0, abs_tol=0.00001)
215+
assert isclose(pandas_summary.loc["sum_gain_k_" + str(k), "distribution/mean"], 2.0, abs_tol=0.00001)
198216

199217

200218
def test_log_batch_ranking_metrics_average_precision():
@@ -215,3 +233,4 @@ def test_log_batch_ranking_metrics_average_precision():
215233
assert isclose(
216234
pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], res[1], abs_tol=0.00001
217235
)
236+
assert isclose(pandas_summary.loc["reciprocal_rank", "distribution/mean"], 0.45833, abs_tol=0.00001)

0 commit comments

Comments (0)