Basic implementation of excluding outliers

PragTob · PragTob · commit 6d421089efa8 · 2025-10-14T21:46:57.000+02:00
Seems to all work fine. But do add a test - although I figure
that one will be hard to do "properly".
diff --git a/README.md b/README.md
@@ -136,6 +136,8 @@ In addition, you can optionally output an extended set of statistics:
 * **sample size** - the number of measurements taken
 * **mode**        - the measured values that occur the most. Often one value, but can be multiple values if they occur exactly as often. If no value occurs at least twice, this value will be `nil`.
 
+Benchee can also [remove outliers](#remove-outliers).
+
 ## Installation
 
 Add `:benchee` to your list of dependencies in `mix.exs`:
@@ -303,6 +305,7 @@ So, what happens if a function executes too fast for Benchee to measure? If Benc
 * essentially every single measurement is now an average across 10 runs making lots of statistics less meaningful
 
 Benchee will print a big warning when this happens.
+
 #### Measuring Memory Consumption
 
 Starting with version 0.13, users can now get measurements of how much memory their benchmarked scenarios use. The measurement is **limited to the process that Benchee executes your provided code in** - i.e. other processes (like worker pools)/the whole BEAM isn't taken into account.
@@ -542,6 +545,21 @@ Enum."-map/2-lists^map/1-0-"/2                  10001 26.38 2282    0.23
 
 **Note about after_each hooks:** `after_each` hooks currently don't work when profiling a function, as they are not passed the return value of the function after the profiling run. It's already fixed on the elixir side and is waiting for release, likely in 1.14. It should then just work.
 
+### Remove Outliers
+
+Benchee can remove outliers from the gathered samples.
+Outliers are determined by percentiles/quantiles (we follow [this approach](https://en.wikipedia.org/wiki/Interquartile_range#Outliers)).
+
+You can simply pass `exclude_outliers: true` to Benchee to trigger the removal of outliers.
+
+```elixir
+Benchee.run(jobs, exclude_outliers: true)
+```
+
+The outliers themselves (i.e. the samples that have been determined to be outliers)
+as well as the lower/upper bounds beyond which samples are considered outliers are accessible
+in the `Benchee.Statistics` struct.
+
 ### Saving, loading and comparing previous runs
 
 Benchee can store the results of previous runs in a file and then load them again to compare them. For example this is useful to compare what was recorded on the main branch against a branch with performance improvements. You may also use this to benchmark across different elixir/erlang versions.
diff --git a/lib/benchee/configuration.ex b/lib/benchee/configuration.ex
@@ -48,7 +48,8 @@ defmodule Benchee.Configuration do
             # It also generates less than 1GB in data (some of which is garbage collected/
             # not necessarily all in RAM at the same time) - which seems reasonable enough.
             # see `samples/statistics_performance.exs` and also maybe run it yourself.
-            max_sample_size: 1_000_000
+            max_sample_size: 1_000_000,
+            exclude_outliers: false
 
   @typedoc """
   The configuration supplied by the user as either a map or a keyword list
@@ -152,6 +153,11 @@ defmodule Benchee.Configuration do
     This is used to limit memory consumption and unnecessary processing - 1 Million samples is plenty.
     This limit also applies to number of iterations done during warmup.
     You can set your own number or set it to `nil` if you don't want any limit.
+    * `exclude_outliers` - whether or not statistical outliers should be excluded from the calculated statistics.
+    Defaults to `false`.
+    When enabled, values that are far outside the usual range (as determined by the percentiles/quantiles) will
+    be removed from the gathered samples and the calculated statistics. You might want to enable this if you
+    don't want events such as garbage collection runs to influence your results as much.
   """
   @type user_configuration :: map | keyword
 
@@ -183,7 +189,8 @@ defmodule Benchee.Configuration do
           measure_function_call_overhead: boolean,
           title: String.t() | nil,
           profile_after: boolean | atom | {atom, keyword},
-          max_sample_size: pos_integer()
+          max_sample_size: pos_integer(),
+          exclude_outliers: boolean()
         }
 
   @time_keys [:time, :warmup, :memory_time, :reduction_time]
diff --git a/lib/benchee/statistics.ex b/lib/benchee/statistics.ex
@@ -121,7 +121,7 @@ defmodule Benchee.Statistics do
       ...>     input: "Input"
       ...>   }
       ...> ]
-      ...> 
+      ...>
       ...> suite = %Benchee.Suite{scenarios: scenarios}
       ...> statistics(suite, Benchee.Test.FakeProgressPrinter)
       %Benchee.Suite{
@@ -179,15 +179,17 @@ defmodule Benchee.Statistics do
     printer.calculating_statistics(suite.configuration)
 
     percentiles = suite.configuration.percentiles
+    exclude_outliers? = suite.configuration.exclude_outliers
 
     update_in(suite.scenarios, fn scenarios ->
-      scenario_statistics = compute_statistics_in_parallel(scenarios, percentiles)
+      scenario_statistics =
+        compute_statistics_in_parallel(scenarios, percentiles, exclude_outliers?)
 
       update_scenarios_with_statistics(scenarios, scenario_statistics)
     end)
   end
 
-  defp compute_statistics_in_parallel(scenarios, percentiles) do
+  defp compute_statistics_in_parallel(scenarios, percentiles, exclude_outliers?) do
     scenarios
     |> Enum.map(fn scenario ->
       # we filter down the data here to avoid sending the input and benchmarking function to
@@ -200,7 +202,7 @@ defmodule Benchee.Statistics do
     # async_stream as we might run a ton of scenarios depending on the benchmark
     |> Task.async_stream(
       fn scenario_collection_data ->
-        calculate_scenario_statistics(scenario_collection_data, percentiles)
+        calculate_scenario_statistics(scenario_collection_data, percentiles, exclude_outliers?)
       end,
       timeout: :infinity,
       ordered: true
@@ -235,27 +237,33 @@ defmodule Benchee.Statistics do
     end)
   end
 
-  defp calculate_scenario_statistics({run_time_data, memory_data, reductions_data}, percentiles) do
+  defp calculate_scenario_statistics(
+         {run_time_data, memory_data, reductions_data},
+         percentiles,
+         exclude_outliers?
+       ) do
     run_time_stats =
       run_time_data.samples
-      |> calculate_statistics(percentiles)
+      |> calculate_statistics(percentiles, exclude_outliers?)
       |> add_ips
 
-    memory_stats = calculate_statistics(memory_data.samples, percentiles)
-    reductions_stats = calculate_statistics(reductions_data.samples, percentiles)
+    memory_stats = calculate_statistics(memory_data.samples, percentiles, exclude_outliers?)
+
+    reductions_stats =
+      calculate_statistics(reductions_data.samples, percentiles, exclude_outliers?)
 
     {run_time_stats, memory_stats, reductions_stats}
   end
 
-  defp calculate_statistics([], _) do
+  defp calculate_statistics([], _, _) do
     %__MODULE__{
       sample_size: 0
     }
   end
 
-  defp calculate_statistics(samples, percentiles) do
+  defp calculate_statistics(samples, percentiles, exclude_outliers?) do
     samples
-    |> Statistex.statistics(percentiles: percentiles)
+    |> Statistex.statistics(percentiles: percentiles, exclude_outliers: exclude_outliers?)
     |> convert_from_statistex
   end
 
diff --git a/samples/outlier_removal.exs b/samples/outlier_removal.exs
@@ -0,0 +1,21 @@
+list = Enum.to_list(1..10_000)
+map_fun = fn i -> [i, i * i] end
+
+suite =
+  Benchee.run(
+    %{
+      "flat_map" => fn -> Enum.flat_map(list, map_fun) end,
+      "map.flatten" => fn -> list |> Enum.map(map_fun) |> List.flatten() end
+    },
+    formatters: [{Benchee.Formatters.Console, extended_statistics: true}],
+    exclude_outliers: true
+  )
+
+suite.scenarios
+|> Enum.map(fn scenario ->
+  statistics = scenario.run_time_data.statistics
+
+  {scenario.name, length(statistics.outliers), statistics.outliers,
+   statistics.lower_outlier_bound, statistics.upper_outlier_bound}
+end)
+|> IO.inspect()