Merge pull request #515 from ruby/mvh-improve-rss

eightbitraptor · web-flow · commit 0c63ef798e35 · 2026-06-11T10:56:42.000+01:00
Improve RSS measurement
diff --git a/README.md b/README.md
@@ -284,12 +284,22 @@ after each iteration with the default harness.
 
 ## Measuring memory usage
 
-`--rss` option of `run_benchmarks.rb` allows you to measure RSS after benchmark iterations.
+`--rss` option of `run_benchmarks.rb` allows you to measure RSS (resident set size).
 
 ```
 ./run_benchmarks.rb --rss
 ```
 
+The harness samples RSS once per iteration across the benchmarking window (after
+warmup), so the `RSS (MiB)` column reports the mean RSS during measurement along
+with its iteration-to-iteration variability (`mean ± stddev%`), and the `RSS` ratio
+is computed from those means. The raw per-iteration samples are stored in the JSON
+output under `rss_samples` (bytes).
+
+For reference, the JSON output also keeps `rss`, a single snapshot taken after a
+full GC at the end of the run (the retained set, a lower bound), and `maxrss`, the
+process's lifetime peak from `getrusage`.
+
 ## Rendering a graph
 
 `--graph` option of `run_benchmarks.rb` allows you to render benchmark results as a graph.
diff --git a/harness-gc/harness.rb b/harness-gc/harness.rb
@@ -33,6 +33,7 @@ def gc_stat_heap_delta(before, after)
 
 def run_benchmark(_num_itrs_hint, **, &block)
   times = []
+  rss_samples = []
   marking_times = []
   sweeping_times = []
   gc_counts = []
@@ -82,6 +83,7 @@ def run_benchmark(_num_itrs_hint, **, &block)
     puts itr_str
 
     times << time
+    rss_samples << get_rss
     marking_times << mark_delta
     sweeping_times << sweep_delta
     gc_counts << count_delta
@@ -95,6 +97,8 @@ def run_benchmark(_num_itrs_hint, **, &block)
   bench_range = WARMUP_ITRS..-1
 
   extra = {}
+  rss_bench = rss_samples[bench_range] || []
+  extra["rss_samples"] = rss_bench unless rss_bench.empty?
   extra["gc_marking_time_warmup"] = marking_times[warmup_range]
   extra["gc_marking_time_bench"] = marking_times[bench_range]
   extra["gc_sweeping_time_warmup"] = sweeping_times[warmup_range]
diff --git a/harness-warmup/harness.rb b/harness-warmup/harness.rb
@@ -36,10 +36,12 @@ def print_stats(bench, elapsed)
 def run_benchmark(num_itrs_hint, **)
   start = monotonic_time
   times = []
+  rss_samples = []
 
   begin
     time = Benchmark.realtime { yield }
     times << time
+    rss_samples << get_rss
 
     stats = Stats.new(times)
     median = stats.median
@@ -63,7 +65,9 @@ def run_benchmark(num_itrs_hint, **)
   end until times.size >= MIN_ITERS and elapsed >= MIN_TIME and mad <= threshold
 
   warmup, bench = times[0...times.size/2], times[times.size/2..-1]
-  return_results(warmup, bench)
+  rss_bench = rss_samples[times.size/2..-1] || []
+  extra = rss_bench.empty? ? {} : { "rss_samples" => rss_bench }
+  return_results(warmup, bench, **extra)
 
   print_stats(bench, elapsed)
 end
diff --git a/harness/harness-common.rb b/harness/harness-common.rb
@@ -1,4 +1,5 @@
 require 'rbconfig'
+require_relative '../misc/stats'
 
 # Ensure the ruby in PATH is the ruby running this, so we can safely shell out to other commands
 ruby_in_path = `ruby -e 'print RbConfig.ruby'`
@@ -214,6 +215,17 @@ def return_results(warmup_iterations, bench_iterations, **extra)
     puts "MAXRSS: %.1fMiB" % (maxrss / 1024.0 / 1024.0)
   end
 
+  rss_samples = ruby_bench_results["rss_samples"]
+  if rss_samples.is_a?(Array) && !rss_samples.empty?
+    mib = rss_samples.map { |bytes| bytes / 1024.0 / 1024.0 }
+    stats = Stats.new(mib)
+    median = stats.median
+    mad = stats.median_absolute_deviation(median)
+    puts "RSS sampled (n=%d): median %.1fMiB \u00b1 %.1fMiB (MAD), range [%.1f, %.1f]MiB" % [
+      mib.size, median, mad, stats.min, stats.max
+    ]
+  end
+
   write_json_file(ruby_bench_results)
 end
 
diff --git a/harness/harness.rb b/harness/harness.rb
@@ -34,6 +34,7 @@ def realtime
 # Takes a block as input
 def run_benchmark(_num_itrs_hint, **, &block)
   times = []
+  rss_samples = []
   total_time = 0
   num_itrs = 0
   header = "itr:   time"
@@ -75,10 +76,15 @@ def run_benchmark(_num_itrs_hint, **, &block)
     # We internally save the time in seconds to avoid loss of precision
     times << time
     total_time += time
+    # Sample current RSS between iterations (outside the timed block) so we can
+    # report the working set across the window with variance.
+    rss_samples << get_rss
   end until num_itrs >= WARMUP_ITRS + MIN_BENCH_ITRS and total_time >= MIN_BENCH_TIME
 
   warmup, bench = times[0...WARMUP_ITRS], times[WARMUP_ITRS..-1]
-  return_results(warmup, bench)
+  rss_bench = rss_samples[WARMUP_ITRS..-1] || []
+  extra = rss_bench.empty? ? {} : { "rss_samples" => rss_bench }
+  return_results(warmup, bench, **extra)
 
   non_warmups = times[WARMUP_ITRS..-1]
   if non_warmups.size > 1
diff --git a/lib/results_table_builder.rb b/lib/results_table_builder.rb
@@ -12,6 +12,7 @@ def initialize(executable_names:, bench_data:, include_rss: false, include_pvalu
     @include_pvalue = include_pvalue
     @zjit_stats = zjit_stats || []
     @include_gc = detect_gc_data(bench_data)
+    @rss_has_samples = @include_rss && detect_rss_samples(bench_data)
     @base_name = executable_names.first
     @other_names = executable_names[1..]
     @bench_names = compute_bench_names
@@ -86,7 +87,7 @@ def build_format
 
     @executable_names.each do |_name|
       format << "%s"
-      format << "%.1f" if @include_rss
+      format << (@rss_has_samples ? "%s" : "%.1f") if @include_rss
       @zjit_stats.each { format << "%s" }
       if @include_gc
         format << "%s"
@@ -125,11 +126,15 @@ def build_row(bench_name)
     t0s = extract_first_iteration_times(bench_name)
     times_no_warmup = extract_benchmark_times(bench_name)
     rsss = extract_rss_values(bench_name)
+    rss_series = @rss_has_samples ? extract_rss_series(bench_name) : nil
 
     base_t0, *other_t0s = t0s
     base_t, *other_ts = times_no_warmup
     base_rss, *other_rsss = rsss
 
+    base_rss_cell = rss_cell(base_rss, rss_series && rss_series[0])
+    other_rss_cells = other_rsss.each_index.map { |i| rss_cell(other_rsss[i], rss_series && rss_series[i + 1]) }
+
     # Extract zjit stats: { stat_name => [base_val, other1_val, ...] }
     zjit_stat_values = @zjit_stats.map do |stat|
       [stat, extract_zjit_stat(bench_name, stat)]
@@ -143,8 +148,8 @@ def build_row(bench_name)
     end
 
     row = [bench_name]
-    build_base_columns(row, base_t, base_rss, zjit_stat_values, 0, base_mark, base_sweep)
-    build_comparison_columns(row, other_ts, other_rsss, zjit_stat_values, other_marks, other_sweeps)
+    build_base_columns(row, base_t, base_rss_cell, zjit_stat_values, 0, base_mark, base_sweep)
+    build_comparison_columns(row, other_ts, other_rss_cells, zjit_stat_values, other_marks, other_sweeps)
     build_ratio_columns(row, base_t0, other_t0s, base_t, other_ts)
     build_rss_ratio_columns(row, base_rss, other_rsss)
     build_gc_ratio_columns(row, base_mark, other_marks, base_sweep, other_sweeps)
@@ -162,10 +167,10 @@ def build_base_columns(row, base_t, base_rss, zjit_stat_values, exe_index, base_
     end
   end
 
-  def build_comparison_columns(row, other_ts, other_rsss, zjit_stat_values, other_marks, other_sweeps)
+  def build_comparison_columns(row, other_ts, other_rss_cells, zjit_stat_values, other_marks, other_sweeps)
     other_ts.each_with_index do |other_t, i|
       row << format_time_with_stddev(other_t)
-      row << other_rsss[i] if @include_rss
+      row << other_rss_cells[i] if @include_rss
       zjit_stat_values.each { |_stat, values| row << format_stat(values[i + 1]) }
       if @include_gc
         row << format_time_with_stddev(other_marks[i])
@@ -283,9 +288,38 @@ def extract_benchmark_times(bench_name)
     end
   end
 
+  # Numeric RSS (MiB) per executable, used for the RSS ratio: the mean of the
+  # per-iteration samples when present (matching the displayed value), else `rss`.
   def extract_rss_values(bench_name)
     @executable_names.map do |name|
-      bench_data_for(name, bench_name)['rss'] / BYTES_TO_MIB
+      data = bench_data_for(name, bench_name)
+      samples = data['rss_samples']
+      if samples.is_a?(Array) && !samples.empty?
+        mean(samples) / BYTES_TO_MIB
+      else
+        data['rss'] / BYTES_TO_MIB
+      end
+    end
+  end
+
+  # Per-iteration RSS samples (MiB), one entry per executable; nil for a run without samples.
+  def extract_rss_series(bench_name)
+    @executable_names.map do |name|
+      samples = bench_data_for(name, bench_name)['rss_samples']
+      next nil unless samples.is_a?(Array) && !samples.empty?
+      samples.map { |bytes| bytes / BYTES_TO_MIB }
+    end
+  end
+
+  # Display value for an RSS column: mean ± stddev% when samples exist (matching
+  # the timing columns), otherwise a plain "%.1f" MiB string. When no run in the
+  # suite has samples, returns mean_value unchanged for the legacy "%.1f" format.
+  def rss_cell(mean_value, series)
+    return mean_value unless @rss_has_samples
+    if series && !series.empty?
+      format_time_with_stddev(series)
+    else
+      "%.1f" % mean_value
     end
   end
 
@@ -305,6 +339,12 @@ def detect_gc_data(bench_data)
     bench_data.values.any? { |benchmarks| benchmarks.values.any? { |d| d.is_a?(Hash) && d.key?('gc_marking_time_bench') } }
   end
 
+  def detect_rss_samples(bench_data)
+    bench_data.values.any? do |benchmarks|
+      benchmarks.values.any? { |d| d.is_a?(Hash) && d['rss_samples'].is_a?(Array) && !d['rss_samples'].empty? }
+    end
+  end
+
   def bench_data_for(name, bench_name)
     @bench_data[name][bench_name]
   end
diff --git a/test/results_table_builder_test.rb b/test/results_table_builder_test.rb
@@ -549,4 +549,125 @@
       assert_equal 'fib', bench_names[4]
     end
   end
+
+  describe 'RSS sampling (rss_samples)' do
+    MIB = 1024 * 1024
+
+    it 'shows mean ± stddev% and uses %s format when samples are present' do
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1, 0.1, 0.1],
+            'rss' => 10 * MIB,
+            'rss_samples' => [9 * MIB, 10 * MIB, 11 * MIB]
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: ['ruby'],
+        bench_data: bench_data,
+        include_rss: true
+      )
+
+      table, format = builder.build
+
+      assert_equal ['bench', 'ruby (ms)', 'RSS (MiB)'], table[0]
+      assert_equal ['%s', '%s', '%s'], format
+
+      m = table[1][2].match(/\A(\d+\.\d) ± (\d+\.\d)%\z/)
+      assert m, "expected mean ± stddev%, got #{table[1][2].inspect}"
+      assert_in_delta 10.0, m[1].to_f, 0.1
+      assert_operator m[2].to_f, :>, 0.0
+    end
+
+    it 'computes the RSS ratio from the mean of samples' do
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1, 0.1, 0.1],
+            'rss' => 99 * MIB, # should be ignored in favour of samples
+            'rss_samples' => [10 * MIB, 10 * MIB, 10 * MIB]
+          }
+        },
+        'ruby-yjit' => {
+          'fib' => {
+            'warmup' => [0.05],
+            'bench' => [0.05, 0.05, 0.05],
+            'rss' => 1 * MIB,
+            'rss_samples' => [18 * MIB, 20 * MIB, 22 * MIB]
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: ['ruby', 'ruby-yjit'],
+        bench_data: bench_data,
+        include_rss: true
+      )
+
+      table, _format = builder.build
+
+      # ratio = mean(ruby samples) / mean(yjit samples) = 10 / 20 = 0.5
+      assert_in_delta 0.5, table[1].last, 0.001
+    end
+
+    it 'falls back to a plain MiB value for runs without samples in a mixed suite' do
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1, 0.1],
+            'rss' => 10 * MIB,
+            'rss_samples' => [10 * MIB, 10 * MIB]
+          },
+          'loop' => {
+            'warmup' => [0.2],
+            'bench' => [0.2, 0.2],
+            'rss' => 15 * MIB
+            # no rss_samples for this benchmark
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: ['ruby'],
+        bench_data: bench_data,
+        include_rss: true
+      )
+
+      table, format = builder.build
+
+      # Suite has samples somewhere, so the RSS column is string-formatted.
+      assert_equal ['%s', '%s', '%s'], format
+
+      rows = table[1..].each_with_object({}) { |row, h| h[row[0]] = row }
+      assert_match(/\A\d+\.\d ± \d+\.\d%\z/, rows['fib'][2])
+      # The sample-less benchmark still renders as a bare MiB value.
+      assert_equal '15.0', rows['loop'][2]
+    end
+
+    it 'keeps %.1f formatting when no run in the suite has samples' do
+      bench_data = {
+        'ruby' => {
+          'fib' => {
+            'warmup' => [0.1],
+            'bench' => [0.1],
+            'rss' => 10 * MIB
+          }
+        }
+      }
+
+      builder = ResultsTableBuilder.new(
+        executable_names: ['ruby'],
+        bench_data: bench_data,
+        include_rss: true
+      )
+
+      _table, format = builder.build
+      assert_equal ['%s', '%s', '%.1f'], format
+    end
+  end
 end