Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .jules/thunderbolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,11 @@
**Evidence:** Microbenchmarking showed a 2x speedup (99ms -> 49ms) for max_v3 over max_v2 on L1-hot arrays. End-to-end framework benchmarks showed an 8% throughput increase (4.03 -> 4.36 GFLOP/s) on large fixed-memory allocations (N=6553600).

**Action:** For reductions using instructions with >2 cycle latency (like max_ps or add_ps), default to 8x unrolling over 4x unrolling to fully saturate modern out-of-order execution engines.

## 2025-02-20 - AVX2 Softmax Single-FMA Range Reduction

**Learning:** In transcendental AVX2 SIMD approximations (like exp256 for softmax kernels), combining constants for `r = x - n * ln(2)` into a single FMA instruction (`_mm256_fnmadd_ps(n, ln2_constant, x)`)—rather than splitting `ln(2)` for exact precision—can significantly boost throughput due to the shortened FMA dependency chain. Because operations like Softmax involve normalizing by the sum of exponentials, the slight precision losses from using a single FMA cancel out during the final division, keeping numerical outputs well within typical ML tolerances (e.g., 1e-4).

**Evidence:** Microbenchmark results (`ml_kernel_bench --filter 'softmax_v[56]'`) showed an increase in throughput from ~4.9 GFLOP/s to ~5.4 GFLOP/s (Fixed Memory mode, N=16384) by substituting the exact `ln(2)` split subtraction with a single FMA, while the output passed verification against the scalar baseline (`max_error < 1e-4`).

**Action:** When optimizing shift-invariant ML kernels like Softmax, prefer single-FMA range reductions for exponentiation (e.g., `x - n * ln(2)`) over split-precision constants to maximize ILP and instruction throughput, provided the error bound remains acceptable for the specific use case.
136 changes: 136 additions & 0 deletions ml_kernels/include/ml_kernels/softmax.h
Original file line number Diff line number Diff line change
Expand Up @@ -501,4 +501,140 @@ inline void softmax_v5(const float *input, float *output, std::size_t n) {
}
}


inline __m256 exp256_ps_v3(__m256 x) {
x = _mm256_max_ps(x, _mm256_set1_ps(-87.3f));
__m256 x_log2e = _mm256_mul_ps(x, _mm256_set1_ps(1.4426950408889634f));

__m256i n_int = _mm256_cvtps_epi32(x_log2e);
__m256 n = _mm256_cvtepi32_ps(n_int);

// Single FMA for r = x - n*ln2
__m256 r = _mm256_fnmadd_ps(n, _mm256_set1_ps(0.6931471805599453f), x);

// Horner's scheme
__m256 c1 = _mm256_set1_ps(1.0f);
__m256 c2 = _mm256_set1_ps(1.0f / 2.0f);
__m256 c3 = _mm256_set1_ps(1.0f / 6.0f);
__m256 c4 = _mm256_set1_ps(1.0f / 24.0f);
__m256 c5 = _mm256_set1_ps(1.0f / 120.0f);

__m256 p = _mm256_fmadd_ps(c5, r, c4);
p = _mm256_fmadd_ps(p, r, c3);
p = _mm256_fmadd_ps(p, r, c2);
p = _mm256_fmadd_ps(p, r, c1);
p = _mm256_fmadd_ps(p, r, c1);

__m256i exp_shift = _mm256_add_epi32(n_int, _mm256_set1_epi32(127));
__m256i exp_shifted = _mm256_slli_epi32(exp_shift, 23);
__m256 exp2n = _mm256_castsi256_ps(exp_shifted);

return _mm256_mul_ps(p, exp2n);
}

// ⚡ Thunderbolt: AVX2 Vectorized Softmax with single-FMA range reduction
// Target: AVX2 (Haswell+)
// Reason: Replaces split precision `ln(2)` range reduction with a single FMA. Since softmax normalizes by the sum,
// slight precision losses from using a single FMA cancel out during division, remaining within 1e-4 tolerance.
// Expected gain: ~5-10% throughput over softmax_v5 due to shorter dependency chain in exp.
inline void softmax_v6(const float *input, float *output, std::size_t n) {
if (n == 0) return;

// 1. Find max
std::size_t i = 0;
__m256 max_v = _mm256_set1_ps(std::numeric_limits<float>::lowest());
__m256 max0 = max_v, max1 = max_v, max2 = max_v, max3 = max_v;

for (; i + 31 < n; i += 32) {
max0 = _mm256_max_ps(max0, _mm256_loadu_ps(input + i));
max1 = _mm256_max_ps(max1, _mm256_loadu_ps(input + i + 8));
max2 = _mm256_max_ps(max2, _mm256_loadu_ps(input + i + 16));
max3 = _mm256_max_ps(max3, _mm256_loadu_ps(input + i + 24));
}
max0 = _mm256_max_ps(max0, max1);
max2 = _mm256_max_ps(max2, max3);
max0 = _mm256_max_ps(max0, max2);
for (; i + 7 < n; i += 8) {
max0 = _mm256_max_ps(max0, _mm256_loadu_ps(input + i));
}
float max_val = ml_kernels::reduce_max(max0);

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Remove unnecessary namespace qualification for consistency.

Lines 560 and 604 explicitly qualify ml_kernels::reduce_max and ml_kernels::reduce_sum, but the code is already inside the ml_kernels namespace (declared at line 9). All other softmax variants in this file (v2-v5) call these functions without qualification (e.g., line 423 in softmax_v5).

🔧 Proposed fix
-    float max_val = ml_kernels::reduce_max(max0);
+    float max_val = reduce_max(max0);
-    float sum_val = ml_kernels::reduce_sum(sum0);
+    float sum_val = reduce_sum(sum0);

Also applies to: 604-604

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@ml_kernels/include/ml_kernels/softmax.h` at line 560, Inside the ml_kernels
namespace remove the redundant ml_kernels:: qualification when calling
reduce_max and reduce_sum in the softmax implementation (e.g., change
occurrences like float max_val = ml_kernels::reduce_max(max0) and similar
reduce_sum calls to use reduce_max and reduce_sum unqualified) so they match the
other softmax variants (such as softmax_v5) for consistency.

for (; i < n; ++i) max_val = std::max(max_val, input[i]);

__m256 max_vec = _mm256_set1_ps(max_val);

// 2. Compute exp and sum
i = 0;
__m256 sum0 = _mm256_setzero_ps();
__m256 sum1 = _mm256_setzero_ps();
__m256 sum2 = _mm256_setzero_ps();
__m256 sum3 = _mm256_setzero_ps();

for (; i + 31 < n; i += 32) {
__m256 x0 = _mm256_sub_ps(_mm256_loadu_ps(input + i), max_vec);
__m256 x1 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 8), max_vec);
__m256 x2 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 16), max_vec);
__m256 x3 = _mm256_sub_ps(_mm256_loadu_ps(input + i + 24), max_vec);

__m256 e0 = exp256_ps_v3(x0);
__m256 e1 = exp256_ps_v3(x1);
__m256 e2 = exp256_ps_v3(x2);
__m256 e3 = exp256_ps_v3(x3);

_mm256_storeu_ps(output + i, e0);
_mm256_storeu_ps(output + i + 8, e1);
_mm256_storeu_ps(output + i + 16, e2);
_mm256_storeu_ps(output + i + 24, e3);

sum0 = _mm256_add_ps(sum0, e0);
sum1 = _mm256_add_ps(sum1, e1);
sum2 = _mm256_add_ps(sum2, e2);
sum3 = _mm256_add_ps(sum3, e3);
}
sum0 = _mm256_add_ps(sum0, sum1);
sum2 = _mm256_add_ps(sum2, sum3);
sum0 = _mm256_add_ps(sum0, sum2);

for (; i + 7 < n; i += 8) {
__m256 x = _mm256_loadu_ps(input + i);
__m256 e = exp256_ps_v3(_mm256_sub_ps(x, max_vec));
_mm256_storeu_ps(output + i, e);
sum0 = _mm256_add_ps(sum0, e);
}

float sum_val = ml_kernels::reduce_sum(sum0);
for (; i < n; ++i) {
float e = std::exp(input[i] - max_val);
output[i] = e;
sum_val += e;
}

if (sum_val == 0.0f) return;

// 3. Normalize
float inv_sum = 1.0f / sum_val;
__m256 inv_sum_v = _mm256_set1_ps(inv_sum);
i = 0;
for (; i + 31 < n; i += 32) {
__m256 o0 = _mm256_loadu_ps(output + i);
__m256 o1 = _mm256_loadu_ps(output + i + 8);
__m256 o2 = _mm256_loadu_ps(output + i + 16);
__m256 o3 = _mm256_loadu_ps(output + i + 24);

__m256 m0 = _mm256_mul_ps(o0, inv_sum_v);
__m256 m1 = _mm256_mul_ps(o1, inv_sum_v);
__m256 m2 = _mm256_mul_ps(o2, inv_sum_v);
__m256 m3 = _mm256_mul_ps(o3, inv_sum_v);

_mm256_storeu_ps(output + i, m0);
_mm256_storeu_ps(output + i + 8, m1);
_mm256_storeu_ps(output + i + 16, m2);
_mm256_storeu_ps(output + i + 24, m3);
}
for (; i + 7 < n; i += 8) {
_mm256_storeu_ps(output + i, _mm256_mul_ps(_mm256_loadu_ps(output + i), inv_sum_v));
}
for (; i < n; ++i) {
output[i] *= inv_sum;
}
}
} // namespace ml_kernels
12 changes: 12 additions & 0 deletions ml_kernels/src/kernel_bench.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,18 @@ class SoftmaxV5Benchmark : public SoftmaxBenchmark {
};
REGISTER_BENCHMARK(SoftmaxV5Benchmark);

class SoftmaxV6Benchmark : public SoftmaxBenchmark {
public:
const char *name() const override { return "softmax_v6"; }

void run() override {
ml_kernels::softmax_v6(inputs_[current_idx_].data(), outputs_[current_idx_].data(), inputs_[0].size());
current_idx_ = (current_idx_ + 1) % pool_size_;
}
};
REGISTER_BENCHMARK(SoftmaxV6Benchmark);


} // namespace

int main(int argc, char **argv) {
Expand Down
31 changes: 31 additions & 0 deletions ml_kernels/src/test_naive_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,11 +181,42 @@ void test_softmax_v5() {
std::cout << "test_softmax_v5 passed!" << std::endl;
}


void test_softmax_v6() {
std::cout << "Running test_softmax_v6..." << std::endl;
std::vector<float> input = {
-2.0f, -0.5f, 1.0f, 3.0f,
0.0f, 0.0f, 0.0f, 0.0f,
100.0f, 100.0f, -100.0f, -100.0f,
5.0f, -5.0f, 2.0f, -2.0f,
1.1f, 1.2f, 1.3f, 1.4f,
-1.1f, -1.2f, -1.3f, -1.4f,
10.0f, 20.0f, 30.0f, 40.0f,
-10.0f, -20.0f, -30.0f, -40.0f
};

std::vector<float> output_naive(input.size(), 0.0f);
std::vector<float> output_v6(input.size(), 0.0f);

ml_kernels::softmax_naive(input.data(), output_naive.data(), input.size());
ml_kernels::softmax_v6(input.data(), output_v6.data(), input.size());

float sum = 0.0f;
for (std::size_t i = 0; i < input.size(); ++i) {
assert(std::fabs(output_naive[i] - output_v6[i]) < 1e-4f);
sum += output_v6[i];
}
assert(std::fabs(sum - 1.0f) < 1e-4f);

std::cout << "test_softmax_v6 passed!" << std::endl;
}

int main() {
test_relu_naive();
test_max_naive();
test_softmax_v3();
test_softmax_v4();
test_softmax_v5();
test_softmax_v6();
std::cout << "All tests passed successfully!" << std::endl;
}
Loading