fetch_ml/native/dataset_hash/crypto/sha256_armv8.cpp
Jeremie Fraeys 43d241c28d
feat: implement C++ native libraries for performance-critical operations
- Add arena allocator for zero-allocation hot paths
- Add thread pool for parallel operations
- Add mmap utilities for memory-mapped I/O
- Implement queue_index with heap-based priority queue
- Implement dataset_hash with SIMD support (SHA-NI, ARMv8)
- Add runtime SIMD detection for cross-platform correctness
- Add comprehensive tests and benchmarks
2026-02-16 20:38:04 -05:00

103 lines
3.5 KiB
C++

#include "sha256_base.h"
// ARMv8-A Cryptographic Extensions implementation
#if defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>
// SHA-256 block transform using the ARMv8-A Cryptographic Extension.
// Processes one 512-bit (64-byte) message block and updates `state` in place.
// `state` holds the eight working words as two 4-lane vectors:
// state[0..3] = a,b,c,d and state[4..7] = e,f,g,h.  `K` is the standard
// SHA-256 round-constant table (declared in sha256_base.h).
static void transform_armv8(uint32_t* state, const uint8_t* data) {
    // Load the 512-bit message block as four 128-bit vectors.  Loading via
    // vld1q_u8 on the raw byte pointer avoids the strict-aliasing and
    // alignment hazards of casting `data` to const uint32_t*.
    uint32x4_t w0 = vreinterpretq_u32_u8(vld1q_u8(data));
    uint32x4_t w1 = vreinterpretq_u32_u8(vld1q_u8(data + 16));
    uint32x4_t w2 = vreinterpretq_u32_u8(vld1q_u8(data + 32));
    uint32x4_t w3 = vreinterpretq_u32_u8(vld1q_u8(data + 48));

    // SHA-256 interprets the message as big-endian 32-bit words.
    w0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w0)));
    w1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w1)));
    w2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w2)));
    w3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(w3)));

    // Current hash state.
    uint32x4_t abcd = vld1q_u32(state);
    uint32x4_t efgh = vld1q_u32(state + 4);
    uint32x4_t abcd_orig = abcd;
    uint32x4_t efgh_orig = efgh;

    // One vsha256hq/vsha256h2q pair executes four rounds.  Both intrinsics
    // must see the PRE-round value of the other half of the state, so the
    // old `abcd` is saved before it is overwritten.  (BUG FIX: the original
    // code updated `efgh` first and then fed the already-updated `efgh`
    // into vsha256hq_u32, producing an incorrect digest for every block.)
#define SHA256_ROUNDS4(wk)                        \
    do {                                          \
        uint32x4_t save_ = abcd;                  \
        abcd = vsha256hq_u32(abcd, efgh, (wk));   \
        efgh = vsha256h2q_u32(efgh, save_, (wk)); \
    } while (0)

    uint32x4_t tmp;

    // Rounds 0-15: message words are used directly.
    tmp = vaddq_u32(w0, vld1q_u32(&K[0]));
    SHA256_ROUNDS4(tmp);
    tmp = vaddq_u32(w1, vld1q_u32(&K[4]));
    SHA256_ROUNDS4(tmp);
    tmp = vaddq_u32(w2, vld1q_u32(&K[8]));
    SHA256_ROUNDS4(tmp);
    tmp = vaddq_u32(w3, vld1q_u32(&K[12]));
    SHA256_ROUNDS4(tmp);

    // Rounds 16-63: expand the message schedule four words at a time
    // (vsha256su0q + vsha256su1q implement the sigma0/sigma1 expansion),
    // then run the corresponding four rounds.
    for (int i = 16; i < 64; i += 16) {
        uint32x4_t w4 = vsha256su1q_u32(vsha256su0q_u32(w0, w1), w2, w3);
        tmp = vaddq_u32(w4, vld1q_u32(&K[i]));
        SHA256_ROUNDS4(tmp);

        uint32x4_t w5 = vsha256su1q_u32(vsha256su0q_u32(w1, w2), w3, w4);
        tmp = vaddq_u32(w5, vld1q_u32(&K[i + 4]));
        SHA256_ROUNDS4(tmp);

        uint32x4_t w6 = vsha256su1q_u32(vsha256su0q_u32(w2, w3), w4, w5);
        tmp = vaddq_u32(w6, vld1q_u32(&K[i + 8]));
        SHA256_ROUNDS4(tmp);

        uint32x4_t w7 = vsha256su1q_u32(vsha256su0q_u32(w3, w4), w5, w6);
        tmp = vaddq_u32(w7, vld1q_u32(&K[i + 12]));
        SHA256_ROUNDS4(tmp);

        // Slide the 4-vector schedule window forward by 16 words.
        w0 = w4; w1 = w5; w2 = w6; w3 = w7;
    }
#undef SHA256_ROUNDS4

    // Davies-Meyer feed-forward: add the block's starting state back in.
    vst1q_u32(state, vaddq_u32(abcd, abcd_orig));
    vst1q_u32(state + 4, vaddq_u32(efgh, efgh_orig));
}
// Returns the ARMv8 hardware-accelerated SHA-256 block transform.
// NOTE(review): this branch is selected by a compile-time check only
// (__aarch64__ / _M_ARM64); it does not verify at runtime that the CPU
// actually implements the SHA-2 crypto extensions (e.g. via
// getauxval(AT_HWCAP) & HWCAP_SHA2 on Linux). Presumably the caller
// performs that runtime check before invoking the transform — TODO confirm.
TransformFunc detect_armv8_transform(void) {
return transform_armv8;
}
#else // No ARMv8 support
// Non-ARM64 build: nullptr signals that no ARMv8 transform is available.
TransformFunc detect_armv8_transform(void) { return nullptr; }
#endif