#include "sha256_base.h"
#include <stddef.h>

// ARMv8-A Cryptographic Extensions implementation
#if defined(__aarch64__) || defined(_M_ARM64)

#include <arm_neon.h>

/*
 * Run four SHA-256 rounds on the working state (abcd, efgh) using the
 * pre-added message+constant vector `wk`.
 *
 * NOTE: SHA256H and SHA256H2 must both consume the working state from
 * *before* this round group, so abcd is saved first.  Feeding the updated
 * value of one half into the other produces a wrong digest.
 */
#define SHA256_ROUNDS4(wk)                                \
    do {                                                  \
        uint32x4_t abcd_prev_ = abcd;                     \
        abcd = vsha256hq_u32(abcd, efgh, (wk));           \
        efgh = vsha256h2q_u32(efgh, abcd_prev_, (wk));    \
    } while (0)

/*
 * Compress one 512-bit message block into `state` using the ARMv8-A
 * SHA-256 instructions (SHA256H / SHA256H2 / SHA256SU0 / SHA256SU1).
 *
 * state: the eight 32-bit hash words H0..H7, updated in place.
 * data:  one 64-byte message block; words are big-endian per FIPS 180-4.
 *        No alignment requirement: the block is loaded as bytes.
 * K:     the 64 SHA-256 round constants — assumed declared in
 *        sha256_base.h (not visible here; confirm against the header).
 */
static void transform_armv8(uint32_t* state, const uint8_t* data) {
    // Load the 512-bit block as bytes and byte-swap each 32-bit lane
    // (SHA-256 words are big-endian).  Loading via vld1q_u8 avoids the
    // misaligned uint32_t* cast, which is undefined behavior in C.
    uint32x4_t w0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data)));
    uint32x4_t w1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16)));
    uint32x4_t w2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32)));
    uint32x4_t w3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48)));

    // Load the current hash state; keep a copy for the final feed-forward.
    uint32x4_t abcd = vld1q_u32(state);
    uint32x4_t efgh = vld1q_u32(state + 4);
    const uint32x4_t abcd_orig = abcd;
    const uint32x4_t efgh_orig = efgh;

    // Rounds 0-15: the schedule is just the (byte-swapped) message block.
    uint32x4_t tmp;
    tmp = vaddq_u32(w0, vld1q_u32(&K[0]));
    SHA256_ROUNDS4(tmp);
    tmp = vaddq_u32(w1, vld1q_u32(&K[4]));
    SHA256_ROUNDS4(tmp);
    tmp = vaddq_u32(w2, vld1q_u32(&K[8]));
    SHA256_ROUNDS4(tmp);
    tmp = vaddq_u32(w3, vld1q_u32(&K[12]));
    SHA256_ROUNDS4(tmp);

    // Rounds 16-63: expand the message schedule four words at a time
    // (W[t] from W[t-16..t-1] via SHA256SU0/SU1), then run the rounds.
    for (int i = 16; i < 64; i += 16) {
        uint32x4_t w4 = vsha256su1q_u32(vsha256su0q_u32(w0, w1), w2, w3);
        tmp = vaddq_u32(w4, vld1q_u32(&K[i]));
        SHA256_ROUNDS4(tmp);

        uint32x4_t w5 = vsha256su1q_u32(vsha256su0q_u32(w1, w2), w3, w4);
        tmp = vaddq_u32(w5, vld1q_u32(&K[i + 4]));
        SHA256_ROUNDS4(tmp);

        uint32x4_t w6 = vsha256su1q_u32(vsha256su0q_u32(w2, w3), w4, w5);
        tmp = vaddq_u32(w6, vld1q_u32(&K[i + 8]));
        SHA256_ROUNDS4(tmp);

        uint32x4_t w7 = vsha256su1q_u32(vsha256su0q_u32(w3, w4), w5, w6);
        tmp = vaddq_u32(w7, vld1q_u32(&K[i + 12]));
        SHA256_ROUNDS4(tmp);

        // Slide the 16-word schedule window for the next round group.
        w0 = w4;
        w1 = w5;
        w2 = w6;
        w3 = w7;
    }

    // Davies-Meyer feed-forward: add the input state back in.
    abcd = vaddq_u32(abcd, abcd_orig);
    efgh = vaddq_u32(efgh, efgh_orig);

    vst1q_u32(state, abcd);
    vst1q_u32(state + 4, efgh);
}

#undef SHA256_ROUNDS4

/* Return the hardware-accelerated block transform on ARMv8-A targets. */
TransformFunc detect_armv8_transform(void) {
    return transform_armv8;
}

#else // No ARMv8 crypto support on this target.

/*
 * Hardware SHA-256 is unavailable; return NULL so the caller falls back
 * to another implementation.  (NULL instead of nullptr: valid in both C
 * and C++, whereas nullptr requires C23 or C++11.)
 */
TransformFunc detect_armv8_transform(void) {
    return NULL;
}

#endif