#include "sha256_base.h"

// ARMv8-A Cryptographic Extensions implementation.
// Uses the SHA-256 instructions (SHA256H, SHA256H2, SHA256SU0, SHA256SU1)
// via NEON intrinsics to process one 64-byte block per call.
#if defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>

// Compress one 64-byte message block into the 8-word hash state.
// state: 8 x uint32_t working hash (native endianness), updated in place.
// data:  64 bytes of message (big-endian words, as SHA-256 specifies).
// Assumes K is the standard 64-entry SHA-256 round-constant table
// declared in sha256_base.h.
static void transform_armv8(uint32_t* state, const uint8_t* data) {
    // Load message and reverse bytes within each 32-bit word
    // (big-endian -> native).
    uint32x4_t w0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data)));
    uint32x4_t w1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16)));
    uint32x4_t w2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32)));
    uint32x4_t w3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48)));

    // Load current hash state (native endianness).
    uint32x4_t abcd = vld1q_u32(state);
    uint32x4_t efgh = vld1q_u32(state + 4);
    uint32x4_t abcd_orig = abcd;
    uint32x4_t efgh_orig = efgh;
    uint32x4_t abcd_pre;

    // Rounds 0-15 with pre-expanded message.
    uint32x4_t k0 = vld1q_u32(&K[0]);
    uint32x4_t k1 = vld1q_u32(&K[4]);
    uint32x4_t k2 = vld1q_u32(&K[8]);
    uint32x4_t k3 = vld1q_u32(&K[12]);

    // Each 4-round step: SHA256H consumes the pre-round ABCD, so it is
    // saved in abcd_pre before SHA256H overwrites it.
    uint32x4_t tmp = vaddq_u32(w0, k0);
    abcd_pre = abcd; // Save pre-round state
    uint32x4_t abcd_new = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
    abcd = abcd_new;

    tmp = vaddq_u32(w1, k1);
    abcd_pre = abcd;
    abcd_new = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
    abcd = abcd_new;

    tmp = vaddq_u32(w2, k2);
    abcd_pre = abcd;
    abcd_new = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
    abcd = abcd_new;

    tmp = vaddq_u32(w3, k3);
    abcd_pre = abcd;
    abcd_new = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
    abcd = abcd_new;

    // Rounds 16-63: message schedule expansion + rounds,
    // 16 rounds (4 vector steps) per iteration.
    for (int i = 16; i < 64; i += 16) {
        // Schedule expansion for rounds i..i+3
        uint32x4_t w4 = vsha256su0q_u32(w0, w1);
        w4 = vsha256su1q_u32(w4, w2, w3);
        k0 = vld1q_u32(&K[i]);
        tmp = vaddq_u32(w4, k0);
        abcd_pre = abcd;
        abcd_new = vsha256hq_u32(abcd, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
        abcd = abcd_new;

        // Schedule expansion for rounds i+4..i+7
        uint32x4_t w5 = vsha256su0q_u32(w1, w2);
        w5 = vsha256su1q_u32(w5, w3, w4);
        k1 = vld1q_u32(&K[i + 4]);
        tmp = vaddq_u32(w5, k1);
        abcd_pre = abcd;
        abcd_new = vsha256hq_u32(abcd, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
        abcd = abcd_new;

        // Schedule expansion for rounds i+8..i+11
        uint32x4_t w6 = vsha256su0q_u32(w2, w3);
        w6 = vsha256su1q_u32(w6, w4, w5);
        k2 = vld1q_u32(&K[i + 8]);
        tmp = vaddq_u32(w6, k2);
        abcd_pre = abcd;
        abcd_new = vsha256hq_u32(abcd, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
        abcd = abcd_new;

        // Schedule expansion for rounds i+12..i+15
        uint32x4_t w7 = vsha256su0q_u32(w3, w4);
        w7 = vsha256su1q_u32(w7, w5, w6);
        k3 = vld1q_u32(&K[i + 12]);
        tmp = vaddq_u32(w7, k3);
        abcd_pre = abcd;
        abcd_new = vsha256hq_u32(abcd, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
        abcd = abcd_new;

        // Rotate working variables for the next 16-round group.
        w0 = w4;
        w1 = w5;
        w2 = w6;
        w3 = w7;
    }

    // Add original state back (Davies-Meyer feed-forward).
    abcd = vaddq_u32(abcd, abcd_orig);
    efgh = vaddq_u32(efgh, efgh_orig);

    // Store result.
    vst1q_u32(state, abcd);
    vst1q_u32(state + 4, efgh);
}

// Returns the hardware-accelerated transform on ARMv8 builds.
TransformFunc detect_armv8_transform(void) {
    return transform_armv8;
}

#else // No ARMv8 support

// No hardware support compiled in; caller must fall back to another path.
TransformFunc detect_armv8_transform(void) {
    return NULL; // NULL, not nullptr: this is C, and nullptr requires C23
}

#endif