#include "sha256_base.h"

// ARMv8-A Cryptographic Extensions implementation.
// Uses the SHA-256 instructions (SHA256H, SHA256H2, SHA256SU0, SHA256SU1)
// via NEON intrinsics to process one 64-byte block per call.
#if defined(__aarch64__) || defined(_M_ARM64)
#include <arm_neon.h>

// Compress one 64-byte message block into the 8-word hash state.
// state: 8 x uint32_t working hash (native endianness), updated in place.
// data:  64 bytes of message (big-endian words, as SHA-256 specifies).
// Assumes K is the standard 64-entry SHA-256 round-constant table
// declared in sha256_base.h.
static void transform_armv8(uint32_t* state, const uint8_t* data) {
    // Load message and reverse bytes within each 32-bit word
    // (big-endian -> native).
    uint32x4_t w0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data)));
    uint32x4_t w1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 16)));
    uint32x4_t w2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 32)));
    uint32x4_t w3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(data + 48)));

    // Load current hash state (native endianness).
    uint32x4_t abcd = vld1q_u32(state);
    uint32x4_t efgh = vld1q_u32(state + 4);
    uint32x4_t abcd_orig = abcd;
    uint32x4_t efgh_orig = efgh;
    uint32x4_t abcd_pre;

    // Rounds 0-15 with pre-expanded message.
    uint32x4_t k0 = vld1q_u32(&K[0]);
    uint32x4_t k1 = vld1q_u32(&K[4]);
    uint32x4_t k2 = vld1q_u32(&K[8]);
    uint32x4_t k3 = vld1q_u32(&K[12]);

    // Each 4-round step: SHA256H consumes the pre-round ABCD, so it is
    // saved in abcd_pre before SHA256H overwrites it.
    uint32x4_t tmp = vaddq_u32(w0, k0);
    abcd_pre = abcd; // Save pre-round state
    uint32x4_t abcd_new = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
    abcd = abcd_new;

    tmp = vaddq_u32(w1, k1);
    abcd_pre = abcd;
    abcd_new = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
    abcd = abcd_new;

    tmp = vaddq_u32(w2, k2);
    abcd_pre = abcd;
    abcd_new = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
    abcd = abcd_new;

    tmp = vaddq_u32(w3, k3);
    abcd_pre = abcd;
    abcd_new = vsha256hq_u32(abcd, efgh, tmp);
    efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
    abcd = abcd_new;

    // Rounds 16-63: message schedule expansion + rounds,
    // 16 rounds (4 vector steps) per iteration.
    for (int i = 16; i < 64; i += 16) {
        // Schedule expansion for rounds i..i+3
        uint32x4_t w4 = vsha256su0q_u32(w0, w1);
        w4 = vsha256su1q_u32(w4, w2, w3);
        k0 = vld1q_u32(&K[i]);
        tmp = vaddq_u32(w4, k0);
        abcd_pre = abcd;
        abcd_new = vsha256hq_u32(abcd, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
        abcd = abcd_new;

        // Schedule expansion for rounds i+4..i+7
        uint32x4_t w5 = vsha256su0q_u32(w1, w2);
        w5 = vsha256su1q_u32(w5, w3, w4);
        k1 = vld1q_u32(&K[i + 4]);
        tmp = vaddq_u32(w5, k1);
        abcd_pre = abcd;
        abcd_new = vsha256hq_u32(abcd, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
        abcd = abcd_new;

        // Schedule expansion for rounds i+8..i+11
        uint32x4_t w6 = vsha256su0q_u32(w2, w3);
        w6 = vsha256su1q_u32(w6, w4, w5);
        k2 = vld1q_u32(&K[i + 8]);
        tmp = vaddq_u32(w6, k2);
        abcd_pre = abcd;
        abcd_new = vsha256hq_u32(abcd, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
        abcd = abcd_new;

        // Schedule expansion for rounds i+12..i+15
        uint32x4_t w7 = vsha256su0q_u32(w3, w4);
        w7 = vsha256su1q_u32(w7, w5, w6);
        k3 = vld1q_u32(&K[i + 12]);
        tmp = vaddq_u32(w7, k3);
        abcd_pre = abcd;
        abcd_new = vsha256hq_u32(abcd, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_pre, tmp);
        abcd = abcd_new;

        // Rotate working variables for the next 16-round group.
        w0 = w4;
        w1 = w5;
        w2 = w6;
        w3 = w7;
    }

    // Add original state back (Davies-Meyer feed-forward).
    abcd = vaddq_u32(abcd, abcd_orig);
    efgh = vaddq_u32(efgh, efgh_orig);

    // Store result.
    vst1q_u32(state, abcd);
    vst1q_u32(state + 4, efgh);
}

// Returns the hardware-accelerated transform on ARMv8 builds.
TransformFunc detect_armv8_transform(void) {
    return transform_armv8;
}

#else // No ARMv8 support

// No hardware support compiled in; caller must fall back to another path.
TransformFunc detect_armv8_transform(void) {
    return NULL; // NULL, not nullptr: this is C, and nullptr requires C23
}

#endif