fetch_ml/native/dataset_hash/io/file_hash.cpp
Jeremie Fraeys 7efe8bbfbf
native: security hardening, research trustworthiness, and CVE mitigations
Security Fixes:
- CVE-2024-45339: Add O_EXCL flag to temp file creation in storage_write_entries()
  Prevents symlink attacks on predictable .tmp file paths
- CVE-2025-47290: Use openat_nofollow() in storage_open()
  Closes TOCTOU race condition via path_sanitizer infrastructure
- CVE-2025-0838: Add MAX_BATCH_SIZE=10000 to add_tasks()
  Prevents integer overflow in batch operations

Research Trustworthiness (dataset_hash):
- Deterministic file ordering: std::sort after collect_files()
- Recursive directory traversal: depth-limited with cycle detection
- Documented exclusions: hidden files and special files noted in API

Bug Fixes:
- R1: storage_init path validation for non-existent directories
- R2: safe_strncpy return value check before strcat
- R3: parallel_hash 256-file cap replaced with std::vector
- R4: wire qi_compact_index/qi_rebuild_index stubs
- R5: CompletionLatch race condition fix (hold mutex during decrement)
- R6: ARMv8 SHA256 transform fix (save abcd_pre before vsha256hq_u32)
- R7: fuzz_index_storage header format fix
- R8: enforce null termination in add_tasks/update_tasks
- R9: use 64 bytes (not 65) in combined hash to exclude null terminator
- R10: status field persistence in save()

New Tests:
- test_recursive_dataset.cpp: Verify deterministic recursive hashing
- test_storage_symlink_resistance.cpp: Verify CVE-2024-45339 fix
- test_queue_index_batch_limit.cpp: Verify CVE-2025-0838 fix
- test_sha256_arm_kat.cpp: ARMv8 known-answer tests
- test_storage_init_new_dir.cpp: F1 verification (covers bug fix R1)
- test_parallel_hash_large_dir.cpp: F3 verification (covers bug fix R3)
- test_queue_index_compact.cpp: F4 verification (covers bug fix R4)

All 8 native tests passing. Library ready for research lab deployment.
2026-02-21 13:33:45 -05:00

106 lines
2.6 KiB
C++

#include "file_hash.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <string.h>
#include <stdlib.h>
// Chunk size used by the buffered-read fallback when the caller passes 0.
static const size_t kHashFileDefaultBufferSize = 64 * 1024;

// Encode a 32-byte digest as 64 lowercase hex characters followed by '\0'.
// `out_hash` must have room for at least 65 bytes.
static void hash_file_digest_to_hex(const uint8_t digest[32], char* out_hash) {
    static const char hex[] = "0123456789abcdef";
    for (int i = 0; i < 32; i++) {
        out_hash[i * 2] = hex[(digest[i] >> 4) & 0xf];
        out_hash[i * 2 + 1] = hex[digest[i] & 0xf];
    }
    out_hash[64] = '\0';
}

// Compute the SHA-256 of the file at `path` and write it to `out_hash` as a
// 64-character lowercase hex string plus a terminating '\0' (65 bytes total).
//
// path        - file to hash; must not be null.
// buffer_size - chunk size for the buffered-read fallback, used only when
//               mmap() fails. 0 selects kHashFileDefaultBufferSize.
// out_hash    - caller-provided buffer of at least 65 bytes; must not be null.
//               Left untouched on failure.
//
// Returns 0 on success, -1 on a null argument, open/fstat failure, allocation
// failure, or a read error in the fallback path.
int hash_file(const char* path, size_t buffer_size, char* out_hash) {
    if (!path || !out_hash) return -1;

    int fd = open(path, O_RDONLY | O_CLOEXEC);
    if (fd < 0) {
        return -1;
    }

    struct stat st;
    if (fstat(fd, &st) < 0) {
        close(fd);
        return -1;
    }

    Sha256State hasher;
    sha256_init(&hasher);

    int rc = 0;

    // A reported size of 0 is hashed as empty input without touching the data.
    if (st.st_size != 0) {
        const size_t file_size = (size_t)st.st_size;

        // Try memory map first.
        void* mapped = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
        if (mapped != MAP_FAILED) {
            sha256_update(&hasher, (const uint8_t*)mapped, file_size);
            munmap(mapped, file_size);
        } else {
            // Fallback to buffered read. A zero-length buffer would make
            // read() return 0 immediately and look like EOF, so substitute
            // a sane default instead.
            const size_t chunk =
                buffer_size ? buffer_size : kHashFileDefaultBufferSize;
            uint8_t* buffer = (uint8_t*)malloc(chunk);
            if (!buffer) {
                rc = -1;
            } else {
                ssize_t n;
                while ((n = read(fd, buffer, chunk)) > 0) {
                    sha256_update(&hasher, buffer, (size_t)n);
                }
                // n < 0 means the read failed (e.g. EISDIR, EIO); reporting a
                // digest of the partial data would be silently wrong.
                if (n < 0) {
                    rc = -1;
                }
                free(buffer);
            }
        }
    }

    close(fd);
    if (rc != 0) {
        return -1;
    }

    uint8_t result[32];
    sha256_finalize(&hasher, result);
    hash_file_digest_to_hex(result, out_hash);
    return 0;
}
// Convenience wrapper around hash_file() that heap-allocates the output.
//
// Returns a malloc'd, NUL-terminated 64-character hex digest that the caller
// must release with free(), or nullptr if the allocation or hash_file() fails.
char* hash_file_alloc(const char* path, size_t buffer_size) {
    char* hex_digest = (char*)malloc(65);  // 64 hex characters + terminator
    if (hex_digest == nullptr) {
        return nullptr;
    }

    const int status = hash_file(path, buffer_size, hex_digest);
    if (status == 0) {
        return hex_digest;
    }

    // Hashing failed: do not hand back a buffer with undefined contents.
    free(hex_digest);
    return nullptr;
}
// Hash `count` files one after another via hash_file_alloc().
//
// out_hashes[i] receives the result for paths[i]: a heap-allocated hex digest,
// or nullptr if that file failed. A failure does not stop the loop; every
// entry in [0, count) is always assigned.
//
// Returns 0 if every file hashed successfully, -1 if `paths` or `out_hashes`
// is null or at least one file failed.
int hash_files_batch(
    const char* const* paths,
    uint32_t count,
    char** out_hashes,
    size_t buffer_size) {
    if (paths == nullptr || out_hashes == nullptr) {
        return -1;
    }

    int status = 0;
    for (uint32_t idx = 0; idx != count; ++idx) {
        char* digest = hash_file_alloc(paths[idx], buffer_size);
        out_hashes[idx] = digest;
        if (!digest) {
            status = -1;
        }
    }
    return status;
}