fetch_ml/native/queue_index/storage/index_storage.cpp
Jeremie Fraeys 7efe8bbfbf
native: security hardening, research trustworthiness, and CVE mitigations
Security Fixes:
- CVE-2024-45339: Add O_EXCL flag to temp file creation in storage_write_entries()
  Prevents symlink attacks on predictable .tmp file paths
- CVE-2025-47290: Use openat_nofollow() in storage_open()
  Closes TOCTOU race condition via path_sanitizer infrastructure
- CVE-2025-0838: Add MAX_BATCH_SIZE=10000 to add_tasks()
  Prevents integer overflow in batch operations

Research Trustworthiness (dataset_hash):
- Deterministic file ordering: std::sort after collect_files()
- Recursive directory traversal: depth-limited with cycle detection
- Documented exclusions: hidden files and special files noted in API

Bug Fixes:
- R1: storage_init path validation for non-existent directories
- R2: safe_strncpy return value check before strcat
- R3: parallel_hash 256-file cap replaced with std::vector
- R4: wire qi_compact_index/qi_rebuild_index stubs
- R5: CompletionLatch race condition fix (hold mutex during decrement)
- R6: ARMv8 SHA256 transform fix (save abcd_pre before vsha256hq_u32)
- R7: fuzz_index_storage header format fix
- R8: enforce null termination in add_tasks/update_tasks
- R9: use 64 bytes (not 65) in combined hash to exclude null terminator
- R10: status field persistence in save()

New Tests:
- test_recursive_dataset.cpp: Verify deterministic recursive hashing
- test_storage_symlink_resistance.cpp: Verify CVE-2024-45339 fix
- test_queue_index_batch_limit.cpp: Verify CVE-2025-0838 fix
- test_sha256_arm_kat.cpp: ARMv8 known-answer tests
- test_storage_init_new_dir.cpp: R1 verification
- test_parallel_hash_large_dir.cpp: R3 verification
- test_queue_index_compact.cpp: R4 verification

All 8 native tests passing. Library ready for research lab deployment.
2026-02-21 13:33:45 -05:00

317 lines
9.2 KiB
C++

#include "index_storage.h"
#include "../../common/include/safe_math.h"
#include "../../common/include/path_sanitizer.h"
#include "../../common/include/secure_mem.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <sys/file.h>
using fetchml::common::safe_mul;
using fetchml::common::safe_add;
using fetchml::common::canonicalize_and_validate;
using fetchml::common::safe_strncpy;
using fetchml::common::open_dir_nofollow;
using fetchml::common::openat_nofollow;
// Maximum index file size: 100 MiB (100 * 1024 * 1024 = 104,857,600 bytes)
#define MAX_INDEX_SIZE (100 * 1024 * 1024)
// Maximum safe entries: MAX_INDEX_SIZE / sizeof(DiskEntry)
// (409,600 entries if DiskEntry is 256 bytes -- TODO confirm the entry size in index_storage.h)
#define MAX_SAFE_ENTRIES (MAX_INDEX_SIZE / sizeof(DiskEntry))
// Simple recursive mkdir (replacement for std::filesystem::create_directories).
//
// Creates `path` and any missing intermediate directories with mode 0755.
// Errors on intermediate components are deliberately ignored: the outcome of
// the final component decides the result.
//
// Returns true when the final component was created or already exists as a
// directory. Returns false for a null, empty or over-long path, when the final
// component exists but is not a directory, or on any other mkdir() failure.
static bool mkdir_p(const char* path) {
    if (!path || path[0] == '\0') {
        return false;  // Nothing to create; also keeps the scan below in bounds
    }

    char tmp[4096];
    if (safe_strncpy(tmp, path, sizeof(tmp)) != 0) {
        return false;  // Path too long
    }

    // Remove trailing slashes, but never reduce a lone "/" to an empty string
    size_t len = strlen(tmp);
    while (len > 1 && tmp[len - 1] == '/') {
        tmp[--len] = '\0';
    }

    // Try to create each intermediate component. Start at tmp + 1 so that the
    // leading '/' of an absolute path is not treated as a separator.
    for (char* p = tmp + 1; *p; ++p) {
        if (*p == '/') {
            *p = '\0';
            mkdir(tmp, 0755);
            *p = '/';
        }
    }

    // Final component
    if (mkdir(tmp, 0755) == 0) {
        return true;
    }
    if (errno != EEXIST) {
        return false;
    }

    // mkdir() reports EEXIST for any existing entry; only a directory is usable
    struct stat st;
    return stat(tmp, &st) == 0 && S_ISDIR(st.st_mode);
}
// Reset `storage` and compute the path of the index file for `queue_dir`.
//
// `queue_dir` is split into a parent directory and a final component. The
// parent must already exist and pass canonicalize_and_validate(); the final
// component may not exist yet (first-time init). The resulting path
// "<canonical parent>/<final component>/index.bin" is stored in
// storage->index_path. No file is opened here; storage->fd is set to -1.
//
// Returns false when `storage` or `queue_dir` is null, when a path does not fit
// its buffer, or when the parent directory fails validation.
bool storage_init(IndexStorage* storage, const char* queue_dir) {
    if (!storage) return false;

    memset(storage, 0, sizeof(IndexStorage));
    storage->fd = -1;

    if (!queue_dir) return false;

    // Working copy that can be cut at the last slash
    char work[4096];
    if (safe_strncpy(work, queue_dir, sizeof(work)) != 0) {
        return false;  // Path too long
    }

    // Drop trailing slashes so "/a/b/" splits into parent "/a" and base "b"
    // instead of parent "/a/b" and an empty base. A lone "/" is left alone.
    size_t len = strlen(work);
    while (len > 1 && work[len - 1] == '/') {
        work[--len] = '\0';
    }

    char parent[4096];
    const char* base_name;
    char* last_slash = strrchr(work, '/');
    if (!last_slash) {
        // Bare name: relative to the current directory
        if (safe_strncpy(parent, ".", sizeof(parent)) != 0) return false;
        base_name = work;
    } else if (last_slash == work) {
        // "/name": the parent is the root directory, not an empty string
        if (safe_strncpy(parent, "/", sizeof(parent)) != 0) return false;
        base_name = last_slash + 1;
    } else {
        *last_slash = '\0';
        if (safe_strncpy(parent, work, sizeof(parent)) != 0) return false;
        base_name = last_slash + 1;
    }

    // Validate parent directory (must already exist)
    char canonical_parent[4096];
    if (!canonicalize_and_validate(parent, canonical_parent, sizeof(canonical_parent))) {
        return false;
    }

    // Avoid doubling the separator when the canonical parent already ends in '/'
    const size_t parent_len = strlen(canonical_parent);
    const char* sep =
        (parent_len > 0 && canonical_parent[parent_len - 1] == '/') ? "" : "/";

    // Build index path: canonical_parent + "/" + base_name + "/index.bin"
    int written;
    if (base_name[0] != '\0') {
        written = snprintf(storage->index_path, sizeof(storage->index_path),
                           "%s%s%s/index.bin", canonical_parent, sep, base_name);
    } else {
        written = snprintf(storage->index_path, sizeof(storage->index_path),
                           "%s%sindex.bin", canonical_parent, sep);
    }
    if (written < 0 || (size_t)written >= sizeof(storage->index_path)) {
        return false;  // Path too long
    }
    return true;
}
// Tear down `storage` by delegating to storage_close().
// A null `storage` is accepted and ignored.
void storage_cleanup(IndexStorage* storage) {
    if (storage != nullptr) {
        storage_close(storage);
    }
}
// Open (creating if necessary) the index file named by storage->index_path and
// take an exclusive, non-blocking flock() on it.
//
// The containing directory is created on demand. The file is opened relative
// to a directory descriptor through open_dir_nofollow() + openat_nofollow()
// (CVE-2025-47290). A brand-new (zero-length) file gets an empty header.
//
// Returns false when `storage` is null or already open, when the path has no
// directory component or a component does not fit its buffer, when the file
// cannot be opened or locked, or when writing the initial header fails. On
// failure storage->fd is left at -1.
bool storage_open(IndexStorage* storage) {
    if (!storage || storage->fd >= 0) return false;

    // Split index_path into directory and file name
    char parent[4096];
    if (safe_strncpy(parent, storage->index_path, sizeof(parent)) != 0) {
        return false;  // Path too long
    }
    char* last_slash = strrchr(parent, '/');
    if (!last_slash) {
        return false;  // No directory component in path
    }

    char filename[256];
    if (safe_strncpy(filename, last_slash + 1, sizeof(filename)) != 0) {
        return false;  // File name too long: refuse rather than open a truncated name
    }
    *last_slash = '\0';

    // Best effort: if the directory is still unusable, open_dir_nofollow() fails below
    (void)mkdir_p(parent);

    // Use open_dir_nofollow + openat_nofollow to prevent symlink attacks (CVE-2025-47290)
    int dir_fd = open_dir_nofollow(parent);
    if (dir_fd < 0) {
        return false;
    }
    // O_CLOEXEC matches the temp file in storage_write_entries(): the locked
    // descriptor must not leak into exec'd children.
    storage->fd = openat_nofollow(dir_fd, filename, O_RDWR | O_CREAT | O_CLOEXEC, 0640);
    ::close(dir_fd);
    if (storage->fd < 0) {
        storage->fd = -1;
        return false;
    }

    // Acquire exclusive lock to prevent concurrent corruption
    if (flock(storage->fd, LOCK_EX | LOCK_NB) != 0) {
        ::close(storage->fd);
        storage->fd = -1;
        return false;
    }

    struct stat st;
    if (fstat(storage->fd, &st) < 0) {
        storage_close(storage);
        return false;
    }

    if (st.st_size == 0) {
        // Write header for new file. Zero the whole struct first so that no
        // uninitialized padding bytes end up on disk.
        FileHeader header;
        memset(&header, 0, sizeof(header));
        memcpy(header.magic, INDEX_MAGIC, 4);
        header.version = CURRENT_VERSION;
        header.entry_count = 0;

        if (write(storage->fd, &header, sizeof(header)) !=
            static_cast<ssize_t>(sizeof(header))) {
            // Do not leave a partial header behind: a later open only
            // initializes files whose size is exactly zero.
            if (ftruncate(storage->fd, 0) != 0) {
                // Best effort; the open has failed either way
            }
            storage_close(storage);
            return false;
        }
    }
    return true;
}
// Drop any active mapping via storage_munmap() and close the index file
// descriptor if one is open, resetting storage->fd to -1.
// A null `storage` is accepted and ignored.
void storage_close(IndexStorage* storage) {
    if (storage == nullptr) {
        return;
    }

    storage_munmap(storage);

    if (storage->fd < 0) {
        return;  // Nothing open
    }
    ::close(storage->fd);
    storage->fd = -1;
}
// Read exactly `len` bytes from `fd` at `offset`, retrying short reads and
// EINTR. Returns false on error or if end-of-file is hit first.
static bool pread_full(int fd, void* buf, size_t len, off_t offset) {
    char* dst = static_cast<char*>(buf);
    while (len > 0) {
        const ssize_t got = pread(fd, dst, len, offset);
        if (got < 0) {
            if (errno == EINTR) continue;
            return false;
        }
        if (got == 0) {
            return false;  // Unexpected end of file
        }
        dst += got;
        offset += got;
        len -= static_cast<size_t>(got);
    }
    return true;
}

// Read up to `max_count` entries from the open index file into `out_entries`.
//
// The header is validated first: the magic must match, entry_count must not
// exceed MAX_SAFE_ENTRIES, and the file must be at least large enough to hold
// the entries the header claims. If the file holds more than `max_count`
// entries only the first `max_count` are returned.
//
// `out_count` (optional) receives the number of entries copied on success.
// Returns false on invalid arguments, a failed validation, or an I/O error.
bool storage_read_entries(IndexStorage* storage, DiskEntry* out_entries, size_t max_count, size_t* out_count) {
    if (!storage || storage->fd < 0 || !out_entries) return false;

    FileHeader header;
    if (!pread_full(storage->fd, &header, sizeof(header), 0)) {
        return false;
    }
    if (memcmp(header.magic, INDEX_MAGIC, 4) != 0) {
        return false;
    }

    // Validate entry_count against maximum safe value
    if (header.entry_count > MAX_SAFE_ENTRIES) {
        return false;  // Reject corrupt/malicious index files
    }
    const size_t available = static_cast<size_t>(header.entry_count);

    // Validate file size matches expected size (prevent partial reads)
    struct stat st;
    if (fstat(storage->fd, &st) < 0) {
        return false;
    }
    size_t payload_size;
    size_t expected_size;
    if (!safe_mul(available, sizeof(DiskEntry), &payload_size) ||
        !safe_add(sizeof(FileHeader), payload_size, &expected_size)) {
        return false;  // Overflow in size calculation
    }
    if (st.st_size < 0 || (size_t)st.st_size < expected_size) {
        return false;  // File truncated or corrupt
    }

    const size_t to_read = available < max_count ? available : max_count;

    // Safe multiply for bytes calculation
    size_t bytes;
    if (!safe_mul(to_read, sizeof(DiskEntry), &bytes)) {
        return false;  // Overflow in bytes calculation
    }
    if (bytes > 0 &&
        !pread_full(storage->fd, out_entries, bytes, (off_t)sizeof(FileHeader))) {
        return false;
    }

    if (out_count) {
        *out_count = to_read;
    }
    return true;
}
// Write exactly `len` bytes to `fd`, retrying short writes and EINTR.
// Returns false on any other error.
static bool write_full(int fd, const void* buf, size_t len) {
    const char* src = static_cast<const char*>(buf);
    while (len > 0) {
        const ssize_t put = write(fd, src, len);
        if (put < 0) {
            if (errno == EINTR) continue;
            return false;
        }
        if (put == 0) {
            return false;  // No progress; avoid spinning forever
        }
        src += put;
        len -= static_cast<size_t>(put);
    }
    return true;
}

// Replace the index file with a header plus `count` entries.
//
// The data is written to "<index_path>.tmp", flushed with fsync(), and then
// rename()d over the index file so readers never see a half-written index.
// The temp file is created with O_EXCL (CVE-2024-45339); a stale temp file is
// removed and creation retried once.
//
// rename() makes the temp file's inode the live index, so on success
// storage->fd is switched to that (already flock()ed) descriptor and the old
// one is closed. Without this, later reads through storage->fd would return
// the previous contents and the exclusive lock would sit on an unlinked file.
// An existing mmap is not touched here.
//
// `entries` may be null only when `count` is 0. Counts above MAX_SAFE_ENTRIES
// are rejected because storage_read_entries() would refuse such a file.
// Returns false on invalid arguments or any I/O failure; in that case the
// temp file is removed and the existing index is left as it was.
bool storage_write_entries(IndexStorage* storage, const DiskEntry* entries, size_t count) {
    if (!storage || storage->fd < 0) return false;
    if (!entries && count > 0) return false;
    if (count > MAX_SAFE_ENTRIES) return false;

    // Payload size with checked multiplication
    size_t bytes;
    if (!safe_mul(count, sizeof(DiskEntry), &bytes)) {
        return false;
    }

    // The copy is limited to sizeof - 4 so that ".tmp" always fits
    char tmp_path[4096 + 4];
    if (safe_strncpy(tmp_path, storage->index_path, sizeof(tmp_path) - 4) != 0) {
        return false;  // Path too long
    }
    strcat(tmp_path, ".tmp");

    // Create temp file with O_EXCL to prevent symlink attacks (CVE-2024-45339).
    // O_RDWR because this descriptor becomes storage->fd after the rename.
    const int open_flags = O_RDWR | O_CREAT | O_EXCL | O_CLOEXEC;
    int tmp_fd = ::open(tmp_path, open_flags, 0640);
    if (tmp_fd < 0 && errno == EEXIST) {
        // Stale temp file exists - remove and retry once
        unlink(tmp_path);
        tmp_fd = ::open(tmp_path, open_flags, 0640);
    }
    if (tmp_fd < 0) {
        return false;
    }

    // Zero the whole header so no uninitialized padding reaches the disk
    FileHeader header;
    memset(&header, 0, sizeof(header));
    memcpy(header.magic, INDEX_MAGIC, 4);
    header.version = CURRENT_VERSION;
    header.entry_count = static_cast<decltype(header.entry_count)>(count);

    // Lock before the file becomes visible under its final name, write and
    // flush everything, then publish atomically.
    const bool published =
        flock(tmp_fd, LOCK_EX | LOCK_NB) == 0 &&
        write_full(tmp_fd, &header, sizeof(header)) &&
        (bytes == 0 || write_full(tmp_fd, entries, bytes)) &&
        fsync(tmp_fd) == 0 &&
        rename(tmp_path, storage->index_path) == 0;

    if (!published) {
        ::close(tmp_fd);
        unlink(tmp_path);
        return false;
    }

    // Follow the live inode; closing the old descriptor releases its lock
    const int old_fd = storage->fd;
    storage->fd = tmp_fd;
    ::close(old_fd);
    return true;
}
// Map the whole index file read-only (MAP_PRIVATE) for zero-copy access.
//
// Any previous mapping is released first. A file no larger than the header is
// treated as empty: the call succeeds and leaves no mapping in place. Files
// larger than MAX_INDEX_SIZE are rejected.
//
// After mapping, the header is validated: the magic must match and
// entry_count must fit inside the mapped bytes, so callers that iterate
// storage_mmap_entries() up to storage_mmap_entry_count() stay in bounds.
//
// Returns false on invalid arguments, fstat()/mmap() failure, or a failed
// header validation. On every failure mmap_ptr is nullptr and mmap_size is 0.
bool storage_mmap_for_read(IndexStorage* storage) {
    if (!storage || storage->fd < 0) return false;

    storage_munmap(storage);
    // Start from a clean state even if an earlier failure left MAP_FAILED behind
    storage->mmap_ptr = nullptr;
    storage->mmap_size = 0;

    struct stat st;
    if (fstat(storage->fd, &st) < 0) {
        return false;
    }
    if (st.st_size <= (off_t)sizeof(FileHeader)) {
        return true;  // Empty but valid
    }
    if (st.st_size > (off_t)MAX_INDEX_SIZE) {
        return false;  // File too large
    }

    const size_t map_len = (size_t)st.st_size;
    void* addr = mmap(nullptr, map_len, PROT_READ, MAP_PRIVATE, storage->fd, 0);
    if (addr == MAP_FAILED) {
        return false;
    }

    // Reject corrupt/malicious headers before exposing the mapping
    const FileHeader* header = static_cast<const FileHeader*>(addr);
    const size_t capacity = (map_len - sizeof(FileHeader)) / sizeof(DiskEntry);
    if (memcmp(header->magic, INDEX_MAGIC, 4) != 0 || header->entry_count > capacity) {
        munmap(addr, map_len);
        return false;
    }

    storage->mmap_ptr = addr;
    storage->mmap_size = map_len;
    return true;
}
// Release the current mapping, if there is one, and clear mmap_ptr/mmap_size.
// A null `storage`, a null mmap_ptr, or a mmap_ptr equal to MAP_FAILED leaves
// everything untouched.
void storage_munmap(IndexStorage* storage) {
    if (storage == nullptr) {
        return;
    }

    void* const mapping = storage->mmap_ptr;
    if (mapping == nullptr || mapping == MAP_FAILED) {
        return;  // Nothing mapped
    }

    munmap(mapping, storage->mmap_size);
    storage->mmap_ptr = nullptr;
    storage->mmap_size = 0;
}
// Return a pointer to the first entry inside the current mapping, i.e. the
// address sizeof(FileHeader) bytes past the start of the mapped file.
// Returns nullptr when `storage` is null or no valid mapping is present.
const DiskEntry* storage_mmap_entries(IndexStorage* storage) {
    if (storage == nullptr) {
        return nullptr;
    }

    const void* base = storage->mmap_ptr;
    if (base == nullptr || base == MAP_FAILED) {
        return nullptr;
    }

    const uint8_t* first_entry = static_cast<const uint8_t*>(base) + sizeof(FileHeader);
    return reinterpret_cast<const DiskEntry*>(first_entry);
}
// Return the entry count recorded in the mapped file's header.
//
// The value comes straight from disk, so it is checked against the number of
// whole entries that fit in the mapped bytes. A header claiming more entries
// than the mapping can hold is treated as corrupt and reported as 0, which
// keeps callers from indexing past the end of the mapping.
//
// Returns 0 when `storage` is null, no valid mapping is present, the mapping
// is smaller than a header, or the recorded count is out of range.
size_t storage_mmap_entry_count(IndexStorage* storage) {
    if (!storage || !storage->mmap_ptr || storage->mmap_ptr == MAP_FAILED) return 0;
    if (storage->mmap_size < sizeof(FileHeader)) return 0;

    const FileHeader* header = static_cast<const FileHeader*>(storage->mmap_ptr);
    const size_t capacity = (storage->mmap_size - sizeof(FileHeader)) / sizeof(DiskEntry);
    if (header->entry_count > capacity) {
        return 0;  // Corrupt header: count exceeds what is actually mapped
    }
    return static_cast<size_t>(header->entry_count);
}