fetch_ml/native/tests/test_recursive_dataset.cpp
Jeremie Fraeys 7efe8bbfbf
native: security hardening, research trustworthiness, and CVE mitigations
Security Fixes:
- CVE-2024-45339: Add O_EXCL flag to temp file creation in storage_write_entries()
  Prevents symlink attacks on predictable .tmp file paths
- CVE-2025-47290: Use openat_nofollow() in storage_open()
  Closes TOCTOU race condition via path_sanitizer infrastructure
- CVE-2025-0838: Add MAX_BATCH_SIZE=10000 to add_tasks()
  Prevents integer overflow in batch operations

Research Trustworthiness (dataset_hash):
- Deterministic file ordering: std::sort after collect_files()
- Recursive directory traversal: depth-limited with cycle detection
- Documented exclusions: hidden files and special files noted in API

Bug Fixes:
- R1: storage_init path validation for non-existent directories
- R2: safe_strncpy return value check before strcat
- R3: parallel_hash 256-file cap replaced with std::vector
- R4: wire qi_compact_index/qi_rebuild_index stubs
- R5: CompletionLatch race condition fix (hold mutex during decrement)
- R6: ARMv8 SHA256 transform fix (save abcd_pre before vsha256hq_u32)
- R7: fuzz_index_storage header format fix
- R8: enforce null termination in add_tasks/update_tasks
- R9: use 64 bytes (not 65) in combined hash to exclude null terminator
- R10: status field persistence in save()

New Tests:
- test_recursive_dataset.cpp: Verify deterministic recursive hashing
- test_storage_symlink_resistance.cpp: Verify CVE-2024-45339 fix
- test_queue_index_batch_limit.cpp: Verify CVE-2025-0838 fix
- test_sha256_arm_kat.cpp: ARMv8 known-answer tests
- test_storage_init_new_dir.cpp: R1 verification
- test_parallel_hash_large_dir.cpp: R3 verification
- test_queue_index_compact.cpp: R4 verification

All 8 native tests passing. Library ready for research lab deployment.
2026-02-21 13:33:45 -05:00

180 lines
4.7 KiB
C++

#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <sys/stat.h>
#include <unistd.h>
#include <limits.h>
#include "../native/dataset_hash/dataset_hash.h"
// Returns the process's current working directory as reported by getcwd(),
// or an empty string when getcwd() fails (e.g. the path does not fit in
// PATH_MAX bytes).
static std::string get_cwd() {
    char cwd_buf[PATH_MAX];
    const char* resolved = getcwd(cwd_buf, sizeof cwd_buf);
    return (resolved != nullptr) ? std::string(resolved) : std::string();
}
// Test helper: create (or truncate) the file at `path` and write `content`
// to it as text.
//
// Returns 0 only when the file was opened, the content was written, and the
// stream was closed without error; returns -1 otherwise (including when
// `path` or `content` is NULL). On failure a partially written file may be
// left behind; callers are expected to unlink it during their own cleanup.
static int create_file(const char* path, const char* content) {
    if (!path || !content) return -1;

    FILE* f = fopen(path, "w");
    if (!f) return -1;

    // stdio buffers the data, so a write error (disk full, I/O error) may
    // only surface when fclose() flushes. Check both calls, and always close
    // the stream even if the write already failed.
    int write_ok = (fputs(content, f) != EOF);
    int close_ok = (fclose(f) == 0);

    return (write_ok && close_ok) ? 0 : -1;
}
// Test: Recursive dataset hashing.
//
// Builds the following tree inside a fresh mkdtemp() directory under the
// current working directory:
//   <base>/z_file.txt
//   <base>/subdir/a_file.txt
//   <base>/subdir/b_file.txt
//   <base>/subdir/deeper/deep_file.txt
// then hashes <base> twice with the same context and requires a
// 64-character result that is identical on both runs.
//
// NOTE(review): this checks digest length and repeatability only; it does
// not by itself prove that the nested files contribute to the hash.
//
// Returns 0 on success, -1 on any setup or verification failure. All files,
// directories, hash strings and the hashing context are released on every
// path once the temporary directory has been created.
static int test_recursive_hashing() {
    const std::string cwd = get_cwd();
    if (cwd.empty()) return -1;

    // mkdtemp() rewrites the template in place, so it needs a writable
    // buffer. Reject a template that was truncated rather than creating a
    // directory with an unexpected name.
    char base_dir[4096];
    const int written = snprintf(base_dir, sizeof(base_dir),
                                 "%s/test_recursive_XXXXXX", cwd.c_str());
    if (written < 0 || static_cast<size_t>(written) >= sizeof(base_dir)) return -1;
    if (mkdtemp(base_dir) == nullptr) return -1;

    // Derived paths are built with std::string so they cannot be truncated.
    const std::string root(base_dir);
    const std::string subdir = root + "/subdir";
    const std::string deeper = subdir + "/deeper";
    const std::string path_z = root + "/z_file.txt";
    const std::string path_b = subdir + "/b_file.txt";
    const std::string path_a = subdir + "/a_file.txt";
    const std::string path_deep = deeper + "/deep_file.txt";

    int rc = -1;
    fh_context_t* ctx = nullptr;
    char* hash1 = nullptr;
    char* hash2 = nullptr;

    // Single-pass block: any failure breaks out to the shared cleanup below.
    do {
        if (mkdir(subdir.c_str(), 0755) != 0) break;
        if (mkdir(deeper.c_str(), 0755) != 0) break;

        if (create_file(path_z.c_str(), "z content") != 0 ||
            create_file(path_b.c_str(), "b content") != 0 ||
            create_file(path_a.c_str(), "a content") != 0 ||
            create_file(path_deep.c_str(), "deep content") != 0) {
            break;
        }

        ctx = fh_init(0);
        if (!ctx) break;

        hash1 = fh_hash_directory(ctx, base_dir);
        if (!hash1 || strlen(hash1) != 64) break;

        // Hash again - should produce an identical result (deterministic).
        hash2 = fh_hash_directory(ctx, base_dir);
        if (!hash2 || strcmp(hash1, hash2) != 0) break;

        rc = 0;
    } while (false);

    // Shared cleanup. Hash strings are freed even when verification failed
    // (the earlier per-branch cleanup leaked them in those cases).
    if (hash1) fh_free_string(hash1);
    if (hash2) fh_free_string(hash2);
    if (ctx) fh_cleanup(ctx);

    // unlink()/rmdir() on entries that were never created simply fail, so
    // the same teardown is safe regardless of how far setup progressed.
    unlink(path_deep.c_str());
    unlink(path_a.c_str());
    unlink(path_b.c_str());
    unlink(path_z.c_str());
    rmdir(deeper.c_str());
    rmdir(subdir.c_str());
    rmdir(base_dir);
    return rc;
}
// Test: Empty nested directories.
//
// Creates a fresh mkdtemp() directory under the current working directory
// containing one empty subdirectory ("empty_sub") and one regular file
// ("only_file.txt"), hashes it, and requires a 64-character result.
//
// Returns 0 on success, -1 on any setup or verification failure. All files,
// directories, the hash string and the hashing context are released on
// every path once the temporary directory has been created.
static int test_empty_nested_dirs() {
    const std::string cwd = get_cwd();
    // Without this guard an empty cwd would yield "/test_empty_XXXXXX",
    // i.e. an attempt to create the directory at the filesystem root.
    if (cwd.empty()) return -1;

    // mkdtemp() rewrites the template in place, so it needs a writable
    // buffer; reject a truncated template.
    char base_dir[4096];
    const int written = snprintf(base_dir, sizeof(base_dir),
                                 "%s/test_empty_XXXXXX", cwd.c_str());
    if (written < 0 || static_cast<size_t>(written) >= sizeof(base_dir)) return -1;
    if (mkdtemp(base_dir) == nullptr) return -1;

    // Derived paths are built with std::string so they cannot be truncated.
    const std::string root(base_dir);
    const std::string empty_subdir = root + "/empty_sub";
    const std::string path = root + "/only_file.txt";

    int rc = -1;
    fh_context_t* ctx = nullptr;
    char* hash = nullptr;

    // Single-pass block: any failure breaks out to the shared cleanup below.
    do {
        if (mkdir(empty_subdir.c_str(), 0755) != 0) break;
        if (create_file(path.c_str(), "content") != 0) break;

        ctx = fh_init(0);
        if (!ctx) break;

        hash = fh_hash_directory(ctx, base_dir);
        if (!hash || strlen(hash) != 64) break;

        rc = 0;
    } while (false);

    // Shared cleanup. The hash string is freed even when its length was
    // wrong (the earlier per-branch cleanup leaked it in that case).
    if (hash) fh_free_string(hash);
    if (ctx) fh_cleanup(ctx);

    // unlink()/rmdir() on entries that were never created simply fail, so
    // the same teardown is safe regardless of how far setup progressed.
    unlink(path.c_str());
    rmdir(empty_subdir.c_str());
    rmdir(base_dir);
    return rc;
}
// Test driver: runs each test case in order and stops at the first failure.
// Exit status is 0 when every test returns 0, and 1 otherwise.
int main() {
    typedef int (*test_case_fn)();
    static const test_case_fn test_cases[] = {
        test_recursive_hashing,
        test_empty_nested_dirs,
    };

    printf("Testing recursive dataset hashing...\n");

    for (test_case_fn run_case : test_cases) {
        const int status = run_case();
        if (status == 0) {
            continue;
        }
        printf("FAILED\n");
        return 1;
    }

    printf("All recursive dataset tests passed.\n");
    return 0;
}