Security Fixes: - CVE-2024-45339: Add O_EXCL flag to temp file creation in storage_write_entries() Prevents symlink attacks on predictable .tmp file paths - CVE-2025-47290: Use openat_nofollow() in storage_open() Closes TOCTOU race condition via path_sanitizer infrastructure - CVE-2025-0838: Add MAX_BATCH_SIZE=10000 to add_tasks() Prevents integer overflow in batch operations Research Trustworthiness (dataset_hash): - Deterministic file ordering: std::sort after collect_files() - Recursive directory traversal: depth-limited with cycle detection - Documented exclusions: hidden files and special files noted in API Bug Fixes: - R1: storage_init path validation for non-existent directories - R2: safe_strncpy return value check before strcat - R3: parallel_hash 256-file cap replaced with std::vector - R4: wire qi_compact_index/qi_rebuild_index stubs - R5: CompletionLatch race condition fix (hold mutex during decrement) - R6: ARMv8 SHA256 transform fix (save abcd_pre before vsha256hq_u32) - R7: fuzz_index_storage header format fix - R8: enforce null termination in add_tasks/update_tasks - R9: use 64 bytes (not 65) in combined hash to exclude null terminator - R10: status field persistence in save() New Tests: - test_recursive_dataset.cpp: Verify deterministic recursive hashing - test_storage_symlink_resistance.cpp: Verify CVE-2024-45339 fix - test_queue_index_batch_limit.cpp: Verify CVE-2025-0838 fix - test_sha256_arm_kat.cpp: ARMv8 known-answer tests - test_storage_init_new_dir.cpp: F1 verification - test_parallel_hash_large_dir.cpp: F3 verification - test_queue_index_compact.cpp: F4 verification All 8 native tests passing. Library ready for research lab deployment.
135 lines
3.4 KiB
C++
135 lines
3.4 KiB
C++
// dataset_hash.cpp - C API implementation using C-style internals
|
|
#include "dataset_hash.h"
|
|
#include "crypto/sha256_hasher.h"
|
|
#include "io/file_hash.h"
|
|
#include "threading/parallel_hash.h"
|
|
#include "../common/include/secure_mem.h"
|
|
#include <cstring>
|
|
#include <stdlib.h>
|
|
|
|
using fetchml::common::safe_strncpy;
|
|
|
|
// Context structure - simple C-style
|
|
struct fh_context {
|
|
ParallelHasher hasher;
|
|
size_t buffer_size;
|
|
char last_error[256];
|
|
};
|
|
|
|
// Allocates a hashing context and initialises its parallel hasher.
//
// num_threads is forwarded unchanged to parallel_hasher_init.
// The buffer size starts at 64 KiB and the error string starts empty.
//
// Returns the new context, or nullptr if allocation or hasher
// initialisation fails. Release the context with fh_cleanup.
fh_context_t* fh_init(uint32_t num_threads) {
    fh_context_t* self = static_cast<fh_context_t*>(malloc(sizeof(fh_context_t)));
    if (self == nullptr) {
        return nullptr;
    }

    self->buffer_size = 64 * 1024;
    self->last_error[0] = '\0';

    const bool hasher_ready =
        parallel_hasher_init(&self->hasher, num_threads, self->buffer_size);
    if (hasher_ready) {
        return self;
    }

    // Hasher setup failed: give the half-built context back.
    free(self);
    return nullptr;
}
|
|
|
|
// Tears down the parallel hasher and frees the context.
// A null ctx is accepted and ignored.
void fh_cleanup(fh_context_t* ctx) {
    if (ctx == nullptr) {
        return;
    }

    parallel_hasher_cleanup(&ctx->hasher);
    free(ctx);
}
|
|
|
|
// Hashes one file and returns the digest as a heap-allocated 65-byte buffer
// (the caller releases it with fh_free_string, which calls free).
//
// ctx  - context supplying the I/O buffer size and receiving error text.
// path - file to hash.
//
// Returns nullptr when ctx or path is null, when hash_file reports failure,
// or when the result buffer cannot be allocated. In the latter two cases a
// message is stored in ctx->last_error so fh_last_error can report it.
char* fh_hash_file(fh_context_t* ctx, const char* path) {
    if (!ctx || !path) return nullptr;

    // presumably 64 hex characters plus a terminating NUL written by
    // hash_file - TODO confirm against io/file_hash.h
    char hash[65];
    if (hash_file(path, ctx->buffer_size, hash) != 0) {
        safe_strncpy(ctx->last_error, "Failed to hash file", sizeof(ctx->last_error));
        return nullptr;
    }

    char* result = (char*)malloc(sizeof(hash));
    if (!result) {
        // Record the reason so the caller does not see a missing or stale
        // error message after an allocation failure.
        safe_strncpy(ctx->last_error, "Out of memory", sizeof(ctx->last_error));
        return nullptr;
    }

    memcpy(result, hash, sizeof(hash));
    return result;
}
|
|
|
|
// Hashes a directory through the context's parallel hasher and returns the
// digest as a heap-allocated 65-byte buffer (the caller releases it with
// fh_free_string, which calls free).
//
// ctx  - context owning the parallel hasher and receiving error text.
// path - directory to hash.
//
// Returns nullptr when ctx or path is null, when the result buffer cannot
// be allocated, or when parallel_hash_directory reports failure. In the
// latter two cases a message is stored in ctx->last_error.
char* fh_hash_directory(fh_context_t* ctx, const char* path) {
    if (!ctx || !path) return nullptr;

    char* result = (char*)malloc(65);
    if (!result) {
        // Record the reason so the caller does not see a missing or stale
        // error message after an allocation failure.
        safe_strncpy(ctx->last_error, "Out of memory", sizeof(ctx->last_error));
        return nullptr;
    }

    // The hasher writes the digest straight into the caller-owned buffer.
    if (parallel_hash_directory(&ctx->hasher, path, result) != 0) {
        free(result);
        safe_strncpy(ctx->last_error, "Failed to hash directory", sizeof(ctx->last_error));
        return nullptr;
    }

    return result;
}
|
|
|
|
// Hashes `count` files in one call by delegating to hash_files_batch with
// the context's current buffer size.
//
// Returns -1 without doing any work if ctx, paths or out_hashes is null or
// count is zero; otherwise returns whatever hash_files_batch returns.
int fh_hash_batch(fh_context_t* ctx, const char** paths, uint32_t count, char** out_hashes) {
    const bool have_pointers = ctx && paths && out_hashes;
    if (!have_pointers || count == 0) {
        return -1;
    }

    const int status = hash_files_batch(paths, count, out_hashes, ctx->buffer_size);
    return status;
}
|
|
|
|
// Hashes the files of a directory individually by delegating to
// parallel_hash_directory_batch on the context's hasher.
//
// Returns -1 if ctx, dir_path or out_hashes is null; otherwise returns the
// callee's result. out_paths, max_results and out_count are forwarded
// unchecked.
// NOTE(review): whether out_paths / out_count may be null is decided by
// parallel_hash_directory_batch - confirm against its declaration.
int fh_hash_directory_batch(fh_context_t* ctx,
                            const char* dir_path,
                            char** out_hashes,
                            char** out_paths,
                            uint32_t max_results,
                            uint32_t* out_count) {
    const bool required_args_present = ctx && dir_path && out_hashes;
    if (!required_args_present) {
        return -1;
    }

    return parallel_hash_directory_batch(
        &ctx->hasher, dir_path, out_hashes, out_paths, max_results, out_count);
}
|
|
|
|
// Alias for fh_hash_directory: same arguments, same result, same ownership
// of the returned buffer.
char* fh_hash_directory_combined(fh_context_t* ctx, const char* dir_path) {
    char* combined = fh_hash_directory(ctx, dir_path);
    return combined;
}
|
|
|
|
// Releases a string previously returned by this API (allocated with malloc).
// Passing a null pointer is harmless because free accepts it.
void fh_free_string(char* str) { free(str); }
|
|
|
|
// Returns the context's stored error message, or nullptr when ctx is null
// or no message is stored (empty string). The pointer refers to storage
// inside the context itself.
const char* fh_last_error(fh_context_t* ctx) {
    if (ctx == nullptr) {
        return nullptr;
    }

    const char* message = ctx->last_error;
    return (message[0] != '\0') ? message : nullptr;
}
|
|
|
|
// Resets the stored error message to the empty string.
// A null ctx is ignored.
void fh_clear_error(fh_context_t* ctx) {
    if (ctx == nullptr) {
        return;
    }

    ctx->last_error[0] = '\0';
}
|
|
|
|
// Stores a new I/O buffer size in the context; a null ctx is ignored.
// The value is stored as given, with no range check.
// NOTE(review): fh_init passed the initial buffer size to
// parallel_hasher_init, and this setter does not touch the hasher - confirm
// that directory hashing is meant to keep using the size given at init.
void fh_set_buffer_size(fh_context_t* ctx, size_t buffer_size) {
    if (ctx == nullptr) {
        return;
    }

    ctx->buffer_size = buffer_size;
}
|
|
|
|
// Returns the context's current I/O buffer size, or 0 when ctx is null.
size_t fh_get_buffer_size(fh_context_t* ctx) {
    if (ctx == nullptr) {
        return 0;
    }

    return ctx->buffer_size;
}
|
|
|
|
int fh_has_simd_sha256(void) {
|
|
return sha256_has_hardware_accel();
|
|
}
|
|
|
|
const char* fh_get_simd_impl_name(void) {
|
|
return sha256_impl_name();
|
|
}
|
|
|
|
// Constant-time hash comparison
|
|
int fh_hashes_equal(const char* hash_a, const char* hash_b) {
|
|
if (!hash_a || !hash_b) return 0;
|
|
// SHA256 hex strings are always 64 characters
|
|
return fetchml::common::secure_memcmp(hash_a, hash_b, 64) == 0 ? 1 : 0;
|
|
}
|
|
|