Security Fixes:
- CVE-2024-45339: Add O_EXCL flag to temp file creation in storage_write_entries()
  Prevents symlink attacks on predictable .tmp file paths
- CVE-2025-47290: Use openat_nofollow() in storage_open()
  Closes TOCTOU race condition via path_sanitizer infrastructure
- CVE-2025-0838: Add MAX_BATCH_SIZE=10000 to add_tasks()
  Prevents integer overflow in batch operations

Research Trustworthiness (dataset_hash):
- Deterministic file ordering: std::sort after collect_files()
- Recursive directory traversal: depth-limited with cycle detection
- Documented exclusions: hidden files and special files noted in API

Bug Fixes:
- R1: storage_init path validation for non-existent directories
- R2: safe_strncpy return value check before strcat
- R3: parallel_hash 256-file cap replaced with std::vector
- R4: wire qi_compact_index/qi_rebuild_index stubs
- R5: CompletionLatch race condition fix (hold mutex during decrement)
- R6: ARMv8 SHA256 transform fix (save abcd_pre before vsha256hq_u32)
- R7: fuzz_index_storage header format fix
- R8: enforce null termination in add_tasks/update_tasks
- R9: use 64 bytes (not 65) in combined hash to exclude null terminator
- R10: status field persistence in save()

New Tests:
- test_recursive_dataset.cpp: Verify deterministic recursive hashing
- test_storage_symlink_resistance.cpp: Verify CVE-2024-45339 fix
- test_queue_index_batch_limit.cpp: Verify CVE-2025-0838 fix
- test_sha256_arm_kat.cpp: ARMv8 known-answer tests
- test_storage_init_new_dir.cpp: R1 verification
- test_parallel_hash_large_dir.cpp: R3 verification
- test_queue_index_compact.cpp: R4 verification

All 8 native tests passing. Library ready for research lab deployment.
92 lines
3.1 KiB
C
92 lines
3.1 KiB
C
#ifndef DATASET_HASH_H
#define DATASET_HASH_H

#include <stddef.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Opaque handle for hash context.
// Only forward-declared here, so callers can hold it by pointer only.
typedef struct fh_context fh_context_t;

// Initialize hash context with thread pool
// num_threads: 0 = auto-detect (use number of CPU cores, capped at 8)
// Returns: context handle to pass to the other fh_* calls
// NOTE(review): the failure return value is not documented here
// (presumably NULL) - confirm against the implementation.
fh_context_t* fh_init(uint32_t num_threads);

// Cleanup context
// ctx: handle obtained from fh_init
void fh_cleanup(fh_context_t* ctx);

// Hash a single file (mmap + SIMD SHA256)
// path: path of the file to hash
// Returns: hex string (caller frees with fh_free_string)
// NOTE(review): error return is not documented for this call; fh_hash_directory
// returns NULL on error, presumably this does too - confirm.
// Note: For many files, use fh_hash_batch (explicit list of paths) or
// fh_hash_directory_batch (all files of a directory) to amortize CGo overhead
char* fh_hash_file(fh_context_t* ctx, const char* path);

// Hash a directory's contents recursively and deterministically.
//
// The hash is computed over:
// - All regular files (S_ISREG) in the directory tree
// - Recursively traverses subdirectories (max depth 32)
// - Sorted lexicographically by full path for reproducibility
// - Excludes hidden files (names starting with '.')
// - Excludes symlinks, devices, and special files
//
// The combined hash is SHA256(SHA256(file1) + SHA256(file2) + ...)
// where '+' is concatenation and files are processed in lexicographically
// sorted order.
// NOTE(review): the per-file digests are presumably concatenated in their
// 64-character hex form (without null terminators) - confirm.
//
// Returns: hex string (caller frees with fh_free_string), or NULL on error
char* fh_hash_directory(fh_context_t* ctx, const char* path);

// Batch hash multiple files (single CGo call for entire batch)
// paths: array of file paths
// count: number of paths
// out_hashes: pre-allocated array of pointers to caller-owned buffers, each
//             at least 65 chars (64 hex + null terminator); presumably one
//             buffer per entry in paths - confirm
// Returns: 0 on success, -1 on error
int fh_hash_batch(fh_context_t* ctx, const char** paths, uint32_t count, char** out_hashes);

// Hash directory with batch output (get individual file hashes)
// dir_path: directory whose files are hashed
// out_hashes: pre-allocated array of 65-char buffers (64 hex + null terminator)
// out_paths: optional array of path buffers (can be NULL)
//            NOTE(review): the required size of each path buffer is not
//            specified here - confirm before passing non-NULL.
// max_results: size of output arrays
// out_count: actual number of files hashed
//            NOTE(review): behavior when the directory holds more than
//            max_results files is not specified here - confirm.
// Returns: 0 on success, -1 on error
int fh_hash_directory_batch(
    fh_context_t* ctx,
    const char* dir_path,
    char** out_hashes,
    char** out_paths,
    uint32_t max_results,
    uint32_t* out_count
);

// Simple combined hash (single CGo call, single result)
// Best for: quick directory hash verification
// Returns: hex string (caller frees with fh_free_string)
// NOTE(review): how this differs from fh_hash_directory (traversal rules,
// error return) is not stated here - confirm.
char* fh_hash_directory_combined(fh_context_t* ctx, const char* dir_path);

// Free string returned by library
// (the hex strings returned by fh_hash_file, fh_hash_directory and
// fh_hash_directory_combined)
void fh_free_string(char* str);

// Constant-time hash comparison (prevents timing attacks)
// Returns: 1 if hashes are equal, 0 if not equal
// Timing is independent of the content (constant-time)
int fh_hashes_equal(const char* hash_a, const char* hash_b);

// Error handling
// NOTE(review): ownership and lifetime of the string returned by
// fh_last_error are not documented here; presumably owned by ctx and not to
// be freed by the caller - confirm.
const char* fh_last_error(fh_context_t* ctx);
void fh_clear_error(fh_context_t* ctx);

// Configuration
// NOTE(review): units and default of buffer_size are not documented here
// (presumably bytes) - confirm.
void fh_set_buffer_size(fh_context_t* ctx, size_t buffer_size);
size_t fh_get_buffer_size(fh_context_t* ctx);

// SIMD detection
int fh_has_simd_sha256(void); // Returns 1 if SIMD SHA256 available, 0 otherwise
const char* fh_get_simd_impl_name(void); // Returns "SHA-NI", "ARMv8", or "generic"

#ifdef __cplusplus
}
#endif

#endif // DATASET_HASH_H