fetch_ml/native/dataset_hash/threading/parallel_hash.cpp
Jeremie Fraeys 43d241c28d
feat: implement C++ native libraries for performance-critical operations
- Add arena allocator for zero-allocation hot paths
- Add thread pool for parallel operations
- Add mmap utilities for memory-mapped I/O
- Implement queue_index with heap-based priority queue
- Implement dataset_hash with SIMD support (SHA-NI, ARMv8)
- Add runtime SIMD detection for cross-platform correctness
- Add comprehensive tests and benchmarks
2026-02-16 20:38:04 -05:00

133 lines
3.7 KiB
C++

#include "parallel_hash.h"
#include "../io/file_hash.h"
#include "../crypto/sha256_hasher.h"
#include <dirent.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
// Simple file collector - flat directory only (no recursion).
//
// Scans dir_path and counts the entries that are regular files (per stat(),
// so symlinks are followed) and whose name does not start with '.'.
//
//   dir_path   directory to scan; NULL or an unopenable directory yields 0.
//   out_paths  optional. When non-NULL, out_paths[i] must point to a buffer of
//              at least 4096 bytes; the i-th accepted file's "dir_path/name"
//              is written there as a NUL-terminated string. When NULL the
//              function only counts.
//   max_files  upper bound on the number of files accepted.
//
// Returns the number of files accepted (0..max_files). Entries come back in
// whatever order readdir() yields them.
static int collect_files(const char* dir_path, char** out_paths, int max_files) {
    enum { FULL_PATH_CAP = 4096 };  // size of every out_paths[i] slot

    if (!dir_path) return 0;

    DIR* dir = opendir(dir_path);
    if (!dir) return 0;

    int count = 0;
    struct dirent* entry;
    while (count < max_files && (entry = readdir(dir)) != NULL) {
        if (entry->d_name[0] == '.') continue;  // Skip hidden entries, "." and ".."

        char full_path[FULL_PATH_CAP];
        int len = snprintf(full_path, sizeof(full_path), "%s/%s", dir_path, entry->d_name);
        // A truncated path could name a different file than the one readdir()
        // reported, so skip the entry instead of stat()ing the wrong thing.
        if (len < 0 || (size_t)len >= sizeof(full_path)) continue;

        struct stat st;
        if (stat(full_path, &st) != 0 || !S_ISREG(st.st_mode)) continue;

        if (out_paths) {
            // len is known to fit, so copy the string and its terminator.
            memcpy(out_paths[count], full_path, (size_t)len + 1);
        }
        count++;
    }

    closedir(dir);
    return count;
}
// Prepare a ParallelHasher for use.
//
// Stores buffer_size on the hasher, obtains raw storage for a ThreadPool with
// malloc() and constructs the pool in place. A num_threads of 0 selects
// ThreadPool::default_thread_count().
//
// Returns 1 on success, 0 when hasher is NULL or the allocation fails (in the
// latter case hasher->pool is left NULL).
//
// NOTE(review): if the ThreadPool constructor can throw, the storage is not
// released here and the exception leaves this function - confirm.
int parallel_hasher_init(ParallelHasher* hasher, uint32_t num_threads, size_t buffer_size) {
    if (hasher == nullptr) {
        return 0;
    }

    hasher->buffer_size = buffer_size;

    void* storage = malloc(sizeof(ThreadPool));
    hasher->pool = (ThreadPool*)storage;
    if (storage == nullptr) {
        return 0;
    }

    uint32_t worker_count = num_threads;
    if (worker_count == 0) {
        worker_count = ThreadPool::default_thread_count();
    }

    // Placement-new pairs with the explicit destructor call + free() in
    // parallel_hasher_cleanup().
    new (storage) ThreadPool(worker_count);
    return 1;
}
// Tear down the thread pool owned by a ParallelHasher.
//
// Does nothing when hasher is NULL or has no pool. Otherwise the pool's
// destructor is run explicitly, its malloc()'d storage is freed, and
// hasher->pool is reset to NULL so a repeated call is a no-op.
void parallel_hasher_cleanup(ParallelHasher* hasher) {
    if (hasher == nullptr) {
        return;
    }

    ThreadPool* pool = hasher->pool;
    if (pool == nullptr) {
        return;
    }

    pool->~ThreadPool();  // storage came from malloc(), so destroy manually
    free(pool);
    hasher->pool = nullptr;
}
// Limits used by parallel_hash_directory(). PH_PATH_CAP must stay 4096:
// collect_files() writes up to that many bytes into each path slot.
enum { PH_MAX_FILES = 256, PH_PATH_CAP = 4096, PH_DIGEST_LEN = 32, PH_HEX_LEN = 64 };

// Write a PH_DIGEST_LEN-byte digest as lowercase hex plus a trailing NUL
// (PH_HEX_LEN + 1 bytes) into out_hex.
static void ph_digest_to_hex(const uint8_t* digest, char* out_hex) {
    static const char hex[] = "0123456789abcdef";
    for (int i = 0; i < PH_DIGEST_LEN; i++) {
        out_hex[i * 2] = hex[(digest[i] >> 4) & 0xf];
        out_hex[i * 2 + 1] = hex[digest[i] & 0xf];
    }
    out_hex[PH_HEX_LEN] = '\0';
}

// qsort() comparator over fixed-size path slots: byte-wise string order.
static int ph_compare_paths(const void* a, const void* b) {
    return strcmp((const char*)a, (const char*)b);
}

// Compute one combined SHA-256 over the regular, non-hidden files directly
// inside `path` (flat, no recursion).
//
// Each file is hashed with hash_file(); the per-file hex strings are then fed,
// in sorted path order, into a fresh SHA-256 whose digest is written to
// out_hash as 64 lowercase hex characters plus a NUL (65 bytes). A directory
// with no eligible files produces the SHA-256 of the empty input.
//
// Paths are sorted before hashing because readdir() order is unspecified;
// without the sort the same directory contents could yield different digests.
//
// NOTE(review): at most PH_MAX_FILES files are considered; additional entries
// are silently ignored, as before.
// NOTE(review): hash_file() is assumed to write a NUL-terminated hex string of
// at most 64 characters into its third argument - confirm against file_hash.h.
//
// Returns 0 on success, -1 on a NULL argument, allocation failure, or when any
// file fails to hash (out_hash is left untouched in those cases).
int parallel_hash_directory(ParallelHasher* hasher, const char* path, char* out_hash) {
    if (!hasher || !path || !out_hash) return -1;

    // ~1 MiB of path storage: keep it off the stack, which may be small on
    // non-main threads.
    char (*paths)[PH_PATH_CAP] =
        (char (*)[PH_PATH_CAP])malloc((size_t)PH_MAX_FILES * PH_PATH_CAP);
    if (!paths) return -1;

    // Collect files
    char* path_ptrs[PH_MAX_FILES];
    for (int i = 0; i < PH_MAX_FILES; i++) path_ptrs[i] = paths[i];
    int count = collect_files(path, path_ptrs, PH_MAX_FILES);

    // Make the combined hash independent of directory enumeration order.
    qsort(paths, (size_t)count, PH_PATH_CAP, ph_compare_paths);

    // Hash all files
    char hashes[PH_MAX_FILES][PH_HEX_LEN + 1];
    int rc = 0;
    for (int i = 0; i < count; i++) {
        if (hash_file(paths[i], hasher->buffer_size, hashes[i]) != 0) {
            rc = -1;
            break;
        }
    }
    free(paths);
    if (rc != 0) return -1;

    // Combine hashes (zero updates when the directory is empty)
    Sha256State st;
    sha256_init(&st);
    for (int i = 0; i < count; i++) {
        sha256_update(&st, (uint8_t*)hashes[i], strlen(hashes[i]));
    }
    uint8_t result[PH_DIGEST_LEN];
    sha256_finalize(&st, result);

    ph_digest_to_hex(result, out_hash);
    return 0;
}
// Hash every regular, non-hidden file directly inside `path` and report the
// results individually.
//
//   hasher       supplies the read buffer size passed to hash_file().
//   path         directory to scan (flat, no recursion).
//   out_hashes   required; out_hashes[i] receives the hash string of the i-th
//                file, or an empty string if hashing that file failed.
//                NOTE(review): each slot is presumably >= 65 bytes - confirm
//                against hash_file()'s contract.
//   out_paths    optional; when non-NULL, out_paths[i] must point to a buffer
//                of at least 4096 bytes and receives the i-th file's path.
//                When NULL, paths are kept in internal scratch storage so the
//                files can still be hashed.
//   max_results  capacity of out_hashes (and out_paths when provided).
//   out_count    optional; receives the number of files processed.
//
// Returns 0 on success (individual hash failures are reported via empty
// strings, not via the return value), -1 on a NULL required argument or when
// scratch storage cannot be allocated. *out_count is not written on -1.
int parallel_hash_directory_batch(
    ParallelHasher* hasher,
    const char* path,
    char** out_hashes,
    char** out_paths,
    uint32_t max_results,
    uint32_t* out_count) {
    enum { BATCH_PATH_CAP = 4096 };  // slot size collect_files() writes into

    if (!hasher || !path || !out_hashes) return -1;

    // collect_files() takes an int; clamp rather than let a huge max_results
    // wrap to a negative limit (which would report zero files).
    int limit = (max_results > (uint32_t)INT_MAX) ? INT_MAX : (int)max_results;

    char** file_paths = out_paths;
    char* scratch = nullptr;
    char** scratch_slots = nullptr;
    if (!file_paths) {
        // The caller does not want the paths, but hash_file() still needs
        // them: count first, then allocate exactly that many private slots.
        limit = collect_files(path, nullptr, limit);
        if (limit > 0) {
            if ((size_t)limit > (size_t)-1 / BATCH_PATH_CAP) return -1;
            scratch = (char*)malloc((size_t)limit * BATCH_PATH_CAP);
            scratch_slots = (char**)malloc((size_t)limit * sizeof(char*));
            if (!scratch || !scratch_slots) {
                free(scratch);
                free(scratch_slots);
                return -1;
            }
            for (int i = 0; i < limit; i++) {
                scratch_slots[i] = scratch + (size_t)i * BATCH_PATH_CAP;
            }
            file_paths = scratch_slots;
        }
    }

    // Collect files (never more than `limit`, so the slots cannot overflow
    // even if the directory grew since the counting pass).
    int count = (limit > 0) ? collect_files(path, file_paths, limit) : 0;
    if (out_count) *out_count = (uint32_t)count;

    // Hash each file; a failure blanks that entry but does not abort the batch.
    for (int i = 0; i < count; i++) {
        if (hash_file(file_paths[i], hasher->buffer_size, out_hashes[i]) != 0) {
            out_hashes[i][0] = '\0';
        }
    }

    free(scratch_slots);
    free(scratch);
    return 0;
}