fetch_ml/native/dataset_hash/dataset_hash.cpp
Jeremie Fraeys 43d241c28d
feat: implement C++ native libraries for performance-critical operations
- Add arena allocator for zero-allocation hot paths
- Add thread pool for parallel operations
- Add mmap utilities for memory-mapped I/O
- Implement queue_index with heap-based priority queue
- Implement dataset_hash with SIMD support (SHA-NI, ARMv8)
- Add runtime SIMD detection for cross-platform correctness
- Add comprehensive tests and benchmarks
2026-02-16 20:38:04 -05:00

127 lines
3.2 KiB
C++

// dataset_hash.cpp - C API implementation using C-style internals
#include "dataset_hash.h"
#include "crypto/sha256_hasher.h"
#include "io/file_hash.h"
#include "threading/parallel_hash.h"
#include <cstring>
#include <stdlib.h>
// Context structure - simple C-style.
// Holds the per-handle state used by the fh_* functions in this file.
struct fh_context {
// Hasher state handed to the parallel_hasher_* / parallel_hash_* routines.
ParallelHasher hasher;
// Buffer size passed to the file-hashing helpers (presumably bytes; fh_init sets 64 * 1024).
size_t buffer_size;
// NUL-terminated message for the most recent recorded failure; an empty
// string (first byte '\0') means no error is currently recorded.
char last_error[256];
};
// Create a hashing context.
//
// Allocates the context with malloc, sets the buffer size to 64 * 1024 and
// clears the error message, then initialises the embedded hasher with
// `num_threads` and that buffer size. `num_threads` is forwarded unchanged to
// parallel_hasher_init; how special values such as 0 are treated is decided there.
//
// Returns the new context, or nullptr when the allocation or the hasher
// initialisation fails. Release a returned context with fh_cleanup().
fh_context_t* fh_init(uint32_t num_threads) {
    fh_context_t* context = (fh_context_t*)malloc(sizeof(fh_context_t));
    if (context == nullptr) {
        return nullptr;
    }

    context->last_error[0] = '\0';
    context->buffer_size = 64 * 1024;

    if (parallel_hasher_init(&context->hasher, num_threads, context->buffer_size)) {
        return context;
    }

    // Hasher setup failed: nothing else owns the context yet, so just free it.
    free(context);
    return nullptr;
}
// Destroy a context created by fh_init().
//
// Tears down the embedded hasher and then frees the context memory.
// Passing nullptr is a no-op.
void fh_cleanup(fh_context_t* ctx) {
    if (ctx == nullptr) {
        return;
    }
    parallel_hasher_cleanup(&ctx->hasher);
    free(ctx);
}
// Hash a single file and return the digest as a newly allocated C string.
//
// The digest is written by hash_file() into a 65-byte heap buffer (presumably
// 64 hex characters plus a terminating NUL; the exact format is defined by
// hash_file). The context's current buffer size is passed to hash_file().
//
// Parameters:
//   ctx  - context from fh_init(); must not be null.
//   path - path of the file to hash; must not be null.
//
// Returns the malloc-allocated digest string, which the caller releases with
// fh_free_string(), or nullptr on failure. When ctx is valid and the failure
// is an allocation or hashing failure, a message is stored in the context and
// can be read with fh_last_error(). Null arguments return nullptr without
// touching the stored error.
char* fh_hash_file(fh_context_t* ctx, const char* path) {
    if (!ctx || !path) return nullptr;

    // Allocate the result before doing the (potentially expensive) hashing so
    // an out-of-memory condition is detected early and reported, and so the
    // digest is written straight into the buffer that is returned.
    char* result = (char*)malloc(65);
    if (!result) {
        strncpy(ctx->last_error, "Failed to allocate result buffer", sizeof(ctx->last_error) - 1);
        ctx->last_error[sizeof(ctx->last_error) - 1] = '\0';
        return nullptr;
    }

    if (hash_file(path, ctx->buffer_size, result) != 0) {
        free(result);
        strncpy(ctx->last_error, "Failed to hash file", sizeof(ctx->last_error) - 1);
        // strncpy does not terminate on truncation; terminate explicitly.
        ctx->last_error[sizeof(ctx->last_error) - 1] = '\0';
        return nullptr;
    }
    return result;
}
// Hash a directory and return the digest as a newly allocated C string.
//
// The digest is produced by parallel_hash_directory() using the context's
// hasher and written into a 65-byte heap buffer (presumably 64 hex characters
// plus a terminating NUL; the exact format is defined by that routine).
//
// Parameters:
//   ctx  - context from fh_init(); must not be null.
//   path - directory to hash; must not be null.
//
// Returns the malloc-allocated digest string, which the caller releases with
// fh_free_string(), or nullptr on failure. When ctx is valid and the failure
// is an allocation or hashing failure, a message is stored in the context and
// can be read with fh_last_error(). Null arguments return nullptr without
// touching the stored error.
char* fh_hash_directory(fh_context_t* ctx, const char* path) {
    if (!ctx || !path) return nullptr;

    char* result = (char*)malloc(65);
    if (!result) {
        // Record the reason so a nullptr return is never silent for a valid context.
        strncpy(ctx->last_error, "Failed to allocate result buffer", sizeof(ctx->last_error) - 1);
        ctx->last_error[sizeof(ctx->last_error) - 1] = '\0';
        return nullptr;
    }

    if (parallel_hash_directory(&ctx->hasher, path, result) != 0) {
        free(result);
        strncpy(ctx->last_error, "Failed to hash directory", sizeof(ctx->last_error) - 1);
        // strncpy does not terminate on truncation; terminate explicitly.
        ctx->last_error[sizeof(ctx->last_error) - 1] = '\0';
        return nullptr;
    }
    return result;
}
// Hash `count` files in one call.
//
// Validates the arguments and forwards to hash_files_batch() with the
// context's current buffer size. Returns -1 when ctx, paths or out_hashes is
// null or when count is 0; otherwise returns whatever hash_files_batch()
// returns. This function does not record a message in the context.
//
// NOTE(review): the required size/ownership of each out_hashes[i] entry is
// defined by hash_files_batch() and is not visible here - confirm with callers.
int fh_hash_batch(fh_context_t* ctx, const char** paths, uint32_t count, char** out_hashes) {
    const bool args_ok = ctx && paths && out_hashes && count != 0;
    if (!args_ok) {
        return -1;
    }
    return hash_files_batch(paths, count, out_hashes, ctx->buffer_size);
}
// Hash the files of a directory, reporting per-file results.
//
// Validates ctx, dir_path and out_hashes, then forwards every argument to
// parallel_hash_directory_batch() together with the context's hasher.
// Returns -1 when ctx, dir_path or out_hashes is null; otherwise returns
// whatever parallel_hash_directory_batch() returns.
//
// NOTE(review): out_paths and out_count are passed through without a null
// check here; whether they are optional is decided by
// parallel_hash_directory_batch() - confirm before passing null.
int fh_hash_directory_batch(
    fh_context_t* ctx,
    const char* dir_path,
    char** out_hashes,
    char** out_paths,
    uint32_t max_results,
    uint32_t* out_count) {
    if (ctx == nullptr || dir_path == nullptr || out_hashes == nullptr) {
        return -1;
    }

    auto* hasher = &ctx->hasher;
    return parallel_hash_directory_batch(
        hasher, dir_path, out_hashes, out_paths, max_results, out_count);
}
// Alias for fh_hash_directory(): same arguments, same return value, same
// ownership of the returned string.
char* fh_hash_directory_combined(fh_context_t* ctx, const char* dir_path) {
    char* combined = fh_hash_directory(ctx, dir_path);
    return combined;
}
// Release a string with free(). The string-returning functions in this file
// allocate their results with malloc, so this is the matching deallocator.
// A null pointer is accepted (free() ignores it).
void fh_free_string(char* str) {
    void* block = str;
    free(block);
}
// Return the message recorded by the most recent failure, if any.
//
// Returns nullptr when ctx is null or when no message is stored (the buffer
// starts with '\0'). Otherwise returns a pointer into the context itself:
// the caller must not free it, and its contents change when the stored
// message is overwritten or cleared.
const char* fh_last_error(fh_context_t* ctx) {
    if (ctx == nullptr) {
        return nullptr;
    }
    if (ctx->last_error[0] == '\0') {
        return nullptr;
    }
    return ctx->last_error;
}
// Forget any stored error message by making the buffer an empty string.
// Passing nullptr is a no-op.
void fh_clear_error(fh_context_t* ctx) {
    if (ctx == nullptr) {
        return;
    }
    ctx->last_error[0] = '\0';
}
// Set the buffer size that later fh_hash_file() / fh_hash_batch() calls pass
// to the file-hashing helpers. Passing a null ctx is a no-op.
//
// NOTE(review): the value is stored as-is, including 0; how the hashing
// helpers treat a zero size is not visible here - confirm.
// NOTE(review): the hasher received the buffer size once, in fh_init(); this
// setter only updates ctx->buffer_size, so whether directory hashing picks up
// the new value depends on the hasher implementation - confirm.
void fh_set_buffer_size(fh_context_t* ctx, size_t buffer_size) {
    if (ctx == nullptr) {
        return;
    }
    ctx->buffer_size = buffer_size;
}
// Return the context's current buffer size, or 0 when ctx is null.
size_t fh_get_buffer_size(fh_context_t* ctx) {
    if (ctx == nullptr) {
        return 0;
    }
    return ctx->buffer_size;
}
// Report the result of sha256_has_hardware_accel() as an int.
// Presumably non-zero means a hardware-accelerated SHA-256 path is available;
// the exact values are defined by that function.
int fh_has_simd_sha256(void) {
    const int accelerated = sha256_has_hardware_accel();
    return accelerated;
}
// Return the name reported by sha256_impl_name() for the SHA-256
// implementation in use. The pointer is passed through unchanged; its
// lifetime is whatever sha256_impl_name() guarantees (presumably a static
// string - confirm before freeing or caching across library unloads).
const char* fh_get_simd_impl_name(void) {
    const char* impl_name = sha256_impl_name();
    return impl_name;
}