fetch_ml/native_rust/dataset_hash/tests/integration.rs
Jeremie Fraeys 6949287fb3
feat(native_rust): implement BLAKE3 dataset_hash and priority queue_index
Implements two production-ready Rust native libraries:

## dataset_hash (BLAKE3-based hashing)
- FFI exports: ds_hash_file, ds_hash_directory_batch, ds_hash_directory_combined
- BLAKE3 hashing for files and directory trees
- Hidden file filtering (respects .hidden and _prefix files)
- Prometheus-compatible metrics export
- Comprehensive integration tests (12 tests)
- Benchmarks: hash_file_1kb (~14µs), hash_file_1mb (~610µs), dir_100files (~1.6ms)

## queue_index (priority queue)
- FFI exports: 25+ functions matching C++ API
  - Lifecycle: qi_open, qi_close
  - Task ops: add_tasks, update_tasks, remove_tasks, get_task_by_id
  - Queue ops: get_next_batch, peek_next, mark_completed
  - Priority: get_next_priority_task, peek_priority_task
  - Query: get_all_tasks, get_tasks_by_status, get_task_count
  - Retry/DLQ: retry_task, move_to_dlq
  - Lease: renew_lease, release_lease
  - Maintenance: rebuild_index, compact_index
- BinaryHeap-based priority queue with correct Ord (max-heap)
- Memory-mapped storage with safe Rust wrappers
- Panic-safe FFI boundaries using catch_unwind
- Comprehensive integration tests (7 tests, 1 ignored for persistence)
- Benchmarks: add_100 (~60µs), get_10 (~24ns), priority (~5µs)

## Architecture
- Cargo workspace with shared common crate
- Criterion benchmarks for both crates
- Rust 1.85.0 toolchain pinned
- Zero compiler warnings
- All 19 tests passing

Compare: make compare-benchmarks (Rust/Go/C++ comparison)
2026-03-23 12:52:13 -04:00

177 lines
5.2 KiB
Rust

//! Integration tests for dataset_hash
//!
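//! Covers single-file hashing (various sizes and edge cases), file collection
//! (hidden entries, symlinks, ordering), and directory hashing (batch and combined).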
use std::fs;
use tempfile::TempDir;
// Import the crate
use dataset_hash::{collect_files, hash_directory_batch, hash_directory_combined, hash_file};
/// Hashing the same file twice must give the same 64-character digest.
#[test]
fn test_hash_file_basic() {
    let workdir = TempDir::new().unwrap();
    let target = workdir.path().join("test.txt");
    fs::write(&target, "hello world").unwrap();

    // Two independent passes over identical content.
    let first = hash_file(&target).unwrap();
    let second = hash_file(&target).unwrap();

    // Same input, same output.
    assert_eq!(first, second);
    // The digest is expected to be a 64-character hex string (BLAKE3).
    assert_eq!(first.len(), 64);
}
/// A zero-byte file must still hash successfully to a 64-character digest.
#[test]
fn test_hash_file_empty() {
    let workdir = TempDir::new().unwrap();
    let target = workdir.path().join("empty.txt");

    // Writing an empty string creates the file with no content.
    fs::write(&target, "").unwrap();

    let digest = hash_file(&target).unwrap();
    assert_eq!(digest.len(), 64);
}
/// A 10 MiB file of zero bytes must hash successfully to a 64-character digest.
#[test]
fn test_hash_file_large() {
    // 10 MiB payload size.
    const PAYLOAD_LEN: usize = 10 * 1024 * 1024;

    let workdir = TempDir::new().unwrap();
    let target = workdir.path().join("large.bin");

    let payload = vec![0u8; PAYLOAD_LEN];
    fs::write(&target, &payload).unwrap();

    let digest = hash_file(&target).unwrap();
    assert_eq!(digest.len(), 64);
}
/// Two files with different contents must not share a digest.
#[test]
fn test_hash_file_different_content() {
    let workdir = TempDir::new().unwrap();

    let path_a = workdir.path().join("file1.txt");
    let path_b = workdir.path().join("file2.txt");
    fs::write(&path_a, "content A").unwrap();
    fs::write(&path_b, "content B").unwrap();

    let digest_a = hash_file(&path_a).unwrap();
    let digest_b = hash_file(&path_b).unwrap();

    assert_ne!(digest_a, digest_b);
}
/// Hidden entries must be skipped by `collect_files`.
///
/// The fixture holds one visible file, one dot-file, and one dot-directory
/// containing a regular file; only the visible file is expected back.
#[test]
fn test_collect_files_excludes_hidden() {
    let temp = TempDir::new().unwrap();
    fs::write(temp.path().join("visible.txt"), "data").unwrap();
    fs::write(temp.path().join(".hidden"), "data").unwrap();

    // A regular file nested inside a hidden directory must be skipped too.
    let hidden_dir = temp.path().join(".hidden_dir");
    fs::create_dir(&hidden_dir).unwrap();
    fs::write(hidden_dir.join("file.txt"), "data").unwrap();

    let files = collect_files(temp.path()).unwrap();

    // Compare the full list of names so a failure prints exactly which entries
    // were returned, instead of only a mismatched count.
    let names: Vec<_> = files.iter().map(|p| p.file_name().unwrap()).collect();
    assert_eq!(names, ["visible.txt"]);
}
/// `collect_files` must return entries ordered by name.
///
/// Files are created out of order (`z`, `a`, `m`) and are expected back as
/// `a.txt`, `m.txt`, `z.txt`.
#[test]
fn test_collect_files_sorted() {
    let temp = TempDir::new().unwrap();
    fs::write(temp.path().join("z.txt"), "z").unwrap();
    fs::write(temp.path().join("a.txt"), "a").unwrap();
    fs::write(temp.path().join("m.txt"), "m").unwrap();

    let files = collect_files(temp.path()).unwrap();

    // One comparison covers both the count and the order, and on failure
    // prints the actual sequence that was returned.
    let names: Vec<_> = files.iter().map(|p| p.file_name().unwrap()).collect();
    assert_eq!(names, ["a.txt", "m.txt", "z.txt"]);
}
/// Batch hashing must return one `(path, hash)` pair per file in the directory.
#[test]
fn test_hash_directory_batch() {
    let workdir = TempDir::new().unwrap();
    let root = workdir.path();
    fs::write(root.join("a.txt"), "AAA").unwrap();
    fs::write(root.join("b.txt"), "BBB").unwrap();

    let entries = hash_directory_batch(root).unwrap();
    assert_eq!(entries.len(), 2);

    // Each entry must name a `.txt` file and carry a 64-character digest.
    for (path, hash) in &entries {
        assert!(path.ends_with(".txt"));
        assert_eq!(hash.len(), 64);
    }
}
/// The combined directory hash must be repeatable and 64 characters long.
#[test]
fn test_hash_directory_combined() {
    let workdir = TempDir::new().unwrap();
    let root = workdir.path();
    fs::write(root.join("a.txt"), "AAA").unwrap();
    fs::write(root.join("b.txt"), "BBB").unwrap();

    // Hash the unchanged directory twice.
    let first = hash_directory_combined(root).unwrap();
    let second = hash_directory_combined(root).unwrap();

    // Unchanged contents must give an unchanged digest.
    assert_eq!(first, second);
    assert_eq!(first.len(), 64);
}
/// Rewriting a file's contents must change the combined directory hash.
#[test]
fn test_hash_directory_combined_changes_with_content() {
    let workdir = TempDir::new().unwrap();
    let root = workdir.path();

    fs::write(root.join("file.txt"), "content").unwrap();
    let before = hash_directory_combined(root).unwrap();

    // Overwrite the same file with different bytes.
    fs::write(root.join("file.txt"), "modified").unwrap();
    let after = hash_directory_combined(root).unwrap();

    assert_ne!(before, after);
}
/// Symbolic links must not be reported by `collect_files`.
///
/// The symlink is only created on Unix targets; on other targets the fixture
/// holds just the regular file and the same expectation applies.
#[test]
fn test_collect_files_no_symlinks() {
    let temp = TempDir::new().unwrap();
    let real_file = temp.path().join("real.txt");
    fs::write(&real_file, "data").unwrap();

    // The link path is built inline under the cfg gate so that non-Unix builds
    // do not carry an unused binding (and the resulting compiler warning).
    #[cfg(unix)]
    std::os::unix::fs::symlink(&real_file, temp.path().join("link.txt")).unwrap();

    let files = collect_files(temp.path()).unwrap();

    // Check which entry survived, not just how many: only the regular file
    // should be present. On failure this prints the names actually returned.
    let names: Vec<_> = files.iter().map(|p| p.file_name().unwrap()).collect();
    assert_eq!(names, ["real.txt"]);
}
/// An empty directory yields no batch entries but still a 64-character combined hash.
#[test]
fn test_hash_directory_empty() {
    let workdir = TempDir::new().unwrap();
    let root = workdir.path();

    // Nothing to hash, so the batch result must be empty.
    let entries = hash_directory_batch(root).unwrap();
    assert!(entries.is_empty());

    // The combined digest is still expected to be well-formed.
    let digest = hash_directory_combined(root).unwrap();
    assert_eq!(digest.len(), 64);
}
/// Batch hashing must include files in subdirectories as well as the root.
#[test]
fn test_hash_directory_nested() {
    let workdir = TempDir::new().unwrap();
    let root = workdir.path();

    let nested_dir = root.join("subdir");
    fs::create_dir(&nested_dir).unwrap();

    // One file at the top level, one file a level down.
    fs::write(root.join("root.txt"), "root").unwrap();
    fs::write(nested_dir.join("nested.txt"), "nested").unwrap();

    let entries = hash_directory_batch(root).unwrap();
    assert_eq!(entries.len(), 2);
}