fetch_ml/native/artifact_scanner/artifact_scanner.cpp
Jeremie Fraeys d408a60eb1
Some checks failed
Documentation / build-and-publish (push) Waiting to run
Test / test (push) Waiting to run
Checkout test / test (push) Successful in 5s
CI with Native Libraries / test-native (push) Has been cancelled
CI with Native Libraries / build-release (push) Has been cancelled
ci: push all workflow updates
2026-02-12 13:28:15 -05:00

163 lines
5.1 KiB
C++

#include "artifact_scanner.h"

#include <dirent.h>
#include <fcntl.h>
#include <fnmatch.h>
#include <sys/stat.h>
#include <unistd.h>

#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <exception>
#include <filesystem>
#include <memory>
#include <string>
#include <vector>
namespace fs = std::filesystem;
// Mutable state carried by one scanner instance.
struct as_scanner {
    // Glob patterns; a relative path matching any of them is left out of scan results.
    std::vector<std::string> exclude_patterns{};
    // Message of the most recent exception caught during a scan; empty if none was recorded.
    std::string last_error{};
    // Running total of directory entries visited by scans on this instance.
    uint64_t scan_count{0};
};
// Returns true when `path` matches at least one of the scanner's exclude patterns.
//
// Patterns are shell globs evaluated with FNM_PATHNAME, so `*` never matches a
// '/'. On its own that makes a pattern such as "code/*" reject "code/a.py" but
// let "code/pkg/a.py" through. To exclude the whole subtree, a pattern ending
// in "/*" is also tried against every ancestor directory of `path`.
//
// `scanner` must be non-null; `path` is a NUL-terminated path string.
static bool should_exclude(as_scanner_t* scanner, const char* path) {
    for (const auto& pattern : scanner->exclude_patterns) {
        if (fnmatch(pattern.c_str(), path, FNM_PATHNAME) == 0) {
            return true;
        }

        const auto len = pattern.size();
        const bool subtree_glob = len > 2 && pattern.compare(len - 2, 2, "/*") == 0;
        if (!subtree_glob) {
            continue;
        }

        // Strip one trailing component at a time ("code/pkg/a.py" -> "code/pkg"
        // -> "code") and retry, so anything below a matching directory is excluded.
        std::string ancestor(path);
        for (auto slash = ancestor.rfind('/');
             slash != std::string::npos && slash > 0;
             slash = ancestor.rfind('/')) {
            ancestor.resize(slash);
            if (fnmatch(pattern.c_str(), ancestor.c_str(), FNM_PATHNAME) == 0) {
                return true;
            }
        }
    }
    return false;
}
// Directory traversal strategy:
// the scan relies on std::filesystem::recursive_directory_iterator; no hand-written readdir/stat loop is used
#ifdef __linux__
// On Linux, a getdents64-based walk could perform even better,
// but the portable std::filesystem iterator is sufficient for now
#endif
// Allocates a scanner seeded with the caller's exclude patterns followed by
// the built-in defaults.
//
// exclude_patterns: array of `pattern_count` C strings; may be null, and null
//                   elements are skipped.
// pattern_count:    number of elements in `exclude_patterns`.
//
// Returns a heap-allocated scanner to be released with as_destroy(), or
// nullptr if construction fails (e.g. out of memory).
as_scanner_t* as_create(const char** exclude_patterns, size_t pattern_count) {
    try {
        // unique_ptr keeps the scanner from leaking if a push_back throws.
        auto scanner = std::make_unique<as_scanner_t>();

        // A null array with a non-zero count would otherwise be dereferenced.
        if (exclude_patterns) {
            for (size_t i = 0; i < pattern_count; ++i) {
                if (exclude_patterns[i]) {
                    scanner->exclude_patterns.push_back(exclude_patterns[i]);
                }
            }
        }

        // Default excludes
        scanner->exclude_patterns.push_back("run_manifest.json");
        scanner->exclude_patterns.push_back("output.log");
        scanner->exclude_patterns.push_back("code/*");
        scanner->exclude_patterns.push_back("snapshot/*");

        return scanner.release();
    } catch (const std::exception&) {
        // Report failure as nullptr rather than letting the exception leave this function.
        return nullptr;
    }
}
// Releases a scanner obtained from as_create(). Passing nullptr is a no-op.
void as_destroy(as_scanner_t* scanner) {
    if (scanner == nullptr) {
        return;
    }
    delete scanner;
}
// Appends one more exclude pattern to the scanner.
// Does nothing when either argument is null.
void as_add_exclude(as_scanner_t* scanner, const char* pattern) {
    if (scanner == nullptr || pattern == nullptr) {
        return;
    }
    scanner->exclude_patterns.emplace_back(pattern);
}
// Recursively walks `run_dir` and collects every regular file whose path
// relative to `run_dir` is not rejected by should_exclude().
//
// scanner: scanner whose exclude patterns are applied; its scan_count is
//          incremented once per directory entry visited.
// run_dir: root directory of the scan.
//
// Returns a heap-allocated result whose `artifacts` array is sorted by path
// (strcmp order); release it with as_free_result(). Returns nullptr when
// either argument is null or when any step throws. In the latter case the
// exception message is stored in scanner->last_error.
as_result_t* as_scan_directory(as_scanner_t* scanner, const char* run_dir) {
    if (!scanner || !run_dir) return nullptr;

    // Everything that can throw (allocations included) stays inside this try
    // so no exception leaves the function and nothing leaks on failure.
    try {
        const auto start_time = std::chrono::steady_clock::now();

        auto result = std::make_unique<as_result_t>();
        result->artifacts = nullptr;
        result->count = 0;
        result->total_size = 0;
        result->discovery_time_ms = 0;

        std::vector<as_artifact_t> artifacts;
        artifacts.reserve(128);  // Pre-allocate to avoid early reallocations

        const fs::path root(run_dir);
        // skip_permission_denied: unreadable directories are skipped instead of throwing.
        const auto options = fs::directory_options::skip_permission_denied;

        for (const auto& entry : fs::recursive_directory_iterator(root, options)) {
            scanner->scan_count++;
            if (!entry.is_regular_file()) {
                continue;
            }

            // Exclusion patterns are written against paths relative to the scan root.
            const std::string rel_str = fs::relative(entry.path(), root).string();
            if (should_exclude(scanner, rel_str.c_str())) {
                continue;
            }

            // Value-initialise so the bytes copied out later are fully defined.
            as_artifact_t artifact{};
            // Paths longer than the fixed buffer are truncated, always NUL-terminated.
            std::strncpy(artifact.path, rel_str.c_str(), sizeof(artifact.path) - 1);
            artifact.path[sizeof(artifact.path) - 1] = '\0';

            artifact.size_bytes = entry.file_size();

            // Convert file_time_type to a Unix timestamp (approximate): C++17 has
            // no direct clock conversion, so rebase via the two clocks' "now".
            const auto mtime = entry.last_write_time();
            const auto sctp = std::chrono::time_point_cast<std::chrono::system_clock::duration>(
                mtime - fs::file_time_type::clock::now() + std::chrono::system_clock::now());
            artifact.mtime = std::chrono::system_clock::to_time_t(sctp);

            artifact.mode = static_cast<uint32_t>(entry.status().permissions());

            artifacts.push_back(artifact);
            result->total_size += artifact.size_bytes;
        }

        // Sort artifacts by path for deterministic order
        std::sort(artifacts.begin(), artifacts.end(),
                  [](const as_artifact_t& a, const as_artifact_t& b) {
                      return std::strcmp(a.path, b.path) < 0;
                  });

        // Copy into a new[] array, which as_free_result() releases with delete[].
        result->count = artifacts.size();
        if (!artifacts.empty()) {
            std::unique_ptr<as_artifact_t[]> out(new as_artifact_t[artifacts.size()]);
            std::copy(artifacts.begin(), artifacts.end(), out.get());
            result->artifacts = out.release();
        }

        const auto end_time = std::chrono::steady_clock::now();
        result->discovery_time_ms =
            std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time).count();

        return result.release();
    } catch (const std::exception& e) {
        scanner->last_error = e.what();
        return nullptr;
    }
}
// Frees a scan result together with its artifact array.
// Passing nullptr is a no-op.
void as_free_result(as_result_t* result) {
    if (result == nullptr) {
        return;
    }
    delete[] result->artifacts;
    delete result;
}
// Returns the scanner's stored error message, or nullptr when the scanner is
// null or no message is stored. The pointer refers to storage inside the scanner.
const char* as_last_error(as_scanner_t* scanner) {
    if (scanner == nullptr) {
        return nullptr;
    }
    const std::string& message = scanner->last_error;
    return message.empty() ? nullptr : message.c_str();
}
// Returns the scanner's entry counter, or 0 for a null scanner.
uint64_t as_get_scan_count(as_scanner_t* scanner) {
    if (scanner == nullptr) {
        return 0;
    }
    return scanner->scan_count;
}