fetch_ml/internal/manifest/validator.go
Jeremie Fraeys 260e18499e
feat: Research features - narrative fields and outcome tracking
Add comprehensive research context tracking to jobs:
- Narrative fields: hypothesis, context, intent, expected_outcome
- Experiment groups and tags for organization
- Run comparison (compare command) for diff analysis
- Run search (find command) with criteria filtering
- Run export (export command) for data portability
- Outcome setting (outcome command) for experiment validation

Update queue and requeue commands to support narrative fields.
Add narrative validation to manifest validator.
Add WebSocket handlers for compare, find, export, and outcome operations.

Includes E2E tests for phase 2 features.
2026-02-18 21:27:05 -05:00

301 lines
7.8 KiB
Go

package manifest
import (
"errors"
"fmt"
"strings"
"github.com/jfraeys/fetch_ml/internal/privacy"
)
// ErrIncompleteManifest is the sentinel error wrapped by Validate and
// ValidateStrict when the manifest is nil or fails a required-field check.
// Callers can match it with errors.Is or with IsValidationError.
var ErrIncompleteManifest = errors.New("incomplete manifest")
// Validator validates that a RunManifest is complete before execution.
type Validator struct {
	// requiredFields holds the manifest field names that Validate checks,
	// in the order they are checked.
	requiredFields []string
}

// NewValidator creates a new manifest validator with the default required
// fields: commit_id and deps_manifest_sha256.
func NewValidator() *Validator {
	return &Validator{
		requiredFields: []string{
			"commit_id",
			"deps_manifest_sha256",
		},
	}
}

// NewValidatorWithFields creates a validator with custom required fields.
//
// The slice is copied, so the validator does not share a backing array with
// the caller: mutating fields after this call cannot change which fields are
// validated. A nil or empty slice produces a validator with no required
// fields.
func NewValidatorWithFields(fields []string) *Validator {
	return &Validator{
		// Appending to a nil slice always allocates fresh storage.
		requiredFields: append([]string(nil), fields...),
	}
}
// ValidationError describes one failed check: the manifest field involved
// and a human-readable explanation.
type ValidationError struct {
	// Field is the manifest field name the failure refers to.
	Field string `json:"field"`
	// Message explains why the field failed validation.
	Message string `json:"message"`
}

// Error implements the error interface, rendering the field name in single
// quotes followed by the message.
func (e ValidationError) Error() string {
	const layout = "validation error for field '%s': %s"
	return fmt.Sprintf(layout, e.Field, e.Message)
}
// Validate runs every configured required-field check against m.
//
// It returns nil when all checks pass. Otherwise it returns a single error
// that lists every failed field (one per line) and wraps
// ErrIncompleteManifest. A nil manifest is reported immediately, also
// wrapping ErrIncompleteManifest.
func (v *Validator) Validate(m *RunManifest) error {
	if m == nil {
		return fmt.Errorf("manifest is nil: %w", ErrIncompleteManifest)
	}

	// Collect all failures rather than stopping at the first one, so the
	// caller sees the complete list in a single error.
	var failures []ValidationError
	for i := range v.requiredFields {
		failure := v.validateField(m, v.requiredFields[i])
		if failure == nil {
			continue
		}
		failures = append(failures, *failure)
	}

	if len(failures) == 0 {
		return nil
	}

	report := "manifest validation failed:\n"
	for _, failure := range failures {
		report += fmt.Sprintf(" - %s\n", failure.Error())
	}
	return fmt.Errorf("%s: %w", report, ErrIncompleteManifest)
}
// ValidateStrict runs Validate and, if that passes, additionally requires
// the worker_version and podman_image manifest fields to be non-empty.
//
// An error from Validate is returned unchanged. Failures of the extra checks
// are listed together in one error that wraps ErrIncompleteManifest.
func (v *Validator) ValidateStrict(m *RunManifest) error {
	if err := v.Validate(m); err != nil {
		return err
	}

	// Validate has already rejected a nil manifest, so m is safe to read.
	const reason = "required for strict provenance"
	checks := []struct {
		name   string
		absent bool
	}{
		{name: "worker_version", absent: m.WorkerVersion == ""},
		{name: "podman_image", absent: m.PodmanImage == ""},
	}

	var missing []ValidationError
	for _, c := range checks {
		if c.absent {
			missing = append(missing, ValidationError{Field: c.name, Message: reason})
		}
	}

	if len(missing) == 0 {
		return nil
	}

	report := "strict manifest validation failed:\n"
	for _, failure := range missing {
		report += fmt.Sprintf(" - %s\n", failure.Error())
	}
	return fmt.Errorf("%s: %w", report, ErrIncompleteManifest)
}
// validateField checks a single required field of m and returns a
// *ValidationError describing the problem, or nil when the field passes.
//
// Recognised names are commit_id, deps_manifest_sha256, run_id, task_id,
// job_name and snapshot_sha256. The snapshot_sha256 check is conditional: the
// hash is only required when m.SnapshotID is set.
//
// Any other name produces a ValidationError instead of silently passing, so
// a misspelled or unsupported entry given to NewValidatorWithFields cannot
// turn a required check into a no-op.
func (v *Validator) validateField(m *RunManifest, field string) *ValidationError {
	switch field {
	case "commit_id":
		if m.CommitID == "" {
			return &ValidationError{
				Field:   field,
				Message: "commit_id is required for code provenance",
			}
		}
	case "deps_manifest_sha256":
		if m.DepsManifestSHA == "" {
			return &ValidationError{
				Field:   field,
				Message: "deps_manifest_sha256 is required for dependency provenance",
			}
		}
	case "run_id":
		if m.RunID == "" {
			return &ValidationError{
				Field:   field,
				Message: "run_id is required",
			}
		}
	case "task_id":
		if m.TaskID == "" {
			return &ValidationError{
				Field:   field,
				Message: "task_id is required",
			}
		}
	case "job_name":
		if m.JobName == "" {
			return &ValidationError{
				Field:   field,
				Message: "job_name is required",
			}
		}
	case "snapshot_sha256":
		// Only enforced when a snapshot is actually referenced.
		if m.SnapshotID != "" && m.SnapshotSHA256 == "" {
			return &ValidationError{
				Field:   field,
				Message: "snapshot_sha256 is required when snapshot_id is provided",
			}
		}
	default:
		// Fail closed: a required field with no rule cannot be verified.
		return &ValidationError{
			Field:   field,
			Message: "unknown required field: no validation rule is defined",
		}
	}
	return nil
}
// IsValidationError reports whether err is, or wraps, ErrIncompleteManifest,
// which is the sentinel attached to errors returned by Validate and
// ValidateStrict.
func IsValidationError(err error) bool {
	isIncomplete := errors.Is(err, ErrIncompleteManifest)
	return isIncomplete
}
// NarrativeValidation is the result produced by ValidateNarrative.
// All three fields are omitted from JSON output when empty.
type NarrativeValidation struct {
	// Warnings holds non-fatal observations: soft length or count thresholds,
	// tags with special characters, and potential PII hits.
	Warnings []string `json:"warnings,omitempty"`
	// Errors holds hard limit violations.
	Errors []string `json:"errors,omitempty"`
	// PIIFindings collects the findings returned by privacy.DetectPII for the
	// scanned text fields.
	PIIFindings []privacy.PIIFinding `json:"pii_findings,omitempty"`
}
// OutcomeValidation is the result produced by ValidateOutcome.
// Both fields are omitted from JSON output when empty.
type OutcomeValidation struct {
	// Warnings holds non-fatal observations: a long summary, many follow-up
	// runs, and potential PII hits.
	Warnings []string `json:"warnings,omitempty"`
	// Errors holds hard violations: an invalid status or an exceeded limit.
	Errors []string `json:"errors,omitempty"`
}
// ValidOutcomeStatuses lists the status values accepted by
// isValidOutcomeStatus. The empty string is included, so an unset status
// counts as valid.
var ValidOutcomeStatuses = []string{
	"validated",
	"invalidated",
	"inconclusive",
	"partial",
	"",
}

// isValidOutcomeStatus reports whether status exactly matches one of the
// entries in ValidOutcomeStatuses. The comparison is case-sensitive.
func isValidOutcomeStatus(status string) bool {
	for i := 0; i < len(ValidOutcomeStatuses); i++ {
		if ValidOutcomeStatuses[i] == status {
			return true
		}
	}
	return false
}
// ValidateNarrative checks the size limits of a Narrative and scans its text
// fields for potential PII.
//
// Hard limit violations are reported in Errors. Softer thresholds, tags with
// special characters and PII hits are reported in Warnings, and the findings
// returned by privacy.DetectPII are collected in PIIFindings. A nil Narrative
// yields an empty result.
//
// Text limits are measured in characters (runes), matching the wording of
// the messages; byte length would over-count non-ASCII text. Fields are
// scanned for PII in a fixed order (hypothesis, context, intent), so the
// order of Warnings and PIIFindings is deterministic.
func ValidateNarrative(n *Narrative) NarrativeValidation {
	result := NarrativeValidation{
		Warnings: make([]string, 0),
		Errors:   make([]string, 0),
	}
	if n == nil {
		return result
	}

	// Validate hypothesis length. len([]rune(s)) counts characters, whereas
	// len(s) counts bytes.
	hypothesisLen := len([]rune(n.Hypothesis))
	if hypothesisLen > 5000 {
		result.Errors = append(result.Errors, "hypothesis exceeds 5000 characters")
	} else if hypothesisLen > 1000 {
		result.Warnings = append(result.Warnings, "hypothesis is very long (>1000 chars)")
	}

	// Validate context length
	if len([]rune(n.Context)) > 10000 {
		result.Errors = append(result.Errors, "context exceeds 10000 characters")
	}

	// Validate tags count
	if len(n.Tags) > 50 {
		result.Errors = append(result.Errors, "too many tags (max 50)")
	} else if len(n.Tags) > 20 {
		result.Warnings = append(result.Warnings, "many tags (>20)")
	}

	// Validate each tag: length is a hard limit, separator-like characters
	// only produce a warning.
	for i, tag := range n.Tags {
		if len([]rune(tag)) > 50 {
			result.Errors = append(result.Errors, fmt.Sprintf("tag %d exceeds 50 characters", i))
		}
		if strings.ContainsAny(tag, ",;|/\\") {
			result.Warnings = append(result.Warnings, fmt.Sprintf("tag %d contains special characters", i))
		}
	}

	// Check for PII in text fields. An ordered slice is used instead of a map
	// because map iteration order is unspecified, which would shuffle the
	// warnings and findings between calls.
	// NOTE(review): only hypothesis, context and intent are scanned — confirm
	// whether any other free-text Narrative fields should be included.
	piiFields := []struct {
		name string
		text string
	}{
		{name: "hypothesis", text: n.Hypothesis},
		{name: "context", text: n.Context},
		{name: "intent", text: n.Intent},
	}
	for _, f := range piiFields {
		findings := privacy.DetectPII(f.text)
		if len(findings) == 0 {
			continue
		}
		result.PIIFindings = append(result.PIIFindings, findings...)
		result.Warnings = append(result.Warnings, fmt.Sprintf("potential PII detected in %s field", f.name))
	}

	return result
}
// ValidateOutcome checks the status, size limits and text content of an
// Outcome.
//
// An unrecognised status and hard limit violations are reported in Errors.
// A long summary, many follow-up runs and potential PII hits are reported in
// Warnings. A nil Outcome yields an empty result.
//
// Text limits are measured in characters (runes), matching the wording of
// the messages; byte length would over-count non-ASCII text.
func ValidateOutcome(o *Outcome) OutcomeValidation {
	result := OutcomeValidation{
		Warnings: make([]string, 0),
		Errors:   make([]string, 0),
	}
	if o == nil {
		return result
	}

	// Validate status
	if !isValidOutcomeStatus(o.Status) {
		result.Errors = append(result.Errors, fmt.Sprintf("invalid status: %s (must be validated, invalidated, inconclusive, partial, or empty)", o.Status))
	}

	// Validate summary length. len([]rune(s)) counts characters, whereas
	// len(s) counts bytes.
	summaryLen := len([]rune(o.Summary))
	if summaryLen > 1000 {
		result.Errors = append(result.Errors, "summary exceeds 1000 characters")
	} else if summaryLen > 200 {
		result.Warnings = append(result.Warnings, "summary is long (>200 chars)")
	}

	// Validate key learnings count
	if len(o.KeyLearnings) > 5 {
		result.Errors = append(result.Errors, "too many key learnings (max 5)")
	}

	// Validate key learning lengths
	for i, learning := range o.KeyLearnings {
		if len([]rune(learning)) > 500 {
			result.Errors = append(result.Errors, fmt.Sprintf("key learning %d exceeds 500 characters", i))
		}
	}

	// Validate follow-up runs references
	if len(o.FollowUpRuns) > 10 {
		result.Warnings = append(result.Warnings, "many follow-up runs (>10)")
	}

	// Check for PII in text fields. Only the presence of findings is
	// reported; OutcomeValidation has no field to carry the findings.
	if findings := privacy.DetectPII(o.Summary); len(findings) > 0 {
		result.Warnings = append(result.Warnings, "potential PII detected in summary")
	}
	for i, learning := range o.KeyLearnings {
		if findings := privacy.DetectPII(learning); len(findings) > 0 {
			result.Warnings = append(result.Warnings, fmt.Sprintf("potential PII detected in key learning %d", i))
		}
	}

	return result
}