// fetch_ml/internal/manifest/validator.go

package manifest

import (
	"errors"
	"fmt"
	"strings"

	"github.com/jfraeys/fetch_ml/internal/privacy"
)

// ErrIncompleteManifest is the sentinel error wrapped by Validate and
// ValidateStrict when a manifest is nil or fails one of their field checks.
// Match it with errors.Is (or IsValidationError) rather than comparing error
// strings.
var ErrIncompleteManifest = errors.New("incomplete manifest")

// Validator validates that a RunManifest is complete before execution.
type Validator struct {
	// requiredFields holds the manifest field names (for example
	// "commit_id") that Validate checks, in the order they are checked.
	requiredFields []string
}

// NewValidator creates a new manifest validator with the default required
// fields: "commit_id" and "deps_manifest_sha256".
func NewValidator() *Validator {
	return &Validator{
		requiredFields: []string{
			"commit_id",
			"deps_manifest_sha256",
		},
	}
}

// NewValidatorWithFields creates a validator with custom required fields.
//
// The slice is copied, so changes the caller later makes to fields do not
// alter what the validator checks. A nil or empty slice yields a validator
// with no required fields.
func NewValidatorWithFields(fields []string) *Validator {
	return &Validator{
		// Copy to detach from the caller's backing array.
		requiredFields: append([]string(nil), fields...),
	}
}

// ValidationError describes one failed check: the manifest field concerned
// and a human-readable explanation of the problem.
type ValidationError struct {
	Field   string `json:"field"`
	Message string `json:"message"`
}

// Error implements the error interface, rendering the failure as
// "validation error for field '<field>': <message>".
func (e ValidationError) Error() string {
	const layout = "validation error for field '%s': %s"
	return fmt.Sprintf(layout, e.Field, e.Message)
}

// Validate reports whether m carries every field this validator requires.
//
// A nil manifest is rejected immediately. Otherwise every required field is
// checked and all failures are gathered into a single multi-line message, so
// the caller sees every problem at once. Every non-nil error returned wraps
// ErrIncompleteManifest; a nil return means the manifest passed.
func (v *Validator) Validate(m *RunManifest) error {
	if m == nil {
		return fmt.Errorf("manifest is nil: %w", ErrIncompleteManifest)
	}

	var failures []ValidationError
	for i := range v.requiredFields {
		failure := v.validateField(m, v.requiredFields[i])
		if failure == nil {
			continue
		}
		failures = append(failures, *failure)
	}

	if len(failures) == 0 {
		return nil
	}

	// One bullet line per failed field.
	var report strings.Builder
	report.WriteString("manifest validation failed:\n")
	for _, failure := range failures {
		report.WriteString("  - ")
		report.WriteString(failure.Error())
		report.WriteString("\n")
	}
	return fmt.Errorf("%s: %w", report.String(), ErrIncompleteManifest)
}

// ValidateStrict runs Validate and, if that passes, additionally requires
// the worker_version and podman_image fields to be set.
// This is for high-assurance environments.
//
// If the base validation fails, its error is returned unchanged and the
// strict checks are not run. A strict failure lists every missing strict
// field and wraps ErrIncompleteManifest.
func (v *Validator) ValidateStrict(m *RunManifest) error {
	if baseErr := v.Validate(m); baseErr != nil {
		return baseErr
	}

	var missing []ValidationError
	// record notes a strict-only field that is empty.
	record := func(field string) {
		missing = append(missing, ValidationError{
			Field:   field,
			Message: "required for strict provenance",
		})
	}

	if m.WorkerVersion == "" {
		record("worker_version")
	}
	if m.PodmanImage == "" {
		record("podman_image")
	}

	if len(missing) == 0 {
		return nil
	}

	// One bullet line per missing strict field.
	var report strings.Builder
	report.WriteString("strict manifest validation failed:\n")
	for _, item := range missing {
		report.WriteString("  - ")
		report.WriteString(item.Error())
		report.WriteString("\n")
	}
	return fmt.Errorf("%s: %w", report.String(), ErrIncompleteManifest)
}

// validateField checks a single required field on m.
//
// It returns nil when the named field is satisfied, and a *ValidationError
// naming the field otherwise. Recognised names are "commit_id",
// "deps_manifest_sha256", "run_id", "task_id", "job_name" and
// "snapshot_sha256". The last is conditional: it only fails when snapshot_id
// is set but snapshot_sha256 is empty.
//
// Any other name is reported as a failure. Previously an unrecognised name
// fell through the switch and was treated as satisfied, so a misspelled or
// unsupported entry passed to NewValidatorWithFields silently disabled that
// check.
func (v *Validator) validateField(m *RunManifest, field string) *ValidationError {
	var (
		failed  bool
		message string
	)

	switch field {
	case "commit_id":
		failed = m.CommitID == ""
		message = "commit_id is required for code provenance"
	case "deps_manifest_sha256":
		failed = m.DepsManifestSHA == ""
		message = "deps_manifest_sha256 is required for dependency provenance"
	case "run_id":
		failed = m.RunID == ""
		message = "run_id is required"
	case "task_id":
		failed = m.TaskID == ""
		message = "task_id is required"
	case "job_name":
		failed = m.JobName == ""
		message = "job_name is required"
	case "snapshot_sha256":
		// Only required once a snapshot is actually referenced.
		failed = m.SnapshotID != "" && m.SnapshotSHA256 == ""
		message = "snapshot_sha256 is required when snapshot_id is provided"
	default:
		// Fail closed: a field this validator cannot check must not count
		// as validated.
		failed = true
		message = "unknown required field; cannot be validated"
	}

	if !failed {
		return nil
	}
	return &ValidationError{
		Field:   field,
		Message: message,
	}
}

// IsValidationError reports whether err is, or wraps, ErrIncompleteManifest,
// the sentinel attached to the errors returned by Validate and
// ValidateStrict. It does not look for a ValidationError value in the chain.
func IsValidationError(err error) bool {
	return errors.Is(err, ErrIncompleteManifest)
}

// NarrativeValidation is the result returned by ValidateNarrative.
// All three fields are omitted from the JSON encoding when empty.
type NarrativeValidation struct {
	// Warnings lists non-fatal observations (soft limits exceeded, special
	// characters in tags, potential PII).
	Warnings    []string             `json:"warnings,omitempty"`
	// Errors lists hard limit violations.
	Errors      []string             `json:"errors,omitempty"`
	// PIIFindings collects the findings returned by privacy.DetectPII for the
	// scanned text fields.
	PIIFindings []privacy.PIIFinding `json:"pii_findings,omitempty"`
}

// OutcomeValidation is the result returned by ValidateOutcome.
// Both fields are omitted from the JSON encoding when empty.
type OutcomeValidation struct {
	// Warnings lists non-fatal observations (soft limits exceeded, potential
	// PII).
	Warnings []string `json:"warnings,omitempty"`
	// Errors lists hard violations (invalid status, limits exceeded).
	Errors   []string `json:"errors,omitempty"`
}

// ValidOutcomeStatuses lists every status accepted for an Outcome. The empty
// string is included, so an unset status is accepted.
var ValidOutcomeStatuses = []string{
	"validated",
	"invalidated",
	"inconclusive",
	"partial",
	"",
}

// isValidOutcomeStatus reports whether status exactly matches one of the
// entries in ValidOutcomeStatuses.
func isValidOutcomeStatus(status string) bool {
	for i := 0; i < len(ValidOutcomeStatuses); i++ {
		if ValidOutcomeStatuses[i] == status {
			return true
		}
	}
	return false
}

// ValidateNarrative validates a Narrative struct.
//
// It checks the hypothesis and context lengths, the number of tags, each
// tag's length and contents, and scans the hypothesis, context and intent
// text with privacy.DetectPII. Hard limit violations go to Errors; soft
// limits, special characters in tags and PII hits go to Warnings. PII
// findings are also collected in PIIFindings.
//
// Text lengths are measured in Unicode code points rather than bytes, so the
// limits agree with the "characters" wording of the messages for non-ASCII
// text. The text fields are scanned in a fixed order (hypothesis, context,
// intent), so the order of warnings and findings is the same on every call.
//
// A nil Narrative yields a result with empty, non-nil Warnings and Errors.
func ValidateNarrative(n *Narrative) NarrativeValidation {
	result := NarrativeValidation{
		Warnings: make([]string, 0),
		Errors:   make([]string, 0),
	}

	if n == nil {
		return result
	}

	// len(s) on a string counts bytes; converting to []rune counts code
	// points, which is what the "characters" limits below refer to.
	runeLen := func(s string) int { return len([]rune(s)) }

	// Validate hypothesis length
	hypothesisLen := runeLen(n.Hypothesis)
	if hypothesisLen > 5000 {
		result.Errors = append(result.Errors, "hypothesis exceeds 5000 characters")
	} else if hypothesisLen > 1000 {
		result.Warnings = append(result.Warnings, "hypothesis is very long (>1000 chars)")
	}

	// Validate context length
	if runeLen(n.Context) > 10000 {
		result.Errors = append(result.Errors, "context exceeds 10000 characters")
	}

	// Validate tags count
	if len(n.Tags) > 50 {
		result.Errors = append(result.Errors, "too many tags (max 50)")
	} else if len(n.Tags) > 20 {
		result.Warnings = append(result.Warnings, "many tags (>20)")
	}

	// Validate tag lengths and contents; the two checks are independent, so
	// one tag can produce both an error and a warning.
	for i, tag := range n.Tags {
		if runeLen(tag) > 50 {
			result.Errors = append(result.Errors, fmt.Sprintf("tag %d exceeds 50 characters", i))
		}
		if strings.ContainsAny(tag, ",;|/\\") {
			result.Warnings = append(result.Warnings, fmt.Sprintf("tag %d contains special characters", i))
		}
	}

	// Check for PII in text fields. An ordered slice is used instead of a
	// map because map iteration order is not specified, which made the order
	// of warnings and findings vary between calls.
	textFields := []struct {
		name string
		text string
	}{
		{name: "hypothesis", text: n.Hypothesis},
		{name: "context", text: n.Context},
		{name: "intent", text: n.Intent},
	}

	for _, field := range textFields {
		findings := privacy.DetectPII(field.text)
		if len(findings) == 0 {
			continue
		}
		result.PIIFindings = append(result.PIIFindings, findings...)
		result.Warnings = append(result.Warnings, fmt.Sprintf("potential PII detected in %s field", field.name))
	}

	return result
}

// ValidateOutcome validates an Outcome struct.
//
// It checks that the status is one of ValidOutcomeStatuses, enforces limits
// on the summary length, the number and length of key learnings, and the
// number of follow-up runs, and scans the summary and each key learning with
// privacy.DetectPII. Hard violations go to Errors; soft limits and PII hits
// go to Warnings.
//
// Text lengths are measured in Unicode code points rather than bytes, so the
// limits agree with the "characters" wording of the messages for non-ASCII
// text.
//
// A nil Outcome yields a result with empty, non-nil Warnings and Errors.
func ValidateOutcome(o *Outcome) OutcomeValidation {
	result := OutcomeValidation{
		Warnings: make([]string, 0),
		Errors:   make([]string, 0),
	}

	if o == nil {
		return result
	}

	// len(s) on a string counts bytes; converting to []rune counts code
	// points, which is what the "characters" limits below refer to.
	runeLen := func(s string) int { return len([]rune(s)) }

	// Validate status
	if !isValidOutcomeStatus(o.Status) {
		result.Errors = append(result.Errors, fmt.Sprintf("invalid status: %s (must be validated, invalidated, inconclusive, partial, or empty)", o.Status))
	}

	// Validate summary length
	summaryLen := runeLen(o.Summary)
	if summaryLen > 1000 {
		result.Errors = append(result.Errors, "summary exceeds 1000 characters")
	} else if summaryLen > 200 {
		result.Warnings = append(result.Warnings, "summary is long (>200 chars)")
	}

	// Validate key learnings count
	if len(o.KeyLearnings) > 5 {
		result.Errors = append(result.Errors, "too many key learnings (max 5)")
	}

	// Validate key learning lengths
	for i, learning := range o.KeyLearnings {
		if runeLen(learning) > 500 {
			result.Errors = append(result.Errors, fmt.Sprintf("key learning %d exceeds 500 characters", i))
		}
	}

	// Validate follow-up runs references
	if len(o.FollowUpRuns) > 10 {
		result.Warnings = append(result.Warnings, "many follow-up runs (>10)")
	}

	// Check for PII in text fields. This stays a separate pass after the
	// follow-up check so the order of entries in Warnings is unchanged.
	if len(privacy.DetectPII(o.Summary)) > 0 {
		result.Warnings = append(result.Warnings, "potential PII detected in summary")
	}

	for i, learning := range o.KeyLearnings {
		if len(privacy.DetectPII(learning)) > 0 {
			result.Warnings = append(result.Warnings, fmt.Sprintf("potential PII detected in key learning %d", i))
		}
	}

	return result
}