Add comprehensive research context tracking to jobs: - Narrative fields: hypothesis, context, intent, expected_outcome - Experiment groups and tags for organization - Run comparison (compare command) for diff analysis - Run search (find command) with criteria filtering - Run export (export command) for data portability - Outcome setting (outcome command) for experiment validation Update queue and requeue commands to support narrative fields. Add narrative validation to manifest validator. Add WebSocket handlers for compare, find, export, and outcome operations. Includes E2E tests for phase 2 features.
301 lines
7.8 KiB
Go
301 lines
7.8 KiB
Go
package manifest
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"strings"
|
|
|
|
"github.com/jfraeys/fetch_ml/internal/privacy"
|
|
)
|
|
|
|
// ErrIncompleteManifest is the sentinel error wrapped by manifest validation
// failures in this package. Detect it with errors.Is rather than by comparing
// message text.
var ErrIncompleteManifest = errors.New("incomplete manifest")
|
|
|
|
// Validator checks a RunManifest against a configurable list of required
// field names before the run is executed.
type Validator struct {
	// requiredFields lists the manifest field names (for example "commit_id")
	// that Validate checks, in the order they are checked.
	requiredFields []string
}
|
|
|
|
// NewValidator creates a new manifest validator with default required fields.
|
|
func NewValidator() *Validator {
|
|
return &Validator{
|
|
requiredFields: []string{
|
|
"commit_id",
|
|
"deps_manifest_sha256",
|
|
},
|
|
}
|
|
}
|
|
|
|
// NewValidatorWithFields creates a validator with custom required fields.
|
|
func NewValidatorWithFields(fields []string) *Validator {
|
|
return &Validator{
|
|
requiredFields: fields,
|
|
}
|
|
}
|
|
|
|
// ValidationError describes a single failed check: the manifest field that was
// examined and a human-readable explanation of what is wrong with it.
type ValidationError struct {
	Field   string `json:"field"`
	Message string `json:"message"`
}

// Error implements the error interface, rendering the failure as
// "validation error for field '<field>': <message>".
func (e ValidationError) Error() string {
	return "validation error for field '" + e.Field + "': " + e.Message
}
|
|
|
|
// Validate checks that the manifest has all required fields.
|
|
// Returns an error listing all missing fields.
|
|
func (v *Validator) Validate(m *RunManifest) error {
|
|
if m == nil {
|
|
return fmt.Errorf("manifest is nil: %w", ErrIncompleteManifest)
|
|
}
|
|
|
|
var validationErrors []ValidationError
|
|
|
|
for _, field := range v.requiredFields {
|
|
if err := v.validateField(m, field); err != nil {
|
|
validationErrors = append(validationErrors, *err)
|
|
}
|
|
}
|
|
|
|
if len(validationErrors) > 0 {
|
|
// Build comprehensive error message
|
|
msg := "manifest validation failed:\n"
|
|
for _, err := range validationErrors {
|
|
msg += fmt.Sprintf(" - %s\n", err.Error())
|
|
}
|
|
return fmt.Errorf("%s: %w", msg, ErrIncompleteManifest)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// ValidateStrict fails if ANY optional fields commonly used for provenance are missing.
|
|
// This is for high-assurance environments.
|
|
func (v *Validator) ValidateStrict(m *RunManifest) error {
|
|
if err := v.Validate(m); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Additional strict checks
|
|
var strictErrors []ValidationError
|
|
|
|
if m.WorkerVersion == "" {
|
|
strictErrors = append(strictErrors, ValidationError{
|
|
Field: "worker_version",
|
|
Message: "required for strict provenance",
|
|
})
|
|
}
|
|
|
|
if m.PodmanImage == "" {
|
|
strictErrors = append(strictErrors, ValidationError{
|
|
Field: "podman_image",
|
|
Message: "required for strict provenance",
|
|
})
|
|
}
|
|
|
|
if len(strictErrors) > 0 {
|
|
msg := "strict manifest validation failed:\n"
|
|
for _, err := range strictErrors {
|
|
msg += fmt.Sprintf(" - %s\n", err.Error())
|
|
}
|
|
return fmt.Errorf("%s: %w", msg, ErrIncompleteManifest)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// validateField checks a single required field.
|
|
func (v *Validator) validateField(m *RunManifest, field string) *ValidationError {
|
|
switch field {
|
|
case "commit_id":
|
|
if m.CommitID == "" {
|
|
return &ValidationError{
|
|
Field: field,
|
|
Message: "commit_id is required for code provenance",
|
|
}
|
|
}
|
|
case "deps_manifest_sha256":
|
|
if m.DepsManifestSHA == "" {
|
|
return &ValidationError{
|
|
Field: field,
|
|
Message: "deps_manifest_sha256 is required for dependency provenance",
|
|
}
|
|
}
|
|
case "run_id":
|
|
if m.RunID == "" {
|
|
return &ValidationError{
|
|
Field: field,
|
|
Message: "run_id is required",
|
|
}
|
|
}
|
|
case "task_id":
|
|
if m.TaskID == "" {
|
|
return &ValidationError{
|
|
Field: field,
|
|
Message: "task_id is required",
|
|
}
|
|
}
|
|
case "job_name":
|
|
if m.JobName == "" {
|
|
return &ValidationError{
|
|
Field: field,
|
|
Message: "job_name is required",
|
|
}
|
|
}
|
|
case "snapshot_sha256":
|
|
if m.SnapshotID != "" && m.SnapshotSHA256 == "" {
|
|
return &ValidationError{
|
|
Field: field,
|
|
Message: "snapshot_sha256 is required when snapshot_id is provided",
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// IsValidationError checks if an error is a manifest validation error.
|
|
func IsValidationError(err error) bool {
|
|
return errors.Is(err, ErrIncompleteManifest)
|
|
}
|
|
|
|
// NarrativeValidation contains validation results.
|
|
type NarrativeValidation struct {
|
|
Warnings []string `json:"warnings,omitempty"`
|
|
Errors []string `json:"errors,omitempty"`
|
|
PIIFindings []privacy.PIIFinding `json:"pii_findings,omitempty"`
|
|
}
|
|
|
|
// OutcomeValidation is the result of validating an Outcome. Errors and
// Warnings hold human-readable messages; each is omitted from JSON when empty.
type OutcomeValidation struct {
	Warnings []string `json:"warnings,omitempty"`
	Errors   []string `json:"errors,omitempty"`
}
|
|
|
|
// ValidOutcomeStatuses lists the status values accepted for an outcome. The
// empty string is included, so an unset status is valid.
var ValidOutcomeStatuses = []string{
	"validated",
	"invalidated",
	"inconclusive",
	"partial",
	"",
}
|
|
|
|
// isValidOutcomeStatus checks if status is valid.
|
|
func isValidOutcomeStatus(status string) bool {
|
|
for _, s := range ValidOutcomeStatuses {
|
|
if s == status {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// ValidateNarrative validates a Narrative struct.
|
|
func ValidateNarrative(n *Narrative) NarrativeValidation {
|
|
result := NarrativeValidation{
|
|
Warnings: make([]string, 0),
|
|
Errors: make([]string, 0),
|
|
}
|
|
|
|
if n == nil {
|
|
return result
|
|
}
|
|
|
|
// Validate hypothesis length
|
|
if len(n.Hypothesis) > 5000 {
|
|
result.Errors = append(result.Errors, "hypothesis exceeds 5000 characters")
|
|
} else if len(n.Hypothesis) > 1000 {
|
|
result.Warnings = append(result.Warnings, "hypothesis is very long (>1000 chars)")
|
|
}
|
|
|
|
// Validate context length
|
|
if len(n.Context) > 10000 {
|
|
result.Errors = append(result.Errors, "context exceeds 10000 characters")
|
|
}
|
|
|
|
// Validate tags count
|
|
if len(n.Tags) > 50 {
|
|
result.Errors = append(result.Errors, "too many tags (max 50)")
|
|
} else if len(n.Tags) > 20 {
|
|
result.Warnings = append(result.Warnings, "many tags (>20)")
|
|
}
|
|
|
|
// Validate tag lengths
|
|
for i, tag := range n.Tags {
|
|
if len(tag) > 50 {
|
|
result.Errors = append(result.Errors, fmt.Sprintf("tag %d exceeds 50 characters", i))
|
|
}
|
|
if strings.ContainsAny(tag, ",;|/\\") {
|
|
result.Warnings = append(result.Warnings, fmt.Sprintf("tag %d contains special characters", i))
|
|
}
|
|
}
|
|
|
|
// Check for PII in text fields
|
|
fields := map[string]string{
|
|
"hypothesis": n.Hypothesis,
|
|
"context": n.Context,
|
|
"intent": n.Intent,
|
|
}
|
|
|
|
for fieldName, text := range fields {
|
|
if findings := privacy.DetectPII(text); len(findings) > 0 {
|
|
result.PIIFindings = append(result.PIIFindings, findings...)
|
|
result.Warnings = append(result.Warnings, fmt.Sprintf("potential PII detected in %s field", fieldName))
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|
|
|
|
// ValidateOutcome validates an Outcome struct.
|
|
func ValidateOutcome(o *Outcome) OutcomeValidation {
|
|
result := OutcomeValidation{
|
|
Warnings: make([]string, 0),
|
|
Errors: make([]string, 0),
|
|
}
|
|
|
|
if o == nil {
|
|
return result
|
|
}
|
|
|
|
// Validate status
|
|
if !isValidOutcomeStatus(o.Status) {
|
|
result.Errors = append(result.Errors, fmt.Sprintf("invalid status: %s (must be validated, invalidated, inconclusive, partial, or empty)", o.Status))
|
|
}
|
|
|
|
// Validate summary length
|
|
if len(o.Summary) > 1000 {
|
|
result.Errors = append(result.Errors, "summary exceeds 1000 characters")
|
|
} else if len(o.Summary) > 200 {
|
|
result.Warnings = append(result.Warnings, "summary is long (>200 chars)")
|
|
}
|
|
|
|
// Validate key learnings count
|
|
if len(o.KeyLearnings) > 5 {
|
|
result.Errors = append(result.Errors, "too many key learnings (max 5)")
|
|
}
|
|
|
|
// Validate key learning lengths
|
|
for i, learning := range o.KeyLearnings {
|
|
if len(learning) > 500 {
|
|
result.Errors = append(result.Errors, fmt.Sprintf("key learning %d exceeds 500 characters", i))
|
|
}
|
|
}
|
|
|
|
// Validate follow-up runs references
|
|
if len(o.FollowUpRuns) > 10 {
|
|
result.Warnings = append(result.Warnings, "many follow-up runs (>10)")
|
|
}
|
|
|
|
// Check for PII in text fields
|
|
if findings := privacy.DetectPII(o.Summary); len(findings) > 0 {
|
|
result.Warnings = append(result.Warnings, "potential PII detected in summary")
|
|
}
|
|
|
|
for i, learning := range o.KeyLearnings {
|
|
if findings := privacy.DetectPII(learning); len(findings) > 0 {
|
|
result.Warnings = append(result.Warnings, fmt.Sprintf("potential PII detected in key learning %d", i))
|
|
}
|
|
}
|
|
|
|
return result
|
|
}
|