Memoh/internal/agent/background/manager.go

// Package background implements a background task manager for long-running
// commands executed inside bot containers. It follows a task-notification
// architecture:
//
//  1. Commands can be started in the background (fire-and-forget).
//  2. Output is collected asynchronously and written to a file in the container.
//  3. When a task completes, a structured Notification is enqueued.
//  4. Notifications are scoped to (botID, sessionID); the agent loop drains
//     them at step boundaries and injects them as context messages so the
//     model learns about completed work.
//
// The manager is a server-level singleton, safe for concurrent use.
package background

import (
	"context"
	"crypto/rand"
	"encoding/hex"
	"fmt"
	"log/slog"
	"regexp"
	"slices"
	"strconv"
	"strings"
	"sync"
	"time"

	"github.com/memohai/memoh/internal/workspace/bridge"
)

const (
	// DefaultExecTimeout is the default timeout for foreground exec calls.
	// NOTE(review): presumably expressed in seconds like the other exec
	// timeouts in this block — confirm against callers.
	DefaultExecTimeout int32 = 30
	// MaxExecTimeout is the maximum allowed timeout, in seconds (10 minutes).
	MaxExecTimeout int32 = 600
	// BackgroundExecTimeout is the timeout for background tasks, in seconds
	// (30 minutes).
	BackgroundExecTimeout int32 = 1800
	// DefaultCleanupInterval is how often the manager prunes old completed tasks.
	// StartCleanupLoop falls back to it when given a non-positive interval.
	DefaultCleanupInterval = time.Hour
	// DefaultTaskRetention is how long completed tasks are retained in memory.
	// StartCleanupLoop falls back to it when given a non-positive max age.
	DefaultTaskRetention = 24 * time.Hour
	// OutputLogDir is the directory inside the container where background
	// task output logs are written.
	OutputLogDir = "/tmp/memoh-bg"

	// stallCheckInterval is how often the stall watchdog checks output growth.
	stallCheckInterval = 5 * time.Second
	// stallThreshold is the duration of zero output growth before we consider
	// the command stalled and possibly waiting for interactive input.
	stallThreshold = 45 * time.Second
)

// ExecFunc executes a command in a container and returns the result.
// This is the signature that bridge.Client.Exec satisfies.
//
// command is the shell command line, workDir the working directory to run it
// in, and timeout the execution limit the manager passes along (it supplies
// BackgroundExecTimeout for background tasks).
type ExecFunc func(ctx context.Context, command, workDir string, timeout int32) (*bridge.ExecResult, error)

// WriteFileFunc writes content to a file in the container.
// The manager uses it to create the output directory marker and to persist
// the output log of adopted tasks.
type WriteFileFunc func(ctx context.Context, path string, data []byte) error

// ReadFileFunc reads content from a file in the container.
// The manager uses it to read back the exit-code sentinel file written by the
// wrapped background command.
type ReadFileFunc func(ctx context.Context, path string) ([]byte, error)

// Manager tracks background tasks and delivers completion notifications.
//
// tasks, notifications and wakeFunc are read and written under mu. Per-task
// state (status, timestamps, output) is guarded by each Task's own mutex.
type Manager struct {
	mu            sync.Mutex
	tasks         map[string]*Task // taskID -> Task
	notifications []Notification   // pending notifications, protected by mu
	logger        *slog.Logger
	wakeFunc      func(botID, sessionID string) // optional callback to wake agent on new notification
}

// New returns a Manager with an empty task table. A nil logger falls back to
// slog.Default(); in both cases the logger is tagged with service=background.
func New(logger *slog.Logger) *Manager {
	base := logger
	if base == nil {
		base = slog.Default()
	}

	mgr := &Manager{tasks: make(map[string]*Task)}
	mgr.logger = base.With(slog.String("service", "background"))
	return mgr
}

// SetWakeFunc installs the callback that enqueueNotification launches (in its
// own goroutine) each time a notification is queued, so a sleeping agent can
// drain it right away instead of waiting for user input. Passing nil removes
// the callback.
func (m *Manager) SetWakeFunc(fn func(botID, sessionID string)) {
	m.mu.Lock()
	defer m.mu.Unlock()

	m.wakeFunc = fn
}

// enqueueNotification adds n to the pending queue and then, when a wake
// function is registered, runs it in a new goroutine with the notification's
// bot and session IDs. The wake function is captured under the lock but
// invoked outside of it.
func (m *Manager) enqueueNotification(n Notification) {
	wake := func() func(botID, sessionID string) {
		m.mu.Lock()
		defer m.mu.Unlock()

		m.notifications = append(m.notifications, n)
		return m.wakeFunc
	}()

	hasWake := wake != nil
	m.logger.Info("notification enqueued",
		slog.String("task_id", n.TaskID),
		slog.String("bot_id", n.BotID),
		slog.Bool("has_wake_func", hasWake),
	)

	if !hasWake {
		return
	}
	go wake(n.BotID, n.SessionID)
}

// Spawn registers a new running task for command and launches it in a
// goroutine, returning immediately with the generated task ID and the path of
// the task's output log inside the container. When the command finishes, a
// Notification is enqueued on the manager; retrieve it with
// DrainNotifications.
//
// execFn should call bridge.Client.Exec (or equivalent).
// writeFn should call bridge.Client.WriteFile; it is used to create the
// output directory.
// readFn, when non-nil, is used to read back the command's exit code from the
// sentinel file written next to the output log.
func (m *Manager) Spawn(
	parentCtx context.Context,
	botID, sessionID, command, workDir, description string,
	execFn ExecFunc,
	writeFn WriteFileFunc,
	readFn ReadFileFunc,
) (taskID, outputFile string) {
	var task *Task

	// Allocate the ID and publish the task in one critical section so the ID
	// cannot be handed out twice.
	func() {
		m.mu.Lock()
		defer m.mu.Unlock()

		taskID = m.newTaskIDLocked(botID)
		outputFile = OutputLogDir + "/" + taskID + ".log"
		task = &Task{
			ID:          taskID,
			BotID:       botID,
			SessionID:   sessionID,
			Command:     command,
			Description: description,
			WorkDir:     workDir,
			Status:      TaskRunning,
			OutputFile:  outputFile,
			StartedAt:   time.Now(),
		}
		m.tasks[taskID] = task
	}()

	m.logger.Info("background task spawned",
		slog.String("task_id", taskID),
		slog.String("bot_id", botID),
		slog.String("command", truncate(command, 120)),
	)

	go m.run(parentCtx, task, execFn, writeFn, readFn)

	return taskID, outputFile
}

// SpawnAdopt registers a running task for a command that was started
// elsewhere (e.g. via ExecStream) and is still executing. The command is not
// re-run; a goroutine waits for its outcome on resultCh instead. This enables
// "flip to background", where a foreground stream is handed off without
// killing the process.
//
// It returns the generated task ID and the path of the task's output log
// inside the container. writeFn is used to persist the collected output.
func (m *Manager) SpawnAdopt(
	parentCtx context.Context,
	botID, sessionID, command, workDir, description string,
	resultCh <-chan AdoptResult,
	writeFn WriteFileFunc,
) (taskID, outputFile string) {
	var task *Task

	// Allocate the ID and publish the task in one critical section so the ID
	// cannot be handed out twice.
	func() {
		m.mu.Lock()
		defer m.mu.Unlock()

		taskID = m.newTaskIDLocked(botID)
		outputFile = OutputLogDir + "/" + taskID + ".log"
		task = &Task{
			ID:          taskID,
			BotID:       botID,
			SessionID:   sessionID,
			Command:     command,
			Description: description,
			WorkDir:     workDir,
			Status:      TaskRunning,
			OutputFile:  outputFile,
			StartedAt:   time.Now(),
		}
		m.tasks[taskID] = task
	}()

	m.logger.Info("background task adopted",
		slog.String("task_id", taskID),
		slog.String("bot_id", botID),
		slog.String("command", truncate(command, 120)),
	)

	go m.runAdopt(parentCtx, task, resultCh, writeFn)

	return taskID, outputFile
}

// newTaskIDLocked generates an ID of the form bg_<prefix>_<8 hex chars> that
// is not yet present in m.tasks, where <prefix> is at most the first 8 bytes
// of botID. It reads m.tasks, so the caller must hold m.mu.
func (m *Manager) newTaskIDLocked(botID string) string {
	prefix := botID
	if len(prefix) > 8 {
		prefix = prefix[:8]
	}

	for {
		candidate := "bg_" + prefix + "_" + shortRandHex(4)
		if _, taken := m.tasks[candidate]; taken {
			continue
		}
		return candidate
	}
}

// shortRandHex returns 2*n lowercase hex characters encoding n bytes read from
// crypto/rand. A non-positive n is treated as 4. It panics if the random
// source cannot be read.
func shortRandHex(n int) string {
	size := n
	if size < 1 {
		size = 4
	}

	raw := make([]byte, size)
	_, err := rand.Read(raw)
	if err != nil {
		panic(fmt.Errorf("background: read random bytes: %w", err))
	}

	encoded := make([]byte, hex.EncodedLen(len(raw)))
	hex.Encode(encoded, raw)
	return string(encoded)
}

// runAdopt waits for the outcome of an adopted (already running) command and
// finalizes the task.
//
// It installs a cancel function on the task, starts the stall watchdog, and
// then blocks until either a result arrives on resultCh or the detached
// context (bounded by BackgroundExecTimeout) ends. On success the combined
// stdout/stderr is written to the task's output file via writeFn. Finally the
// task is completed, which enqueues the notification.
//
// A resultCh that is closed without delivering a value is reported as a
// failure rather than being mistaken for a successful run with exit code 0.
// Failures to prepare the output directory or write the log are logged and
// otherwise ignored: they must not prevent the task from completing.
func (m *Manager) runAdopt(parentCtx context.Context, task *Task, resultCh <-chan AdoptResult, writeFn WriteFileFunc) {
	ctx, cancel := detachedContextWithTimeout(parentCtx, time.Duration(BackgroundExecTimeout)*time.Second)
	task.mu.Lock()
	task.cancel = cancel
	task.mu.Unlock()
	defer cancel()

	// Ensure output directory exists (best effort).
	if dirErr := ensureOutputDir(ctx, writeFn); dirErr != nil {
		m.logger.Warn("background task: could not prepare output directory",
			slog.String("task_id", task.ID),
			slog.Any("error", dirErr),
		)
	}

	// Start stall watchdog.
	go m.stallWatchdog(ctx, task)

	// Wait for the result from the already-running stream.
	var result AdoptResult
	select {
	case r, ok := <-resultCh:
		if ok {
			result = r
		} else {
			// The producer went away without reporting an outcome.
			result = AdoptResult{Err: fmt.Errorf("adopted stream closed without delivering a result")}
		}
	case <-ctx.Done():
		result = AdoptResult{Err: ctx.Err()}
	}

	// Write output to log file in container.
	if writeFn != nil && result.Err == nil {
		combined := result.Stdout
		if result.Stderr != "" {
			combined += "\n--- stderr ---\n" + result.Stderr
		}
		// Detach from ctx so a concurrent cancel does not abort the write.
		if writeErr := writeFn(context.WithoutCancel(ctx), task.OutputFile, []byte(combined)); writeErr != nil {
			m.logger.Warn("background task: could not write output log",
				slog.String("task_id", task.ID),
				slog.String("output_file", task.OutputFile),
				slog.Any("error", writeErr),
			)
		}
	}

	m.completeTask(task, result.Stdout, result.Stderr, result.Err, result.ExitCode)
}

// run executes a spawned task's command and finalizes the task.
//
// The command runs under a context detached from parentCtx's cancellation and
// bounded by BackgroundExecTimeout; its cancel function is stored on the task.
// The command is wrapped so that its combined output is tee'd into the task's
// output file and its real exit code is written to "<output file>.exit".
//
// After execFn returns, the exit code from the sentinel file (when readFn is
// provided and the file is readable) takes precedence over the one reported
// by execFn, and also clears a stream error. The stdout/stderr captured by
// execFn are always kept, so the completion notification carries the output
// tail regardless of where the exit code came from.
func (m *Manager) run(parentCtx context.Context, task *Task, execFn ExecFunc, writeFn WriteFileFunc, readFn ReadFileFunc) {
	ctx, cancel := detachedContextWithTimeout(parentCtx, time.Duration(BackgroundExecTimeout)*time.Second)
	task.mu.Lock()
	task.cancel = cancel
	task.mu.Unlock()
	defer cancel()

	// Ensure output directory exists (best effort). The wrapped command
	// redirects into this directory, so a failure here is worth surfacing.
	if dirErr := ensureOutputDir(ctx, writeFn); dirErr != nil {
		m.logger.Warn("background task: could not prepare output directory",
			slog.String("task_id", task.ID),
			slog.Any("error", dirErr),
		)
	}

	// Start stall watchdog to detect commands waiting for interactive input.
	go m.stallWatchdog(ctx, task)

	// Wrap command to tee output to the log file inside the container and
	// capture the command exit code into a sentinel file via fd 3 redirect.
	// Even if the gRPC stream dies after process completion, we can recover
	// the actual exit code by reading the sentinel file.
	wrappedCmd := fmt.Sprintf(
		"{ { ( %s ) ; echo $? >&3 ; } 2>&1 | tee %s ; } 3>%s.exit",
		task.Command, task.OutputFile, task.OutputFile,
	)

	result, err := execFn(ctx, wrappedCmd, task.WorkDir, BackgroundExecTimeout)
	if err != nil {
		m.logger.Warn("background task: execFn returned error",
			slog.String("task_id", task.ID),
			slog.Any("exec_error", err),
		)
	}

	// Start from whatever execFn reported.
	var stdout, stderr string
	var exitCode int32
	if result != nil {
		stdout = result.Stdout
		stderr = result.Stderr
		exitCode = result.ExitCode
	}

	// Always prefer the sentinel file for the real exit code.
	// The wrappedCmd uses a pipeline: the shell exits with tee's code (0),
	// not the actual command's code. The sentinel captures the real value.
	// On gRPC error the sentinel also lets us recover without -1.
	if readFn != nil {
		ec, recoverErr := readSentinelExitCode(ctx, task.OutputFile+".exit", readFn)
		switch {
		case recoverErr == nil:
			if err != nil {
				m.logger.Info("background task: recovered exit code from sentinel file after stream error",
					slog.String("task_id", task.ID),
					slog.Int("recovered_exit_code", int(ec)),
					slog.Any("stream_error", err),
				)
			}
			// Override only the exit code; the captured output is kept.
			exitCode = ec
			err = nil
		case err != nil:
			m.logger.Warn("background task: sentinel recovery failed",
				slog.String("task_id", task.ID),
				slog.Any("recover_error", recoverErr),
			)
		}
		// If err==nil but sentinel unreadable: keep the gRPC exit code.
	}

	m.completeTask(task, stdout, stderr, err, exitCode)
}

// completeTask records the outcome of a finished task and enqueues its
// completion notification.
//
// The output (or an "[error] ..." line when execErr is non-nil) is appended
// to the task first. A task already marked TaskKilled is then left untouched.
// Otherwise the task becomes TaskFailed with exit code -1 on execErr,
// TaskFailed with exitCode on a non-zero exit, or TaskCompleted on exit 0.
// The notification is only enqueued if task.MarkNotified reports that none
// was sent for this task before.
func (m *Manager) completeTask(task *Task, stdout, stderr string, execErr error, exitCode int32) {
	failedToRun := execErr != nil

	switch {
	case failedToRun:
		task.AppendOutput(fmt.Sprintf("[error] %v\n", execErr))
	default:
		task.AppendOutput(stdout)
		if stderr != "" {
			task.AppendOutput(stderr)
		}
	}

	task.mu.Lock()
	if task.Status == TaskKilled {
		// Kill already finalized this task.
		task.mu.Unlock()
		return
	}
	task.CompletedAt = time.Now()
	switch {
	case failedToRun:
		task.Status = TaskFailed
		task.ExitCode = -1
	case exitCode != 0:
		task.Status = TaskFailed
		task.ExitCode = exitCode
	default:
		task.Status = TaskCompleted
		task.ExitCode = exitCode
	}
	// Snapshot the final values while still holding the lock.
	outcome, code := task.Status, task.ExitCode
	elapsed := task.CompletedAt.Sub(task.StartedAt)
	task.mu.Unlock()

	m.logger.Info("background task finished",
		slog.String("task_id", task.ID),
		slog.String("status", string(outcome)),
		slog.Int("exit_code", int(code)),
		slog.Duration("duration", elapsed),
	)

	// Only the first caller to mark the task as notified may enqueue, which
	// prevents a second notification for the same task.
	if task.MarkNotified() {
		m.enqueueNotification(Notification{
			TaskID:      task.ID,
			BotID:       task.BotID,
			SessionID:   task.SessionID,
			Status:      outcome,
			Command:     task.Command,
			Description: task.Description,
			ExitCode:    code,
			OutputFile:  task.OutputFile,
			OutputTail:  task.OutputTail(),
			Duration:    elapsed,
		})
	}
}

// readSentinelExitCode reads the file at path through readFn and parses its
// whitespace-trimmed content as a decimal exit code.
//
// It returns the error from readFn unchanged, or a wrapped parse error when
// the content is not an integer that fits in an int32 (including an empty
// file, which is what the sentinel contains until the command has finished).
func readSentinelExitCode(ctx context.Context, path string, readFn ReadFileFunc) (int32, error) {
	data, err := readFn(ctx, path)
	if err != nil {
		return 0, err
	}
	// bitSize 32 makes ParseInt reject out-of-range values, so the
	// conversion below cannot silently wrap around.
	ec, err := strconv.ParseInt(strings.TrimSpace(string(data)), 10, 32)
	if err != nil {
		return 0, fmt.Errorf("parse exit code %q: %w", string(data), err)
	}
	return int32(ec), nil
}

// ensureOutputDir writes an empty ".keep" marker file into OutputLogDir via
// writeFn and returns writeFn's error. It is a no-op returning nil when
// writeFn is nil.
// NOTE(review): this relies on writeFn creating missing parent directories —
// confirm against the bridge implementation.
func ensureOutputDir(ctx context.Context, writeFn WriteFileFunc) error {
	if writeFn != nil {
		marker := OutputLogDir + "/.keep"
		return writeFn(ctx, marker, []byte(""))
	}
	return nil
}

// Kill cancels a running background task.
//
// The task is marked TaskKilled with the current time as its completion time,
// and then its Cancel method is invoked. It returns an error when no task
// with taskID exists or when the task is not in the TaskRunning state.
func (m *Manager) Kill(taskID string) error {
	m.mu.Lock()
	task, ok := m.tasks[taskID]
	m.mu.Unlock()
	if !ok {
		return fmt.Errorf("task %s not found", taskID)
	}

	task.mu.Lock()
	// Snapshot the status while holding the lock so the error message below
	// does not read it concurrently with a writer.
	if status := task.Status; status != TaskRunning {
		task.mu.Unlock()
		return fmt.Errorf("task %s is not running (status: %s)", taskID, status)
	}
	task.Status = TaskKilled
	task.CompletedAt = time.Now()
	task.mu.Unlock()

	task.Cancel()
	m.logger.Info("background task killed", slog.String("task_id", taskID))
	return nil
}

// Get looks up a task by its ID. The result is nil when the ID is unknown.
func (m *Manager) Get(taskID string) *Task {
	m.mu.Lock()
	task := m.tasks[taskID]
	m.mu.Unlock()

	return task
}

// GetForSession looks up a task by ID and returns it only when both its bot
// ID and session ID match the ones given; otherwise it returns nil.
func (m *Manager) GetForSession(botID, sessionID, taskID string) *Task {
	m.mu.Lock()
	defer m.mu.Unlock()

	if task := m.tasks[taskID]; task != nil && task.BotID == botID && task.SessionID == sessionID {
		return task
	}
	return nil
}

// ListForSession returns all tasks for a given bot+session, most recent first.
//
// Tasks are ordered by StartedAt descending; tasks with the same start time
// are ordered by ID so the result is deterministic. The result is nil when
// the session has no tasks.
func (m *Manager) ListForSession(botID, sessionID string) []*Task {
	m.mu.Lock()
	defer m.mu.Unlock()

	var result []*Task
	for _, t := range m.tasks {
		if t.BotID == botID && t.SessionID == sessionID {
			result = append(result, t)
		}
	}

	// Map iteration order is random, so sort explicitly to honor the
	// documented ordering.
	slices.SortFunc(result, func(a, b *Task) int {
		if c := b.StartedAt.Compare(a.StartedAt); c != 0 {
			return c
		}
		return strings.Compare(a.ID, b.ID)
	})
	return result
}

// KillForSession kills the task with taskID, but only if it belongs to the
// given bot+session. A task owned by another session is reported as not
// found; otherwise the result is whatever Kill returns.
func (m *Manager) KillForSession(botID, sessionID, taskID string) error {
	if m.GetForSession(botID, sessionID, taskID) != nil {
		return m.Kill(taskID)
	}
	return fmt.Errorf("task %s not found", taskID)
}

// DrainNotifications returns all pending notifications for a given
// bot+session without blocking. Used by the resolver to inject
// notifications at the start of a new agent run.
//
// Matching notifications are removed from the pending queue; notifications
// for other sessions keep their relative order. The result is nil when
// nothing matches.
func (m *Manager) DrainNotifications(botID, sessionID string) []Notification {
	m.mu.Lock()
	defer m.mu.Unlock()

	var matched []Notification
	remaining := m.notifications[:0] // reuse backing array
	for _, n := range m.notifications {
		if n.BotID == botID && n.SessionID == sessionID {
			matched = append(matched, n)
		} else {
			remaining = append(remaining, n)
		}
	}
	// Zero the now-unused tail of the shared backing array so the drained
	// notifications (and the output tails they hold) can be garbage-collected.
	clear(m.notifications[len(remaining):])
	m.notifications = remaining
	return matched
}

// HasNotifications tells whether at least one notification is queued for the
// given bot+session. The queue is left unchanged.
func (m *Manager) HasNotifications(botID, sessionID string) bool {
	m.mu.Lock()
	defer m.mu.Unlock()

	for i := range m.notifications {
		pending := &m.notifications[i]
		if pending.BotID != botID || pending.SessionID != sessionID {
			continue
		}
		return true
	}
	return false
}

// RunningTasksSummary renders the running tasks of a bot+session as text: a
// header line followed by one "- [id] description (...)" line per task, each
// terminated by a newline. Tasks without a description are shown by their
// command, truncated to 80 bytes. It returns "" when nothing is running.
// This is injected into the system prompt so the agent knows about ongoing
// background work. Line order follows map iteration and is not stable.
func (m *Manager) RunningTasksSummary(botID, sessionID string) string {
	m.mu.Lock()
	defer m.mu.Unlock()

	var entries []string
	for _, t := range m.tasks {
		// Copy what we need under the task lock, format after releasing it.
		t.mu.Lock()
		relevant := t.BotID == botID && t.SessionID == sessionID && t.Status == TaskRunning
		taskID, label, cmd := t.ID, t.Description, t.Command
		started, logPath := t.StartedAt, t.OutputFile
		t.mu.Unlock()

		if relevant {
			if label == "" {
				label = truncate(cmd, 80)
			}
			age := time.Since(started).Round(time.Second)
			entries = append(entries, fmt.Sprintf("- [%s] %s (started %s ago, output: %s)",
				taskID, label, age, logPath,
			))
		}
	}

	if len(entries) > 0 {
		return "Currently running background tasks:\n" + strings.Join(entries, "\n") + "\n"
	}
	return ""
}

// Cleanup removes completed tasks older than the given duration.
//
// A task is removed when it is no longer TaskRunning and its CompletedAt lies
// more than maxAge in the past. Running tasks are never removed.
func (m *Manager) Cleanup(maxAge time.Duration) {
	m.mu.Lock()
	defer m.mu.Unlock()

	cutoff := time.Now().Add(-maxAge)
	for id, t := range m.tasks {
		// Status and CompletedAt are written under the task's own mutex
		// (by completeTask and Kill), so read them under it as well.
		t.mu.Lock()
		expired := t.Status != TaskRunning && t.CompletedAt.Before(cutoff)
		t.mu.Unlock()

		if expired {
			delete(m.tasks, id)
		}
	}
}

// StartCleanupLoop calls Cleanup(maxAge) once per interval and blocks until
// done is closed, so callers normally run it in its own goroutine.
// Non-positive values for interval and maxAge are replaced by
// DefaultCleanupInterval and DefaultTaskRetention respectively.
func (m *Manager) StartCleanupLoop(done <-chan struct{}, interval, maxAge time.Duration) {
	every, retain := interval, maxAge
	if every <= 0 {
		every = DefaultCleanupInterval
	}
	if retain <= 0 {
		retain = DefaultTaskRetention
	}

	tick := time.NewTicker(every)
	defer tick.Stop()

	for {
		select {
		case <-done:
			return
		case <-tick.C:
			m.Cleanup(retain)
		}
	}
}

// RequeueNotifications appends ns to the end of the pending queue so a batch
// whose proactive delivery failed can be drained again later. The wake
// function is not invoked. An empty ns is a no-op.
func (m *Manager) RequeueNotifications(ns []Notification) {
	if len(ns) > 0 {
		m.mu.Lock()
		defer m.mu.Unlock()

		m.notifications = append(m.notifications, ns...)
	}
}

// promptPatterns matches common interactive prompt endings that indicate
// a command is waiting for user input.
//
// Matching is case-insensitive. The alternatives cover shell-style prompts
// ("$", ">" or "#", optionally followed by one space, at the very end of the
// text), password/passphrase prompts, "y/n]" and "yes/no)" confirmations,
// "enter ...:" requests, and phrases such as "Press ... to continue",
// "Are you sure", "Continue?" and "Proceed?". Only the three shell-style
// alternatives are anchored to the end of the text; the others match
// anywhere in it.
var promptPatterns = regexp.MustCompile(
	`(?i)(\$ ?$|> ?$|# ?$|password\s*:|passphrase\s*:|y/n\]|yes/no\)|enter .*:|Press .* to continue|Are you sure|Continue\?|Proceed\?)`,
)

// stallWatchdog polls a running task every stallCheckInterval and looks for
// output that has stopped growing for at least stallThreshold while its tail
// matches promptPatterns, which suggests the command is waiting for
// interactive input. In that case a single notification flagged as Stalled is
// enqueued (unless the task was already marked as notified) and the watchdog
// exits. It also exits when ctx ends or the task is no longer running.
func (m *Manager) stallWatchdog(ctx context.Context, task *Task) {
	tick := time.NewTicker(stallCheckInterval)
	defer tick.Stop()

	var (
		prevSize  int       // output size seen at the previous check
		idleSince time.Time // zero while output is still growing
	)

	for {
		select {
		case <-tick.C:
		case <-ctx.Done():
			return
		}

		// Snapshot size and content under the task lock.
		var (
			size     int
			snapshot string
		)
		task.mu.Lock()
		running := task.Status == TaskRunning
		if running {
			size = task.output.Len()
			snapshot = task.output.String()
		}
		task.mu.Unlock()
		if !running {
			return
		}
		// Keep only the last maxTailBytes bytes.
		if len(snapshot) > maxTailBytes {
			snapshot = snapshot[len(snapshot)-maxTailBytes:]
		}

		switch {
		case size != prevSize:
			// New output arrived: restart the idle clock.
			prevSize = size
			idleSince = time.Time{}
			continue
		case idleSince.IsZero():
			// First check without growth: start the idle clock.
			idleSince = time.Now()
			continue
		case time.Since(idleSince) < stallThreshold, !promptPatterns.MatchString(snapshot):
			// Not idle long enough, or the tail does not look like a prompt.
			continue
		}

		m.logger.Warn("background task appears stalled on interactive prompt",
			slog.String("task_id", task.ID),
		)

		// At most one notification per task.
		if !task.MarkNotified() {
			return
		}

		m.enqueueNotification(Notification{
			TaskID:      task.ID,
			BotID:       task.BotID,
			SessionID:   task.SessionID,
			Status:      TaskRunning, // still running, but stalled
			Command:     task.Command,
			Description: task.Description,
			ExitCode:    0,
			OutputFile:  task.OutputFile,
			OutputTail:  snapshot,
			Duration:    time.Since(task.StartedAt),
			Stalled:     true,
		})
		return
	}
}

// truncate shortens s to at most n bytes and appends "..." when anything was
// cut off. Strings of n bytes or fewer are returned unchanged.
//
// The cut never splits a multi-byte UTF-8 character: if byte offset n falls
// inside one, the whole character is dropped, so the kept prefix may be
// slightly shorter than n bytes. A negative n is treated as 0.
func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	// Ranging over a string yields the byte offset of each character, so
	// this finds the last character boundary that is not beyond n.
	cut := 0
	for i := range s {
		if i > n {
			break
		}
		cut = i
	}
	return s[:cut] + "..."
}

// joinLines concatenates lines, terminating every line (including the last)
// with a newline. An empty or nil slice yields "".
func joinLines(lines []string) string {
	var b strings.Builder
	for _, line := range lines {
		b.WriteString(line)
		b.WriteByte('\n')
	}
	return b.String()
}

// detachedContextWithTimeout derives a context that keeps parentCtx's values
// but ignores its cancellation, and that expires on its own after timeout.
// A nil parentCtx is treated as context.Background(). The returned cancel
// function must be called to release the timer.
func detachedContextWithTimeout(parentCtx context.Context, timeout time.Duration) (context.Context, context.CancelFunc) {
	parent := parentCtx
	if parent == nil {
		parent = context.Background()
	}
	detached := context.WithoutCancel(parent)
	return context.WithTimeout(detached, timeout)
}