// mGPUmanager/internal/gpu/gpu.go

// Package gpu polls nvidia-smi for live VRAM usage.
//
// Step 5 uses this to detect VRAM pressure and trigger LRU eviction.
// On hosts without an NVIDIA GPU (e.g. m's laptop during local dev) the
// poller silently reports zero usage so the scheduler can still run.
package gpu

import (
	"context"
	"log/slog"
	"os/exec"
	"strconv"
	"strings"
	"sync"
	"time"
)

// Sample is a single GPU memory reading.
//
// A reading that could not be obtained is represented by a Sample whose Err
// is non-empty and whose memory fields are left at zero.
type Sample struct {
	// UsedMiB, FreeMiB and TotalMiB are the used, free and total GPU
	// memory, in MiB.
	UsedMiB, FreeMiB, TotalMiB int

	// At is the time the reading was taken.
	At time.Time

	// Err describes why the reading failed; it is empty on success.
	Err string
}

// Poller samples GPU memory on a fixed cadence and keeps the most recent
// reading available to concurrent readers.
type Poller struct {
	// interval is the delay between two consecutive samples.
	interval time.Duration

	// logger receives debug output about failed samples; it may be nil.
	logger *slog.Logger

	// mu guards last.
	mu   sync.RWMutex
	last Sample
}

// NewPoller returns a Poller that samples every interval and reports
// failures to logger.
//
// A zero or negative interval is replaced by a default of 2 seconds. The
// logger may be nil, in which case failed samples are not logged.
func NewPoller(interval time.Duration, logger *slog.Logger) *Poller {
	poller := &Poller{
		logger:   logger,
		interval: 2 * time.Second,
	}
	if interval > 0 {
		poller.interval = interval
	}
	return poller
}

// Run takes one sample immediately and then one per tick of the configured
// interval. It blocks until ctx is cancelled, so callers normally start it
// in its own goroutine.
func (p *Poller) Run(ctx context.Context) {
	// Prime the poller so Last has a reading before the first tick.
	p.sampleOnce(ctx)

	ticker := time.NewTicker(p.interval)
	defer ticker.Stop()

	done := ctx.Done()
	for {
		select {
		case <-ticker.C:
			p.sampleOnce(ctx)
		case <-done:
			return
		}
	}
}

// Last returns a copy of the most recently stored sample. It is the zero
// Sample until the first reading has been stored.
func (p *Poller) Last() Sample {
	p.mu.RLock()
	latest := p.last
	p.mu.RUnlock()
	return latest
}

// SetSampleForTest overwrites p's latest sample with a synthetic reading so
// tests can exercise the scheduler's eviction logic without a real GPU or
// nvidia-smi. UsedMiB is derived as totalMiB - freeMiB and the sample is
// stamped with the current time. Production code should never call this.
func SetSampleForTest(p *Poller, freeMiB, totalMiB int) {
	synthetic := Sample{At: time.Now()}
	synthetic.TotalMiB = totalMiB
	synthetic.FreeMiB = freeMiB
	synthetic.UsedMiB = totalMiB - freeMiB
	p.store(synthetic)
}

// sampleOnce runs nvidia-smi once and stores the outcome as the latest
// sample.
//
// The command is bounded by a 2-second timeout derived from ctx. If the
// command fails, or its output is rejected by parseSMI, the stored Sample
// carries only the timestamp and the failure text in Err; the memory fields
// stay zero. Both kinds of failure are reported at debug level when a logger
// is configured.
func (p *Poller) sampleOnce(ctx context.Context) {
	cctx, cancel := context.WithTimeout(ctx, 2*time.Second)
	defer cancel()

	// memory.used,memory.free,memory.total in MiB, no units, no header.
	cmd := exec.CommandContext(cctx, "nvidia-smi",
		"--query-gpu=memory.used,memory.free,memory.total",
		"--format=csv,noheader,nounits")
	out, err := cmd.Output()
	now := time.Now()
	if err != nil {
		p.store(Sample{At: now, Err: err.Error()})
		if p.logger != nil {
			p.logger.Debug("nvidia-smi failed", "err", err)
		}
		return
	}

	used, free, total, perr := parseSMI(string(out))
	if perr != "" {
		p.store(Sample{At: now, Err: perr})
		// Report parse failures the same way as exec failures so an
		// unexpected output format is visible when debugging.
		if p.logger != nil {
			p.logger.Debug("nvidia-smi output not parseable", "err", perr)
		}
		return
	}
	p.store(Sample{UsedMiB: used, FreeMiB: free, TotalMiB: total, At: now})
}

// store replaces the latest sample with s while holding the write lock.
func (p *Poller) store(s Sample) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.last = s
}

// parseSMI extracts the used, free and total values from nvidia-smi CSV
// output of the form "used, free, total".
//
// Only the first non-blank line is considered: multi-GPU hosts would print
// more, but only single-GPU (mRock) is supported for Phase 1. On success
// errMsg is empty; otherwise the three values are zero and errMsg describes
// the problem.
func parseSMI(out string) (used, free, total int, errMsg string) {
	// Trimming the whole output drops leading blank lines, so whatever
	// precedes the first newline is the first non-blank line.
	row, _, _ := strings.Cut(strings.TrimSpace(out), "\n")
	row = strings.TrimSpace(row)
	if row == "" {
		return 0, 0, 0, "empty nvidia-smi output"
	}

	cols := strings.Split(row, ",")
	if len(cols) != 3 {
		return 0, 0, 0, "unexpected nvidia-smi output: " + row
	}

	var vals [3]int
	for i, col := range cols {
		n, err := strconv.Atoi(strings.TrimSpace(col))
		if err != nil {
			return 0, 0, 0, "non-integer nvidia-smi output: " + row
		}
		vals[i] = n
	}
	return vals[0], vals[1], vals[2], ""
}