Files
mGPUmanager/internal/gpu/gpu.go
mAi ca9bb1773f feat: Schritt 5 — VRAM-pressure eviction + coexistence groups
scheduler.Evicting wraps the Locked scheduler with the design's
LRU-with-coexistence eviction loop. main.go switches to it.

Per-job flow:
1. ensureFits — compare cons.vram_resident_mib + 256 MiB cushion against
   the live nvidia-smi free reading. If insufficient, pick the LRU
   loaded consumer NOT in cons.can_coexist_with, NOT VRAM-managed
   (ollama is excluded from eviction by design — it runs its own LRU),
   and NOT the target itself, then call its unload route. Wait 1s for
   VRAM to actually free. Repeat up to 5 times.
2. ensureLoaded — if the target was previously unloaded, call its
   /api/admin/load (mvoice). Consumers without a load route are
   assumed to cold-start implicitly on first request.
3. inner.Run — global GPU lock + job execution.

State:
- scheduler-local 'loaded' map + scheduler-local 'lastUsed' map. The
  registry's health-derived Loaded field is the source of truth for
  consumers that report it, but we need our own state for the seconds
  between an unload call and the next probe.
- Stats.Evictions counts successful unload calls and surfaces through
  /v1/status.

LRU pick order:
- Scheduler-local lastUsed (set on successful Run completion) takes
  precedence over registry.LastUsed (set on health probes) because the
  former reflects real GPU work, not health chatter. Consumers with a
  zero lastUsed time (never used) are picked for eviction first.

Tests:
- Already-resident target: no eviction calls.
- 13 GiB comfyui evicted to fit 2.8 GiB mvoice → 1 unload + 1 load,
  Stats.Evictions = 1.
- Coexistent consumer (ollama, in mvoice.can_coexist_with) is never
  picked even if it's the LRU candidate; the non-coexistent comfyui
  is unloaded instead.

Race detector clean.

Refs: m/mGPUmanager#1 (Schritt 5).
2026-05-11 13:37:03 +02:00

130 lines
3.2 KiB
Go

// Package gpu polls nvidia-smi for live VRAM usage.
//
// Schritt 5 (step 5) uses this to detect VRAM pressure and trigger LRU
// eviction. On hosts without an NVIDIA GPU (e.g. m's laptop during local dev)
// nvidia-smi fails; the poller then stores an all-zero Sample with Err set
// and logs only at debug level, so the scheduler can still run.
package gpu
import (
"context"
"log/slog"
"os/exec"
"strconv"
"strings"
"sync"
"time"
)
// Sample is one nvidia-smi reading, or the record of a failed attempt to
// take one. When Err is non-empty the three MiB fields are left at zero.
type Sample struct {
// UsedMiB is nvidia-smi's memory.used value, in MiB.
UsedMiB int
// FreeMiB is nvidia-smi's memory.free value, in MiB.
FreeMiB int
// TotalMiB is nvidia-smi's memory.total value, in MiB.
TotalMiB int
// At is the wall-clock time at which the reading was recorded.
At time.Time
// Err is empty for a successful reading; otherwise it holds the command
// error text or a description of why the output could not be parsed.
Err string
}
// Poller periodically samples GPU memory and exposes the latest reading.
// The latest reading is guarded by mu, so Last may be called while Run is
// sampling in another goroutine.
type Poller struct {
// interval is the period of the sampling ticker used by Run.
interval time.Duration
// logger receives debug messages about failed samples; it may be nil, in
// which case nothing is logged.
logger *slog.Logger
// mu guards last.
mu sync.RWMutex
// last is the most recently stored sample (the zero Sample until the first
// one is stored).
last Sample
}
// NewPoller returns a Poller that samples every interval and reports
// failures to logger. A zero or negative interval falls back to two
// seconds. The returned Poller does nothing until Run is called.
func NewPoller(interval time.Duration, logger *slog.Logger) *Poller {
	poller := &Poller{
		interval: 2 * time.Second,
		logger:   logger,
	}
	if interval > 0 {
		poller.interval = interval
	}
	return poller
}
// Run takes one sample immediately and then one per tick of the poller's
// interval. It blocks until ctx is cancelled, so callers normally start it
// in its own goroutine.
func (p *Poller) Run(ctx context.Context) {
	// Sample before starting the ticker so that a reading is available
	// without waiting a full interval.
	p.sampleOnce(ctx)

	ticker := time.NewTicker(p.interval)
	defer ticker.Stop()

	done := ctx.Done()
	for {
		select {
		case <-ticker.C:
			p.sampleOnce(ctx)
		case <-done:
			return
		}
	}
}
// Last returns a copy of the most recently stored sample. Before any sample
// has been stored it returns the zero Sample.
func (p *Poller) Last() Sample {
	p.mu.RLock()
	latest := p.last
	p.mu.RUnlock()
	return latest
}
// SetSampleForTest overwrites the poller's latest sample with a synthetic
// VRAM reading, deriving UsedMiB as totalMiB - freeMiB and stamping it with
// the current time. It exists so tests can drive the scheduler's eviction
// logic without a real GPU or nvidia-smi; production callers should never
// reach this.
func SetSampleForTest(p *Poller, freeMiB, totalMiB int) {
	synthetic := Sample{
		UsedMiB:  totalMiB - freeMiB,
		FreeMiB:  freeMiB,
		TotalMiB: totalMiB,
		At:       time.Now(),
	}
	p.store(synthetic)
}
// sampleOnce runs nvidia-smi once and stores the outcome as the latest
// sample. The command runs under a 2-second timeout derived from ctx.
//
// On success the stored sample carries the used/free/total figures in MiB.
// If the command fails, or its output cannot be parsed, an all-zero sample
// with Err set is stored instead. Both kinds of failure are logged at debug
// level when a logger is configured.
func (p *Poller) sampleOnce(ctx context.Context) {
	cctx, cancel := context.WithTimeout(ctx, 2*time.Second)
	defer cancel()

	// memory.used,memory.free,memory.total in MiB, no units, no header.
	cmd := exec.CommandContext(cctx, "nvidia-smi",
		"--query-gpu=memory.used,memory.free,memory.total",
		"--format=csv,noheader,nounits")
	out, err := cmd.Output()
	now := time.Now()
	if err != nil {
		p.store(Sample{At: now, Err: err.Error()})
		if p.logger != nil {
			p.logger.Debug("nvidia-smi failed", "err", err)
		}
		return
	}

	used, free, total, perr := parseSMI(string(out))
	if perr != "" {
		p.store(Sample{At: now, Err: perr})
		// Log parse failures the same way as command failures; otherwise an
		// unexpected output format leaves no trace outside Sample.Err.
		if p.logger != nil {
			p.logger.Debug("nvidia-smi output not parseable", "err", perr)
		}
		return
	}
	p.store(Sample{UsedMiB: used, FreeMiB: free, TotalMiB: total, At: now})
}
// store replaces the latest sample with s while holding the write lock.
func (p *Poller) store(s Sample) {
	p.mu.Lock()
	defer p.mu.Unlock()
	p.last = s
}
// parseSMI extracts the used, free and total memory figures (MiB) from the
// CSV text that nvidia-smi prints with the noheader,nounits format.
//
// Only the first non-blank line is examined: a multi-GPU host would print
// one line per GPU, but Phase 1 supports a single GPU (mRock) only. On
// success errMsg is empty. Otherwise the three numbers are zero and errMsg
// says what went wrong: wrong field count, a non-integer field, or no data.
func parseSMI(out string) (used, free, total int, errMsg string) {
	for rest := out; rest != ""; {
		var row string
		row, rest, _ = strings.Cut(rest, "\n")
		row = strings.TrimSpace(row)
		if row == "" {
			continue
		}

		fields := strings.Split(row, ",")
		if len(fields) != 3 {
			return 0, 0, 0, "unexpected nvidia-smi output: " + row
		}

		var mib [3]int
		for i, field := range fields {
			n, err := strconv.Atoi(strings.TrimSpace(field))
			if err != nil {
				return 0, 0, 0, "non-integer nvidia-smi output: " + row
			}
			mib[i] = n
		}
		return mib[0], mib[1], mib[2], ""
	}
	return 0, 0, 0, "empty nvidia-smi output"
}