scheduler.Evicting wraps the Locked scheduler with the design's LRU-with-coexistence eviction loop. main.go switches to it.

Per-job flow:
  1. ensureFits — compare cons.vram_resident_mib + 256 MiB cushion against the live nvidia-smi free reading. If insufficient, pick the LRU loaded consumer NOT in cons.can_coexist_with, NOT VRAM-managed (ollama is excluded from eviction by design — it runs its own LRU), and NOT the target itself, then call its unload route. Wait 1s for VRAM to actually free. Repeat up to 5 times.
  2. ensureLoaded — if the target was previously unloaded, call its /api/admin/load (mvoice). Consumers without a load route are assumed to cold-start implicitly on first request.
  3. inner.Run — global GPU lock + job execution.

State:
  - scheduler-local 'loaded' map + scheduler-local 'lastUsed' map. The registry's health-derived Loaded field is the source of truth for consumers that report it, but we need our own state for the seconds between an unload call and the next probe.
  - Stats.Evictions counts successful unload calls and surfaces through /v1/status.

LRU pick order:
  - Scheduler-local lastUsed (set on successful Run completion) takes precedence over registry.LastUsed (set on health probes) because the former reflects real GPU work, not health chatter. Zero-time consumers (never used) lose first.

Tests:
  - Already-resident target: no eviction calls.
  - 13 GiB comfyui evicted to fit 2.8 GiB mvoice → 1 unload + 1 load, Stats.Evictions = 1.
  - Coexistent consumer (ollama, in mvoice.can_coexist_with) is never picked even if it's the LRU candidate; the non-coexistent comfyui is unloaded instead.

Race detector clean.

Refs: m/mGPUmanager#1 (Schritt 5).
130 lines
3.2 KiB
Go
130 lines
3.2 KiB
Go
// Package gpu polls nvidia-smi for live VRAM usage.
//
// Step 5 ("Schritt 5" in m/mGPUmanager#1) uses this to detect VRAM pressure
// and trigger LRU eviction. On hosts without an NVIDIA GPU (e.g. m's laptop
// during local dev) the poller silently reports zero usage so the scheduler
// can still run.
|
|
package gpu
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"os/exec"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// Sample is a single GPU memory reading.
//
// A reading produced by a failed nvidia-smi call or an unparseable reply
// carries only At and Err; its memory figures stay at zero.
type Sample struct {
	// Memory figures as reported by nvidia-smi, in MiB.
	UsedMiB, FreeMiB, TotalMiB int

	// At is the moment the reading was recorded.
	At time.Time
	// Err is empty for a good reading, otherwise the failure message.
	Err string
}
|
|
|
|
// Poller periodically samples GPU memory and exposes the latest reading.
|
|
type Poller struct {
|
|
interval time.Duration
|
|
logger *slog.Logger
|
|
mu sync.RWMutex
|
|
last Sample
|
|
}
|
|
|
|
// NewPoller builds a Poller. Pass the desired sampling cadence.
|
|
func NewPoller(interval time.Duration, logger *slog.Logger) *Poller {
|
|
if interval <= 0 {
|
|
interval = 2 * time.Second
|
|
}
|
|
return &Poller{interval: interval, logger: logger}
|
|
}
|
|
|
|
// Run samples in a loop until ctx is cancelled.
|
|
func (p *Poller) Run(ctx context.Context) {
|
|
p.sampleOnce(ctx)
|
|
t := time.NewTicker(p.interval)
|
|
defer t.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-t.C:
|
|
p.sampleOnce(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Last returns the most recent sample.
|
|
func (p *Poller) Last() Sample {
|
|
p.mu.RLock()
|
|
defer p.mu.RUnlock()
|
|
return p.last
|
|
}
|
|
|
|
// SetSampleForTest injects a synthetic VRAM reading. Used from tests that
|
|
// must drive the scheduler's eviction logic without a real GPU or
|
|
// nvidia-smi. Production callers should never reach this.
|
|
func SetSampleForTest(p *Poller, freeMiB, totalMiB int) {
|
|
p.store(Sample{
|
|
FreeMiB: freeMiB,
|
|
TotalMiB: totalMiB,
|
|
UsedMiB: totalMiB - freeMiB,
|
|
At: time.Now(),
|
|
})
|
|
}
|
|
|
|
func (p *Poller) sampleOnce(ctx context.Context) {
|
|
cctx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
|
defer cancel()
|
|
|
|
// memory.used,memory.free,memory.total in MiB, no units, no header.
|
|
cmd := exec.CommandContext(cctx, "nvidia-smi",
|
|
"--query-gpu=memory.used,memory.free,memory.total",
|
|
"--format=csv,noheader,nounits")
|
|
out, err := cmd.Output()
|
|
now := time.Now()
|
|
if err != nil {
|
|
p.store(Sample{At: now, Err: err.Error()})
|
|
if p.logger != nil {
|
|
p.logger.Debug("nvidia-smi failed", "err", err)
|
|
}
|
|
return
|
|
}
|
|
used, free, total, perr := parseSMI(string(out))
|
|
if perr != "" {
|
|
p.store(Sample{At: now, Err: perr})
|
|
return
|
|
}
|
|
p.store(Sample{UsedMiB: used, FreeMiB: free, TotalMiB: total, At: now})
|
|
}
|
|
|
|
func (p *Poller) store(s Sample) {
|
|
p.mu.Lock()
|
|
p.last = s
|
|
p.mu.Unlock()
|
|
}
|
|
|
|
func parseSMI(out string) (used, free, total int, errMsg string) {
|
|
// Take first non-empty line — multi-GPU hosts would yield more, but we
|
|
// only support single-GPU (mRock) for Phase 1.
|
|
for line := range strings.SplitSeq(out, "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
parts := strings.Split(line, ",")
|
|
if len(parts) != 3 {
|
|
return 0, 0, 0, "unexpected nvidia-smi output: " + line
|
|
}
|
|
u, e1 := strconv.Atoi(strings.TrimSpace(parts[0]))
|
|
f, e2 := strconv.Atoi(strings.TrimSpace(parts[1]))
|
|
t, e3 := strconv.Atoi(strings.TrimSpace(parts[2]))
|
|
if e1 != nil || e2 != nil || e3 != nil {
|
|
return 0, 0, 0, "non-integer nvidia-smi output: " + line
|
|
}
|
|
return u, f, t, ""
|
|
}
|
|
return 0, 0, 0, "empty nvidia-smi output"
|
|
}
|