Go daemon listening on :8770 that fronts mvoice (8766), whisper-server
(8178), ollama (11434), comfyui (8188) behind a single /v1 façade.
What this MVP does:
- Loads config/consumers.yaml: routing table, per-consumer URL + health +
paths + vram_resident_mib + can_coexist_with + load/unload routes.
- Background health probe (5s) on every consumer; refuses fast with a
structured 503 if the last probe failed (no Felix-Banholzer-style
silent fallback).
- POST /v1/{tts,stt,llm,image} proxies the request body + Content-Type
to the routed consumer's path and streams the response back.
- GET /audio/* proxies to audio_proxy consumer (wa.sh fetches its WAV
this way).
- GET /v1/status exposes live GPU sample (nvidia-smi every 2s),
per-consumer health/loaded/gpu_resident_mib/active/total_requests,
scheduler stats.
- GET /healthz, GET / — broker liveness.
The Scheduler interface is in place but the implementation is
'Passthrough' — every job runs immediately, no lock, no queue. Schritt 4
replaces it with a serialising mutex; Schritt 5 adds VRAM-pressure
eviction. The interface boundary means server.go stays unchanged.
Out of scope here:
- Schritt 3: wa.sh migration (parallel work in mAi).
- Schritt 4: queue + global GPU lock.
- Schritt 5: nvidia-smi-driven LRU eviction.
Tests: config validation (good/bad), proxy forwards body, audio proxy
streams bytes, unhealthy consumer returns 503, /v1/status JSON shape.
Refs: m/mGPUmanager#1
118 lines
2.8 KiB
Go
118 lines
2.8 KiB
Go
// Package gpu polls nvidia-smi for live VRAM usage.
|
|
//
|
|
// Schritt 5 uses this to detect VRAM pressure and trigger LRU eviction.
|
|
// On hosts without an NVIDIA GPU (e.g. m's laptop during local dev) the
|
|
// poller silently reports zero usage so the scheduler can still run.
|
|
package gpu
|
|
|
|
import (
|
|
"context"
|
|
"log/slog"
|
|
"os/exec"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
)
|
|
|
|
// Sample is the outcome of a single nvidia-smi query.
//
// A failed query (the command could not be run, or its output could not be
// parsed) is recorded with Err set and all memory figures left at zero.
type Sample struct {
	// Memory figures in MiB, taken from nvidia-smi's memory.used,
	// memory.free and memory.total columns.
	UsedMiB, FreeMiB, TotalMiB int

	// At is the wall-clock time at which the query finished.
	At time.Time

	// Err is empty on success; otherwise it carries the failure message.
	Err string
}
|
|
|
|
// Poller periodically samples GPU memory and exposes the latest reading.
|
|
type Poller struct {
|
|
interval time.Duration
|
|
logger *slog.Logger
|
|
mu sync.RWMutex
|
|
last Sample
|
|
}
|
|
|
|
// NewPoller builds a Poller. Pass the desired sampling cadence.
|
|
func NewPoller(interval time.Duration, logger *slog.Logger) *Poller {
|
|
if interval <= 0 {
|
|
interval = 2 * time.Second
|
|
}
|
|
return &Poller{interval: interval, logger: logger}
|
|
}
|
|
|
|
// Run samples in a loop until ctx is cancelled.
|
|
func (p *Poller) Run(ctx context.Context) {
|
|
p.sampleOnce(ctx)
|
|
t := time.NewTicker(p.interval)
|
|
defer t.Stop()
|
|
for {
|
|
select {
|
|
case <-ctx.Done():
|
|
return
|
|
case <-t.C:
|
|
p.sampleOnce(ctx)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Last returns the most recent sample.
|
|
func (p *Poller) Last() Sample {
|
|
p.mu.RLock()
|
|
defer p.mu.RUnlock()
|
|
return p.last
|
|
}
|
|
|
|
func (p *Poller) sampleOnce(ctx context.Context) {
|
|
cctx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
|
defer cancel()
|
|
|
|
// memory.used,memory.free,memory.total in MiB, no units, no header.
|
|
cmd := exec.CommandContext(cctx, "nvidia-smi",
|
|
"--query-gpu=memory.used,memory.free,memory.total",
|
|
"--format=csv,noheader,nounits")
|
|
out, err := cmd.Output()
|
|
now := time.Now()
|
|
if err != nil {
|
|
p.store(Sample{At: now, Err: err.Error()})
|
|
if p.logger != nil {
|
|
p.logger.Debug("nvidia-smi failed", "err", err)
|
|
}
|
|
return
|
|
}
|
|
used, free, total, perr := parseSMI(string(out))
|
|
if perr != "" {
|
|
p.store(Sample{At: now, Err: perr})
|
|
return
|
|
}
|
|
p.store(Sample{UsedMiB: used, FreeMiB: free, TotalMiB: total, At: now})
|
|
}
|
|
|
|
func (p *Poller) store(s Sample) {
|
|
p.mu.Lock()
|
|
p.last = s
|
|
p.mu.Unlock()
|
|
}
|
|
|
|
func parseSMI(out string) (used, free, total int, errMsg string) {
|
|
// Take first non-empty line — multi-GPU hosts would yield more, but we
|
|
// only support single-GPU (mRock) for Phase 1.
|
|
for line := range strings.SplitSeq(out, "\n") {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
continue
|
|
}
|
|
parts := strings.Split(line, ",")
|
|
if len(parts) != 3 {
|
|
return 0, 0, 0, "unexpected nvidia-smi output: " + line
|
|
}
|
|
u, e1 := strconv.Atoi(strings.TrimSpace(parts[0]))
|
|
f, e2 := strconv.Atoi(strings.TrimSpace(parts[1]))
|
|
t, e3 := strconv.Atoi(strings.TrimSpace(parts[2]))
|
|
if e1 != nil || e2 != nil || e3 != nil {
|
|
return 0, 0, 0, "non-integer nvidia-smi output: " + line
|
|
}
|
|
return u, f, t, ""
|
|
}
|
|
return 0, 0, 0, "empty nvidia-smi output"
|
|
}
|