scheduler.Evicting wraps the Locked scheduler with the design's LRU-with-coexistence eviction loop. main.go switches to it. Per-job flow: 1. ensureFits — compare cons.vram_resident_mib + 256 MiB cushion against the live nvidia-smi free reading. If insufficient, pick the LRU loaded consumer NOT in cons.can_coexist_with, NOT VRAM-managed (ollama is excluded from eviction by design — it runs its own LRU), and NOT the target itself, then call its unload route. Wait 1s for VRAM to actually free. Repeat up to 5 times. 2. ensureLoaded — if the target was previously unloaded, call its /api/admin/load (mvoice). Consumers without a load route are assumed to cold-start implicitly on first request. 3. inner.Run — global GPU lock + job execution. State: - scheduler-local 'loaded' map + scheduler-local 'lastUsed' map. The registry's health-derived Loaded field is the source of truth for consumers that report it, but we need our own state for the seconds between an unload call and the next probe. - Stats.Evictions counts successful unload calls and surfaces through /v1/status. LRU pick order: - Scheduler-local lastUsed (set on successful Run completion) takes precedence over registry.LastUsed (set on health probes) because the former reflects real GPU work, not health chatter. Zero-time consumers (never used) lose first. Tests: - Already-resident target: no eviction calls. - 13 GiB comfyui evicted to fit 2.8 GiB mvoice → 1 unload + 1 load, Stats.Evictions = 1. - Coexistent consumer (ollama, in mvoice.can_coexist_with) is never picked even if it's the LRU candidate; the non-coexistent comfyui is unloaded instead. Race detector clean. Refs: m/mGPUmanager#1 (Schritt 5).
107 lines
3.0 KiB
Go
107 lines
3.0 KiB
Go
// mgpumanager is the GPU-Inference-Control-Plane for mRock.
|
|
//
|
|
// One Go binary that:
|
|
// 1. Loads consumers.yaml.
|
|
// 2. Probes every consumer's /health on a 5s cadence.
|
|
// 3. Polls nvidia-smi every 2s for live VRAM usage (used by Schritt 5
|
|
// eviction).
|
|
// 4. Exposes /v1/{tts,stt,llm,image} as a thin proxy + /v1/status for
|
|
// observability.
|
|
//  5. Funnels every job through the Scheduler (global GPU lock plus
//     VRAM-pressure eviction, Schritt 4-5).
|
|
//
|
|
// All client routing happens through this daemon — no consumer is reached
|
|
// directly any more. wa.sh, ImaGen, m-CLI and Furbotto-Voice will all speak
|
|
// to :8770/v1/*.
|
|
package main
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"flag"
|
|
"log/slog"
|
|
"net/http"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
"mgit.msbls.de/m/mGPUmanager/internal/config"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/gpu"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/registry"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/scheduler"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/server"
|
|
)
|
|
|
|
func main() {
|
|
configPath := flag.String("config", "config/consumers.yaml", "path to consumers.yaml")
|
|
listenOverride := flag.String("listen", "", "override listen address from config")
|
|
logLevel := flag.String("log-level", "info", "log level: debug|info|warn|error")
|
|
flag.Parse()
|
|
|
|
logger := newLogger(*logLevel)
|
|
|
|
cfg, err := config.Load(*configPath)
|
|
if err != nil {
|
|
logger.Error("config load failed", "err", err, "path", *configPath)
|
|
os.Exit(1)
|
|
}
|
|
if *listenOverride != "" {
|
|
cfg.Listen = *listenOverride
|
|
}
|
|
|
|
logger.Info("starting mGPUmanager",
|
|
"listen", cfg.Listen,
|
|
"consumers", len(cfg.Consumers),
|
|
"poll_interval", cfg.GPU.PollInterval(),
|
|
)
|
|
|
|
ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
|
defer cancel()
|
|
|
|
reg := registry.New(cfg, logger.With("component", "registry"))
|
|
gpuPoller := gpu.NewPoller(cfg.GPU.PollInterval(), logger.With("component", "gpu"))
|
|
// Schritt 5: VRAM-pressure-aware scheduler. Wraps the global GPU lock
|
|
// with eviction logic — see internal/scheduler/evicting.go.
|
|
sched := scheduler.NewEvicting(cfg, reg, gpuPoller,
|
|
logger.With("component", "scheduler"))
|
|
|
|
go reg.Run(ctx)
|
|
go gpuPoller.Run(ctx)
|
|
|
|
srv := server.New(cfg, reg, gpuPoller, sched, logger.With("component", "server"))
|
|
httpSrv := &http.Server{
|
|
Addr: cfg.Listen,
|
|
Handler: srv.Handler(),
|
|
ReadHeaderTimeout: 10 * time.Second,
|
|
}
|
|
|
|
go func() {
|
|
<-ctx.Done()
|
|
shutCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
|
defer cancel()
|
|
_ = httpSrv.Shutdown(shutCtx)
|
|
}()
|
|
|
|
if err := httpSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
|
|
logger.Error("listen failed", "err", err)
|
|
os.Exit(1)
|
|
}
|
|
logger.Info("shutdown complete")
|
|
}
|
|
|
|
// newLogger returns a JSON slog.Logger that writes to stderr and drops
// records below the threshold named by level.
//
// Recognised names are "debug", "warn" and "error" (exact, lower-case match).
// Every other value, including "info" and the empty string, selects
// slog.LevelInfo.
func newLogger(level string) *slog.Logger {
	thresholds := map[string]slog.Level{
		"debug": slog.LevelDebug,
		"warn":  slog.LevelWarn,
		"error": slog.LevelError,
	}

	threshold, known := thresholds[level]
	if !known {
		// Unknown names are not an error; they fall back to info.
		threshold = slog.LevelInfo
	}

	opts := slog.HandlerOptions{Level: threshold}
	return slog.New(slog.NewJSONHandler(os.Stderr, &opts))
}
|