From 468317e395ffe2235eb3276ca9b53756060fa43b Mon Sep 17 00:00:00 2001 From: mAi Date: Fri, 15 May 2026 16:54:11 +0200 Subject: [PATCH] fix(scheduler): mark lazy consumers (Unload but no Load) as not-loaded at startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Live deploy on mRock surfaced a Schritt 5 bug: comfyui was always treated as preloaded at scheduler startup, which made ensureFits() short-circuit on the very first /v1/image request — exactly the scenario eviction is supposed to handle. mvoice was never picked as a victim, so ComfyUI then OOM'd loading FLUX on top of the still-resident mvoice. Fix: replace the blanket 'every consumer starts loaded' init with a heuristic — initialLoaded(cons): - VRAMManaged (ollama): true. We never track/evict it; the consumer runs its own LRU. - Load+Unload both present (mvoice): true. Designed to be controllable; typically preloads in its own lifespan. - Unload only, no Load (comfyui): false. Lazy — FLUX isn't resident until the first /prompt, so we shouldn't bill its 13 GiB against the GPU budget until then. - SystemdUnit only (whisper-server): true. Always-on, model loaded at process start. - Empty: true. Safe fallback. Verified live on mRock (2026-05-15): Before /v1/image: nvidia-smi 8963 MiB used; mvoice gpu_resident_mib 2345 POST /v1/image: HTTP 400 from upstream (empty workflow), broker did trigger eviction before forwarding After: nvidia-smi 6547 MiB used; mvoice gpu_resident_mib 9 (~CUDA context only); scheduler.evictions = 2 POST /v1/tts: audio_url returned, tts_ms 670, audio 3.5 s After reload: nvidia-smi 8943 MiB used; mvoice gpu_resident_mib 2917 Test: TestInitialLoadedHeuristic pins all five cases down so this doesn't regress when someone adds another consumer type. Refs: m/mGPUmanager#1 (live deploy). 
--- internal/scheduler/evicting.go | 31 ++++++++++++++++++++++++++--- internal/scheduler/evicting_test.go | 31 +++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/internal/scheduler/evicting.go b/internal/scheduler/evicting.go index e0813df..3806709 100644 --- a/internal/scheduler/evicting.go +++ b/internal/scheduler/evicting.go @@ -70,13 +70,38 @@ func NewEvicting(cfg *config.Config, reg *registry.Registry, gpuPoller *gpu.Poll lastUsed: make(map[string]time.Time, len(cfg.Consumers)), } for name, cons := range cfg.Consumers { - // Self-managed VRAM consumers (ollama) are always 'loaded' from - // the scheduler's perspective — we never evict them via HTTP. - e.loaded[name] = !cons.VRAMManaged || true + e.loaded[name] = initialLoaded(cons) } return e } +// initialLoaded picks the believed-loaded state for a consumer at scheduler +// startup. The rule: +// +// - VRAM-managed (ollama): true — we never track or evict it. +// - Has a load route AND an unload route (mvoice): true — the consumer +// is set up to be controllable in both directions, and typically +// preloads on its own systemd-managed startup. +// - Has only an unload route, no load route (comfyui): false — lazy. +// FLUX isn't resident until the first /prompt; until that happens we +// don't account for its VRAM cost. +// - Has a systemd_unit but no HTTP routes (whisper-server): true — these +// are always-on services that load their model at process start. +// - Neither: true — fallback, assume it's there if the consumer is up. +// +// Getting this right matters for the eviction smoke test: if comfyui were +// believed loaded at startup, ensureFits would short-circuit on the first +// /v1/image request and never trigger eviction. (m/mGPUmanager#1 live deploy.) 
+func initialLoaded(cons *config.Consumer) bool { + if cons.VRAMManaged { + return true + } + if cons.Load == nil && cons.Unload != nil { + return false + } + return true +} + // Run is the public Scheduler interface: ensure room + load + serialise. func (e *Evicting) Run(ctx context.Context, consumer string, fn Job) error { if err := e.ensureFits(ctx, consumer); err != nil { diff --git a/internal/scheduler/evicting_test.go b/internal/scheduler/evicting_test.go index 153e01a..01dc839 100644 --- a/internal/scheduler/evicting_test.go +++ b/internal/scheduler/evicting_test.go @@ -118,6 +118,37 @@ func buildCfg(mvoiceURL, comfyURL string) *config.Config { } } +// TestInitialLoadedHeuristic pins the comfyui-isn't-preloaded rule down: +// a consumer with Unload but no Load is lazy; everything else is assumed +// resident at startup. +func TestInitialLoadedHeuristic(t *testing.T) { + cases := []struct { + name string + cons *config.Consumer + want bool + }{ + {"vram_managed (ollama)", &config.Consumer{VRAMManaged: true}, true}, + {"load+unload (mvoice)", &config.Consumer{ + Load: &config.Route{Path: "/load"}, + Unload: &config.Route{Path: "/unload"}, + }, true}, + {"unload only — lazy (comfyui)", &config.Consumer{ + Unload: &config.Route{Path: "/api/free"}, + }, false}, + {"systemd unit only (whisper-server)", &config.Consumer{ + SystemdUnit: "whisper-server.service", + }, true}, + {"empty consumer", &config.Consumer{}, true}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + if got := initialLoaded(c.cons); got != c.want { + t.Errorf("initialLoaded = %v, want %v", got, c.want) + } + }) + } +} + // TestEvictingSkipsWhenAlreadyResident verifies the no-op fast path: a job // for an already-loaded consumer with plenty of free VRAM runs without any // unload call.