From 468317e395ffe2235eb3276ca9b53756060fa43b Mon Sep 17 00:00:00 2001
From: mAi <mai@flexsiebels.de>
Date: Fri, 15 May 2026 16:54:11 +0200
Subject: [PATCH] fix(scheduler): mark lazy consumers (Unload but no Load) as
 not-loaded at startup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Live deploy on mRock surfaced a Schritt 5 bug: comfyui was always
treated as preloaded at scheduler startup, which made ensureFits()
short-circuit on the very first /v1/image request — exactly the
scenario eviction is supposed to handle. mvoice was never picked as
a victim, so ComfyUI then OOM'd loading FLUX on top of the still-resident
mvoice.

Fix: replace the blanket 'every consumer starts loaded' init with a
heuristic — initialLoaded(cons):

  - VRAMManaged (ollama): true. We never track/evict it; the consumer
    runs its own LRU.
  - Load+Unload both present (mvoice): true. Designed to be controllable;
    typically preloads in its own lifespan.
  - Unload only, no Load (comfyui): false. Lazy — FLUX isn't resident
    until the first /prompt, so we shouldn't bill its 13 GiB against the
    GPU budget until then.
  - SystemdUnit only (whisper-server): true. Always-on, model loaded at
    process start.
  - Empty: true. Safe fallback.

Verified live on mRock (2026-05-15):

  Before /v1/image:  nvidia-smi 8963 MiB used; mvoice gpu_resident_mib 2345
  POST /v1/image:    HTTP 400 from upstream (empty workflow), broker did
                     trigger eviction before forwarding
  After:             nvidia-smi 6547 MiB used; mvoice gpu_resident_mib 9
                     (~CUDA context only); scheduler.evictions = 2
  POST /v1/tts:      audio_url returned, tts_ms 670, audio 3.5 s
  After reload:      nvidia-smi 8943 MiB used; mvoice gpu_resident_mib 2917

Test: TestInitialLoadedHeuristic pins all five cases above down so this
doesn't regress when someone adds a fifth consumer type.

Refs: m/mGPUmanager#1 (live deploy).
---
 internal/scheduler/evicting.go      | 31 ++++++++++++++++++++++++++---
 internal/scheduler/evicting_test.go | 31 +++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 3 deletions(-)

diff --git a/internal/scheduler/evicting.go b/internal/scheduler/evicting.go
index e0813df..3806709 100644
--- a/internal/scheduler/evicting.go
+++ b/internal/scheduler/evicting.go
@@ -70,13 +70,38 @@ func NewEvicting(cfg *config.Config, reg *registry.Registry, gpuPoller *gpu.Poll
 		lastUsed: make(map[string]time.Time, len(cfg.Consumers)),
 	}
 	for name, cons := range cfg.Consumers {
-		// Self-managed VRAM consumers (ollama) are always 'loaded' from
-		// the scheduler's perspective — we never evict them via HTTP.
-		e.loaded[name] = !cons.VRAMManaged || true
+		e.loaded[name] = initialLoaded(cons)
 	}
 	return e
 }
 
+// initialLoaded picks the believed-loaded state for a consumer at scheduler
+// startup. The rule:
+//
+//   - VRAM-managed (ollama): true — we never track or evict it.
+//   - Has a load route AND an unload route (mvoice): true — the consumer
+//     is set up to be controllable in both directions, and typically
+//     preloads on its own systemd-managed startup.
+//   - Has only an unload route, no load route (comfyui): false — lazy.
+//     FLUX isn't resident until the first /prompt; until that happens we
+//     don't account for its VRAM cost.
+//   - Has a systemd_unit but no HTTP routes (whisper-server): true — these
+//     are always-on services that load their model at process start.
+//   - Neither: true — fallback, assume it's there if the consumer is up.
+//
+// Getting this right matters for the eviction smoke test: if comfyui were
+// believed loaded at startup, ensureFits would short-circuit on the first
+// /v1/image request and never trigger eviction. (m/mGPUmanager#1 live deploy.)
+func initialLoaded(cons *config.Consumer) bool {
+	if cons.VRAMManaged {
+		return true
+	}
+	if cons.Load == nil && cons.Unload != nil {
+		return false
+	}
+	return true
+}
+
 // Run is the public Scheduler interface: ensure room + load + serialise.
 func (e *Evicting) Run(ctx context.Context, consumer string, fn Job) error {
 	if err := e.ensureFits(ctx, consumer); err != nil {
diff --git a/internal/scheduler/evicting_test.go b/internal/scheduler/evicting_test.go
index 153e01a..01dc839 100644
--- a/internal/scheduler/evicting_test.go
+++ b/internal/scheduler/evicting_test.go
@@ -118,6 +118,37 @@ func buildCfg(mvoiceURL, comfyURL string) *config.Config {
 	}
 }
 
+// TestInitialLoadedHeuristic pins the comfyui-isn't-preloaded rule down:
+// a consumer with Unload but no Load is lazy; everything else is assumed
+// resident at startup.
+func TestInitialLoadedHeuristic(t *testing.T) {
+	cases := []struct {
+		name string
+		cons *config.Consumer
+		want bool
+	}{
+		{"vram_managed (ollama)", &config.Consumer{VRAMManaged: true}, true},
+		{"load+unload (mvoice)", &config.Consumer{
+			Load:   &config.Route{Path: "/load"},
+			Unload: &config.Route{Path: "/unload"},
+		}, true},
+		{"unload only — lazy (comfyui)", &config.Consumer{
+			Unload: &config.Route{Path: "/api/free"},
+		}, false},
+		{"systemd unit only (whisper-server)", &config.Consumer{
+			SystemdUnit: "whisper-server.service",
+		}, true},
+		{"empty consumer", &config.Consumer{}, true},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if got := initialLoaded(c.cons); got != c.want {
+				t.Errorf("initialLoaded = %v, want %v", got, c.want)
+			}
+		})
+	}
+}
+
 // TestEvictingSkipsWhenAlreadyResident verifies the no-op fast path: a job
 // for an already-loaded consumer with plenty of free VRAM runs without any
 // unload call.