diff --git a/Makefile b/Makefile
index 370aaa3..2453639 100644
--- a/Makefile
+++ b/Makefile
@@ -3,8 +3,8 @@
 # `make build` — compile the Go binary into ./bin/mgpumanager.
 # `make test` — go test ./...
 # `make run` — run locally against ./config/consumers.yaml.
-# `make deploy` — rsync binary + config + systemd unit to mRock,
-# reload systemd, restart the service.
+# `make deploy` — rsync binary + config + user-unit to mRock and
+# (re)start it under `systemctl --user`.
 
 BIN := bin/mgpumanager
 PKG := ./cmd/mgpumanager
@@ -12,6 +12,7 @@ PKG := ./cmd/mgpumanager
 GO ?= go
 HOST ?= mrock
 REMOTE_DIR ?= /home/m/dev/mGPUmanager
+USER_UNIT_DIR ?= /home/m/.config/systemd/user
 
 .PHONY: build test run deploy clean
 
@@ -25,11 +26,13 @@ test:
 run: build
 	./$(BIN) --config config/consumers.yaml --log-level debug
 
+# Deploys to mRock as a user unit (systemd --user); lingering must be enabled on the target: `sudo loginctl enable-linger m`.
+# NOTE(review): hosts set up by the earlier recipe still have the system-wide mgpumanager.service enabled — presumably disable it once by hand.
 deploy: build
 	rsync -a --mkpath $(BIN) $(HOST):$(REMOTE_DIR)/$(BIN)
 	rsync -a --mkpath config/consumers.yaml $(HOST):$(REMOTE_DIR)/config/consumers.yaml
-	rsync -a --mkpath systemd/mgpumanager.service $(HOST):$(REMOTE_DIR)/systemd/mgpumanager.service
-	ssh $(HOST) "sudo cp $(REMOTE_DIR)/systemd/mgpumanager.service /etc/systemd/system/mgpumanager.service && sudo systemctl daemon-reload && sudo systemctl enable mgpumanager.service && sudo systemctl restart mgpumanager.service && sudo systemctl status mgpumanager.service --no-pager -l"
+	rsync -a --mkpath systemd/mgpumanager.service $(HOST):$(USER_UNIT_DIR)/mgpumanager.service
+	ssh $(HOST) "systemctl --user daemon-reload && systemctl --user enable mgpumanager.service && systemctl --user restart mgpumanager.service && systemctl --user status mgpumanager.service --no-pager -l"
 
 clean:
 	rm -rf bin
diff --git a/config/consumers.yaml b/config/consumers.yaml
index 4857c63..1792392 100644
--- a/config/consumers.yaml
+++ b/config/consumers.yaml
@@ -1,4 +1,4 @@
-listen: 127.0.0.1:8770
+listen: 
0.0.0.0:8770
 
 gpu:
   total_mib: 16376 # RTX 4070 Ti SUPER
diff --git a/internal/scheduler/evicting.go b/internal/scheduler/evicting.go
index e0813df..3806709 100644
--- a/internal/scheduler/evicting.go
+++ b/internal/scheduler/evicting.go
@@ -70,13 +70,34 @@ func NewEvicting(cfg *config.Config, reg *registry.Registry, gpuPoller *gpu.Poll
 		lastUsed: make(map[string]time.Time, len(cfg.Consumers)),
 	}
 	for name, cons := range cfg.Consumers {
-		// Self-managed VRAM consumers (ollama) are always 'loaded' from
-		// the scheduler's perspective — we never evict them via HTTP.
-		e.loaded[name] = !cons.VRAMManaged || true
+		e.loaded[name] = initialLoaded(cons)
 	}
 	return e
 }
 
+// initialLoaded reports whether the scheduler should treat a consumer as
+// already resident in VRAM when it starts up.
+//
+// Exactly one configuration is considered lazy (false): a consumer that is
+// not VRAM-managed and exposes an unload route without a load route
+// (comfyui). Its model only becomes resident on first use, so its VRAM cost
+// is not accounted for until then. Every other shape reports true:
+//
+//   - VRAM-managed (ollama): never tracked or evicted by the scheduler.
+//   - Load and unload routes (mvoice): controllable both ways and typically
+//     preloaded by its own service startup.
+//   - Neither route, with or without a systemd_unit (whisper-server):
+//     assumed to load its model at process start.
+//
+// If comfyui were believed loaded at startup, ensureFits would short-circuit
+// on the first /v1/image request and never trigger eviction, which is what
+// the eviction smoke test relies on. (m/mGPUmanager#1 live deploy.)
+func initialLoaded(cons *config.Consumer) bool {
+	// Lazy means: can be told to unload, but cannot be told to load.
+	lazy := cons.Load == nil && cons.Unload != nil
+	return cons.VRAMManaged || !lazy
+}
+
 // Run is the public Scheduler interface: ensure room + load + serialise.
func (e *Evicting) Run(ctx context.Context, consumer string, fn Job) error {
 	if err := e.ensureFits(ctx, consumer); err != nil {
diff --git a/internal/scheduler/evicting_test.go b/internal/scheduler/evicting_test.go
index 153e01a..01dc839 100644
--- a/internal/scheduler/evicting_test.go
+++ b/internal/scheduler/evicting_test.go
@@ -118,6 +118,40 @@ func buildCfg(mvoiceURL, comfyURL string) *config.Config {
 	}
 }
 
+// TestInitialLoadedHeuristic locks in the startup residency rule: only a
+// consumer that has an Unload route but no Load route starts out as not
+// loaded; every other shape is assumed resident.
+func TestInitialLoadedHeuristic(t *testing.T) {
+	type scenario struct {
+		label    string
+		consumer *config.Consumer
+		expected bool
+	}
+
+	unloadOnly := &config.Consumer{Unload: &config.Route{Path: "/api/free"}}
+	bothRoutes := &config.Consumer{
+		Load:   &config.Route{Path: "/load"},
+		Unload: &config.Route{Path: "/unload"},
+	}
+
+	scenarios := []scenario{
+		{label: "vram_managed (ollama)", consumer: &config.Consumer{VRAMManaged: true}, expected: true},
+		{label: "load+unload (mvoice)", consumer: bothRoutes, expected: true},
+		{label: "unload only — lazy (comfyui)", consumer: unloadOnly, expected: false},
+		{label: "systemd unit only (whisper-server)", consumer: &config.Consumer{SystemdUnit: "whisper-server.service"}, expected: true},
+		{label: "empty consumer", consumer: &config.Consumer{}, expected: true},
+	}
+
+	for _, sc := range scenarios {
+		t.Run(sc.label, func(t *testing.T) {
+			got := initialLoaded(sc.consumer)
+			if got != sc.expected {
+				t.Errorf("initialLoaded = %v, want %v", got, sc.expected)
+			}
+		})
+	}
+}
+
 // TestEvictingSkipsWhenAlreadyResident verifies the no-op fast path: a job
 // for an already-loaded consumer with plenty of free VRAM runs without any
 // unload call.
diff --git a/systemd/mgpumanager.service b/systemd/mgpumanager.service
index 2a07b53..d9b6c47 100644
--- a/systemd/mgpumanager.service
+++ b/systemd/mgpumanager.service
@@ -1,30 +1,16 @@
 [Unit]
-Description=mGPUmanager — GPU-Inference-Control-Plane for mRock
+Description=mGPUmanager — GPU-Inference-Control-Plane
 Documentation=https://mgit.msbls.de/m/mGPUmanager
-After=network-online.target
-Wants=network-online.target
+# No After=/Wants= on network targets: those are system-manager units that a
+# --user unit cannot order against, so declaring them here has no effect.
 
 [Service]
 Type=simple
-User=m
-Group=m
-WorkingDirectory=/home/m/dev/mGPUmanager
-ExecStart=/home/m/dev/mGPUmanager/bin/mgpumanager \
-    --config /home/m/dev/mGPUmanager/config/consumers.yaml \
-    --log-level info
+WorkingDirectory=%h/dev/mGPUmanager
+ExecStart=%h/dev/mGPUmanager/bin/mgpumanager --config %h/dev/mGPUmanager/config/consumers.yaml --log-level info
 Restart=on-failure
-RestartSec=3
+RestartSec=5
 TimeoutStopSec=10
 
-# Hardening — broker has no need for elevated capabilities.
-NoNewPrivileges=true
-PrivateTmp=true
-ProtectSystem=strict
-ProtectHome=read-only
-ReadWritePaths=/home/m/dev/mGPUmanager
-
-# The broker only proxies; nvidia-smi is the only GPU-touching call.
-PrivateDevices=false
-
 [Install]
-WantedBy=multi-user.target
+WantedBy=default.target