Go daemon listening on :8770 that fronts mvoice (8766), whisper-server
(8178), ollama (11434), comfyui (8188) behind a single /v1 façade.
What this MVP does:
- Loads config/consumers.yaml: routing table, per-consumer URL + health +
paths + vram_resident_mib + can_coexist_with + load/unload routes.
- Background health probe (5s) on every consumer; refuses fast with a
structured 503 if the last probe failed (no Felix-Banholzer-style
silent fallback).
- POST /v1/{tts,stt,llm,image} proxies the request body + Content-Type
to the routed consumer's path and streams the response back.
- GET /audio/* proxies to audio_proxy consumer (wa.sh fetches its WAV
this way).
- GET /v1/status exposes a live GPU sample (nvidia-smi every 2s),
per-consumer health/loaded/gpu_resident_mib/active/total_requests, and
scheduler stats.
- GET /healthz, GET / — broker liveness.
The Scheduler interface is in place but the implementation is
'Passthrough' — every job runs immediately, no lock, no queue. Schritt 4
replaces it with a serialising mutex; Schritt 5 adds VRAM-pressure
eviction. The interface boundary means server.go stays unchanged.
Out of scope here:
- Schritt 3: wa.sh migration (parallel work in mAi).
- Schritt 4: queue + global GPU lock.
- Schritt 5: nvidia-smi-driven LRU eviction.
Tests: config validation (good/bad), proxy forwards body, audio proxy
streams bytes, unhealthy consumer returns 503, /v1/status JSON shape.
Refs: m/mGPUmanager#1
166 lines
5.0 KiB
Go
166 lines
5.0 KiB
Go
// Package config loads the mGPUmanager consumer registry from YAML.
//
// The consumers.yaml file declares every GPU consumer (mvoice, whisper-server,
// ollama, comfyui), how to route the four logical endpoint kinds (tts, stt,
// llm, image) to a consumer, how to probe its health, and how to load/unload
// it from VRAM. The scheduler (Schritt 4–5) reads vram_resident_mib +
// can_coexist_with to drive eviction.
package config
|
||
|
||
import (
|
||
"fmt"
|
||
"net/url"
|
||
"os"
|
||
"strings"
|
||
"time"
|
||
|
||
"gopkg.in/yaml.v3"
|
||
)
|
||
|
||
// EndpointKind enumerates the four logical broker endpoints exposed on /v1/*.
type EndpointKind string

// The endpoint kinds known to the broker. The string values are the keys used
// by the routing table and by each consumer's paths map in consumers.yaml.
const (
	KindTTS   EndpointKind = "tts"
	KindSTT   EndpointKind = "stt"
	KindLLM   EndpointKind = "llm"
	KindImage EndpointKind = "image"
)

// AllKinds is the canonical ordering used by /v1/status and tests.
var AllKinds = []EndpointKind{KindTTS, KindSTT, KindLLM, KindImage}
|
||
|
||
// Route describes an HTTP method + path on a consumer.
type Route struct {
	// Method is the HTTP verb for the route.
	Method string `yaml:"method"`
	// Path is the request path on the consumer.
	Path string `yaml:"path"`
	// Body is an optional fixed request body for admin operations
	// (e.g. ComfyUI's /api/free expects {"unload_models":true,"free_memory":true}).
	Body string `yaml:"body,omitempty"`
}
|
||
|
||
// Consumer describes a single GPU consumer behind the broker.
|
||
type Consumer struct {
|
||
URL string `yaml:"url"`
|
||
Health Route `yaml:"health"`
|
||
Paths map[EndpointKind]Route `yaml:"paths"`
|
||
VRAMResidentMiB int `yaml:"vram_resident_mib"`
|
||
VRAMManaged bool `yaml:"vram_managed"` // self-managed LRU (ollama)
|
||
Load *Route `yaml:"load,omitempty"`
|
||
Unload *Route `yaml:"unload,omitempty"`
|
||
SystemdUnit string `yaml:"systemd_unit,omitempty"` // fallback unload (whisper-server)
|
||
CanCoexistWith []string `yaml:"can_coexist_with"`
|
||
Priority int `yaml:"priority"`
|
||
MaxConcurrency int `yaml:"max_concurrency"`
|
||
}
|
||
|
||
// GPU describes the host's GPU envelope.
type GPU struct {
	// TotalMiB is the GPU's total VRAM in MiB.
	TotalMiB int `yaml:"total_mib"`
	// ReservedMiB is the system-reserved headroom in MiB that AvailableMiB
	// subtracts from TotalMiB.
	ReservedMiB int `yaml:"reserved_mib"`
	// PollIntervalSeconds is the GPU polling cadence in seconds. Values <= 0
	// select the default used by PollInterval.
	PollIntervalSeconds int `yaml:"poll_interval_seconds"`
}

// PollInterval returns the GPU polling cadence as a Duration. Defaults to 2s
// when PollIntervalSeconds is zero or negative.
func (g GPU) PollInterval() time.Duration {
	if g.PollIntervalSeconds > 0 {
		return time.Duration(g.PollIntervalSeconds) * time.Second
	}
	return 2 * time.Second
}

// AvailableMiB returns total VRAM minus the system-reserved headroom.
//
// The result is always within [0, TotalMiB]: a non-positive TotalMiB or a
// reserve that meets or exceeds the total yields 0, and a negative
// ReservedMiB is treated as no reserve so the result never exceeds TotalMiB.
func (g GPU) AvailableMiB() int {
	if g.TotalMiB <= 0 {
		return 0
	}
	switch {
	case g.ReservedMiB <= 0:
		// A negative reserve is a config mistake; do not report more VRAM
		// than the card has.
		return g.TotalMiB
	case g.ReservedMiB >= g.TotalMiB:
		return 0
	}
	return g.TotalMiB - g.ReservedMiB
}
|
||
|
||
// Config is the parsed mGPUmanager configuration.
|
||
type Config struct {
|
||
Listen string `yaml:"listen"`
|
||
GPU GPU `yaml:"gpu"`
|
||
Routing map[EndpointKind]string `yaml:"routing"`
|
||
AudioProxy string `yaml:"audio_proxy"`
|
||
Consumers map[string]*Consumer `yaml:"consumers"`
|
||
}
|
||
|
||
// Load reads and validates a consumers.yaml file from disk.
|
||
func Load(path string) (*Config, error) {
|
||
b, err := os.ReadFile(path)
|
||
if err != nil {
|
||
return nil, fmt.Errorf("read %s: %w", path, err)
|
||
}
|
||
var cfg Config
|
||
if err := yaml.Unmarshal(b, &cfg); err != nil {
|
||
return nil, fmt.Errorf("parse %s: %w", path, err)
|
||
}
|
||
if err := cfg.validate(); err != nil {
|
||
return nil, fmt.Errorf("validate %s: %w", path, err)
|
||
}
|
||
return &cfg, nil
|
||
}
|
||
|
||
func (c *Config) validate() error {
|
||
if c.Listen == "" {
|
||
c.Listen = "127.0.0.1:8770"
|
||
}
|
||
if len(c.Consumers) == 0 {
|
||
return fmt.Errorf("no consumers declared")
|
||
}
|
||
for name, cons := range c.Consumers {
|
||
if cons.URL == "" {
|
||
return fmt.Errorf("consumer %q: url is required", name)
|
||
}
|
||
if _, err := url.Parse(cons.URL); err != nil {
|
||
return fmt.Errorf("consumer %q: invalid url %q: %w", name, cons.URL, err)
|
||
}
|
||
if cons.Health.Path == "" {
|
||
return fmt.Errorf("consumer %q: health.path is required", name)
|
||
}
|
||
if cons.Health.Method == "" {
|
||
cons.Health.Method = "GET"
|
||
}
|
||
cons.Health.Method = strings.ToUpper(cons.Health.Method)
|
||
for kind, route := range cons.Paths {
|
||
if route.Path == "" {
|
||
return fmt.Errorf("consumer %q: paths.%s.path is required", name, kind)
|
||
}
|
||
if route.Method == "" {
|
||
route.Method = "POST"
|
||
}
|
||
route.Method = strings.ToUpper(route.Method)
|
||
cons.Paths[kind] = route
|
||
}
|
||
if cons.MaxConcurrency <= 0 {
|
||
cons.MaxConcurrency = 1
|
||
}
|
||
}
|
||
for kind, consName := range c.Routing {
|
||
if _, ok := c.Consumers[consName]; !ok {
|
||
return fmt.Errorf("routing.%s: unknown consumer %q", kind, consName)
|
||
}
|
||
}
|
||
if c.AudioProxy != "" {
|
||
if _, ok := c.Consumers[c.AudioProxy]; !ok {
|
||
return fmt.Errorf("audio_proxy: unknown consumer %q", c.AudioProxy)
|
||
}
|
||
}
|
||
return nil
|
||
}
|
||
|
||
// ConsumerForKind returns the consumer designated to handle a given endpoint
|
||
// kind, or nil if routing is unset.
|
||
func (c *Config) ConsumerForKind(kind EndpointKind) (string, *Consumer) {
|
||
name, ok := c.Routing[kind]
|
||
if !ok {
|
||
return "", nil
|
||
}
|
||
return name, c.Consumers[name]
|
||
}
|