Go daemon listening on :8770 that fronts mvoice (8766), whisper-server
(8178), ollama (11434), comfyui (8188) behind a single /v1 façade.
What this MVP does:
- Loads config/consumers.yaml: routing table, per-consumer URL + health +
paths + vram_resident_mib + can_coexist_with + load/unload routes.
- Background health probe (5s) on every consumer; refuses fast with a
structured 503 if the last probe failed (no Felix-Banholzer-style
silent fallback).
- POST /v1/{tts,stt,llm,image} proxies the request body + Content-Type
to the routed consumer's path and streams the response back.
- GET /audio/* proxies to audio_proxy consumer (wa.sh fetches its WAV
this way).
- GET /v1/status exposes a live GPU sample (nvidia-smi every 2s),
per-consumer health/loaded/gpu_resident_mib/active/total_requests, and
scheduler stats.
- GET /healthz, GET / — broker liveness.
The Scheduler interface is in place but the implementation is
'Passthrough' — every job runs immediately, no lock, no queue. Schritt 4
replaces it with a serialising mutex; Schritt 5 adds VRAM-pressure
eviction. The interface boundary means server.go stays unchanged.
Out of scope here:
- Schritt 3: wa.sh migration (parallel work in mAi).
- Schritt 4: queue + global GPU lock.
- Schritt 5: nvidia-smi-driven LRU eviction.
Tests: config validation (good/bad), proxy forwards body, audio proxy
streams bytes, unhealthy consumer returns 503, /v1/status JSON shape.
Refs: m/mGPUmanager#1
113 lines
3.3 KiB
Go
113 lines
3.3 KiB
Go
// Package scheduler controls who gets the GPU when.
//
// Three responsibilities, added in three phases:
//
//   - Step 2 (Schritt 2, this file's first version): a passthrough — every
//     job runs immediately, no locking, no queueing. Only useful for proving
//     the HTTP façade end-to-end.
//   - Step 4 (Schritt 4): a global mutex (or capacity-1 channel) serialises
//     all GPU work. Per-consumer max_concurrency limits stay at 1 for now.
//   - Step 5 (Schritt 5): VRAM-pressure-aware eviction kicks in before
//     acquire when the requested consumer's resident cost would exceed the
//     available headroom.
//
// The interface deliberately hides which phase is active from callers
// (server.go) so the upgrade path is local to this package.
package scheduler
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"sync"
|
|
"time"
|
|
|
|
"mgit.msbls.de/m/mGPUmanager/internal/config"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/registry"
|
|
)
|
|
|
|
// ErrSchedulerStopped is the sentinel error for a Run call made after the
// scheduler has been closed. Compare against it with errors.Is.
//
// NOTE(review): nothing in this file returns it yet and no Close method is
// visible here; it appears to be reserved for later scheduler implementations.
var ErrSchedulerStopped = errors.New("scheduler stopped")

// Job is the unit of work a consumer route hands to a Scheduler. It receives
// the context that was passed to Run and reports failure through its error
// result, which Run returns to the caller unchanged.
//
// Whether a GPU lock is actually held while the job runs depends on the
// Scheduler implementation.
type Job func(ctx context.Context) error

// Scheduler decides when GPU work runs. Implementations may queue, serialise,
|
|
// or evict other consumers before granting access.
|
|
type Scheduler interface {
|
|
// Run executes fn while the caller holds the right to use the GPU for
|
|
// the named consumer. It blocks until fn returns or ctx is cancelled.
|
|
Run(ctx context.Context, consumer string, fn Job) error
|
|
|
|
// Stats returns a snapshot of scheduler internals for /v1/status.
|
|
Stats() Stats
|
|
}
|
|
|
|
// Stats is what /v1/status reports about the scheduler.
//
// The Passthrough scheduler in this file fills in only InFlight, TotalJobs
// and LastRunMS; every other field is left at its zero value.
type Stats struct {
	QueueDepth   int       `json:"queue_depth"`            // not set by Passthrough
	InFlight     int       `json:"in_flight"`              // jobs currently inside Run
	TotalJobs    int64     `json:"total_jobs"`             // jobs started so far
	LastWaitMS   int64     `json:"last_wait_ms"`           // not set by Passthrough
	LastRunMS    int64     `json:"last_run_ms"`            // duration of the most recently finished job, in ms
	Evictions    int64     `json:"evictions"`              // not set by Passthrough
	OldestQueued time.Time `json:"oldest_queued,omitzero"` // left out of the JSON while it is the zero time
}

// Passthrough is the Schritt 2 stand-in: no lock, no queue. Every job runs
|
|
// concurrently. It exists so the server package can be written against the
|
|
// final interface from day one.
|
|
type Passthrough struct {
|
|
reg *registry.Registry
|
|
|
|
mu sync.Mutex
|
|
inFlight int
|
|
total int64
|
|
lastRunMS int64
|
|
}
|
|
|
|
// NewPassthrough returns a Scheduler that runs every job immediately.
|
|
func NewPassthrough(reg *registry.Registry) *Passthrough {
|
|
return &Passthrough{reg: reg}
|
|
}
|
|
|
|
// Run executes fn straight away, only tracking in-flight count for stats.
|
|
func (p *Passthrough) Run(ctx context.Context, consumer string, fn Job) error {
|
|
release := p.reg.MarkActive(consumer)
|
|
defer release()
|
|
|
|
p.mu.Lock()
|
|
p.inFlight++
|
|
p.total++
|
|
p.mu.Unlock()
|
|
defer func() {
|
|
p.mu.Lock()
|
|
p.inFlight--
|
|
p.mu.Unlock()
|
|
}()
|
|
|
|
start := time.Now()
|
|
err := fn(ctx)
|
|
elapsed := time.Since(start).Milliseconds()
|
|
p.mu.Lock()
|
|
p.lastRunMS = elapsed
|
|
p.mu.Unlock()
|
|
return err
|
|
}
|
|
|
|
// Stats returns current passthrough statistics.
|
|
func (p *Passthrough) Stats() Stats {
|
|
p.mu.Lock()
|
|
defer p.mu.Unlock()
|
|
return Stats{
|
|
InFlight: p.inFlight,
|
|
TotalJobs: p.total,
|
|
LastRunMS: p.lastRunMS,
|
|
}
|
|
}
|
|
|
|
// Compile-time interface guard.
|
|
var _ Scheduler = (*Passthrough)(nil)
|
|
|
|
// Ensure config package is imported (used by later Schritte that read
|
|
// per-consumer max_concurrency and vram_resident_mib).
|
|
var _ = config.KindTTS
|