Go daemon listening on :8770 that fronts mvoice (8766), whisper-server
(8178), ollama (11434), comfyui (8188) behind a single /v1 façade.
What this MVP does:
- Loads config/consumers.yaml: routing table, per-consumer URL + health +
paths + vram_resident_mib + can_coexist_with + load/unload routes.
- Background health probe (5s) on every consumer; refuses fast with a
structured 503 if the last probe failed (no Felix-Banholzer-style
silent fallback).
- POST /v1/{tts,stt,llm,image} proxies the request body + Content-Type
to the routed consumer's path and streams the response back.
- GET /audio/* proxies to audio_proxy consumer (wa.sh fetches its WAV
this way).
- GET /v1/status exposes a live GPU sample (nvidia-smi every 2s),
per-consumer health/loaded/gpu_resident_mib/active/total_requests, and
scheduler stats.
- GET /healthz, GET / — broker liveness.
The Scheduler interface is in place, but the current implementation is
'Passthrough': every job runs immediately, with no lock and no queue.
Schritt (step) 4 replaces it with a serialising mutex; Schritt 5 adds
VRAM-pressure eviction. Because callers depend only on the interface,
server.go stays unchanged.
Out of scope here:
- Schritt 3: wa.sh migration (parallel work in mAi).
- Schritt 4: queue + global GPU lock.
- Schritt 5: nvidia-smi-driven LRU eviction.
Tests: config validation (good/bad), proxy forwards body, audio proxy
streams bytes, unhealthy consumer returns 503, /v1/status JSON shape.
Refs: m/mGPUmanager#1
378 lines
12 KiB
Go
378 lines
12 KiB
Go
// Package server is the HTTP façade of mGPUmanager.
|
|
//
|
|
// It exposes:
|
|
// - POST /v1/tts, /v1/stt, /v1/llm, /v1/image — pass-through proxy to the
|
|
// consumer named in config.Routing[kind].
|
|
// - GET /audio/* — proxy to config.AudioProxy (mvoice's audio directory).
|
|
// - GET /v1/status — live snapshot of consumers + GPU + scheduler.
|
|
// - GET /healthz — broker liveness (200 if process is up).
|
|
//
|
|
// Every proxy call goes through the Scheduler so that, in Schritt 4 and 5,
|
|
// queueing and eviction can be added without touching server.go.
|
|
package server
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log/slog"
|
|
"net/http"
|
|
"net/url"
|
|
"strings"
|
|
"time"
|
|
|
|
"mgit.msbls.de/m/mGPUmanager/internal/config"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/gpu"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/registry"
|
|
"mgit.msbls.de/m/mGPUmanager/internal/scheduler"
|
|
)
|
|
|
|
// Server bundles the HTTP handlers + dependencies.
|
|
type Server struct {
|
|
cfg *config.Config
|
|
reg *registry.Registry
|
|
gpu *gpu.Poller
|
|
sched scheduler.Scheduler
|
|
client *http.Client
|
|
logger *slog.Logger
|
|
}
|
|
|
|
// New builds a Server. Caller owns the lifecycle of reg/gpu/sched.
|
|
func New(cfg *config.Config, reg *registry.Registry, gpuPoller *gpu.Poller, sched scheduler.Scheduler, logger *slog.Logger) *Server {
|
|
return &Server{
|
|
cfg: cfg,
|
|
reg: reg,
|
|
gpu: gpuPoller,
|
|
sched: sched,
|
|
client: &http.Client{Timeout: 120 * time.Second}, // TTS can take 5-10s; image gen up to 60s
|
|
logger: logger,
|
|
}
|
|
}
|
|
|
|
// Handler returns the root mux. Caller wraps it in http.Server.
|
|
func (s *Server) Handler() http.Handler {
|
|
mux := http.NewServeMux()
|
|
|
|
mux.HandleFunc("POST /v1/tts", s.handleEndpoint(config.KindTTS))
|
|
mux.HandleFunc("POST /v1/stt", s.handleEndpoint(config.KindSTT))
|
|
mux.HandleFunc("POST /v1/llm", s.handleEndpoint(config.KindLLM))
|
|
mux.HandleFunc("POST /v1/image", s.handleEndpoint(config.KindImage))
|
|
|
|
mux.HandleFunc("GET /audio/", s.handleAudio)
|
|
mux.HandleFunc("GET /v1/status", s.handleStatus)
|
|
mux.HandleFunc("GET /healthz", s.handleHealthz)
|
|
mux.HandleFunc("GET /", s.handleRoot)
|
|
|
|
return logMiddleware(s.logger, mux)
|
|
}
|
|
|
|
// ───── error envelope ─────────────────────────────────────────────────────
|
|
|
|
// errorBody is the JSON envelope used for every error that mGPUmanager
// produces itself. Error responses that originate from a consumer are not
// wrapped in it; they are relayed as received so callers see the original
// payload.
type errorBody struct {
	// Error is a short machine-readable code such as "consumer_unreachable".
	Error string `json:"error"`
	// Message is the human-readable explanation.
	Message string `json:"message"`
	// Consumer names the consumer involved; it is left out of the JSON when empty.
	Consumer string `json:"consumer,omitempty"`
	// Retryable is the flag handed to writeErr by the reporting handler.
	Retryable bool `json:"retryable"`
}
|
|
|
|
func writeErr(w http.ResponseWriter, status int, code, msg, consumer string, retryable bool) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
w.WriteHeader(status)
|
|
_ = json.NewEncoder(w).Encode(errorBody{
|
|
Error: code,
|
|
Message: msg,
|
|
Consumer: consumer,
|
|
Retryable: retryable,
|
|
})
|
|
}
|
|
|
|
// ───── endpoint proxy ─────────────────────────────────────────────────────
|
|
|
|
// handleEndpoint returns the http.HandlerFunc for a /v1/<kind> endpoint.
|
|
func (s *Server) handleEndpoint(kind config.EndpointKind) http.HandlerFunc {
|
|
return func(w http.ResponseWriter, r *http.Request) {
|
|
consName, cons := s.cfg.ConsumerForKind(kind)
|
|
if cons == nil {
|
|
writeErr(w, http.StatusNotImplemented, "no_consumer",
|
|
fmt.Sprintf("no consumer routes %s", kind), "", false)
|
|
return
|
|
}
|
|
route, ok := cons.Paths[kind]
|
|
if !ok {
|
|
writeErr(w, http.StatusNotImplemented, "no_consumer",
|
|
fmt.Sprintf("consumer %s lacks paths.%s", consName, kind), consName, false)
|
|
return
|
|
}
|
|
|
|
// Refuse fast if the consumer is unhealthy (last probe failed) — keeps
|
|
// Felix-Banholzer-style silent-fallback impossible.
|
|
st := s.reg.Get(consName)
|
|
if !st.Healthy && !st.LastProbe.IsZero() {
|
|
writeErr(w, http.StatusServiceUnavailable, "consumer_unreachable",
|
|
fmt.Sprintf("consumer %s last probe failed: %s", consName, st.LastError),
|
|
consName, true)
|
|
return
|
|
}
|
|
|
|
err := s.sched.Run(r.Context(), consName, func(ctx context.Context) error {
|
|
return s.proxyRequest(ctx, w, r, cons, route, consName)
|
|
})
|
|
if err != nil && !responseStarted(w) {
|
|
writeErr(w, http.StatusInternalServerError, "scheduler_error",
|
|
err.Error(), consName, true)
|
|
}
|
|
}
|
|
}
|
|
|
|
// proxyRequest forwards the inbound HTTP request to a consumer route and
|
|
// streams the response back. Errors before the consumer responds are surfaced
|
|
// as the broker's structured error envelope; once the consumer has begun
|
|
// responding we stream its bytes through unchanged.
|
|
func (s *Server) proxyRequest(ctx context.Context, w http.ResponseWriter, r *http.Request, cons *config.Consumer, route config.Route, consumer string) error {
|
|
target, err := url.Parse(cons.URL)
|
|
if err != nil {
|
|
writeErr(w, http.StatusInternalServerError, "bad_consumer_url",
|
|
err.Error(), consumer, false)
|
|
return nil
|
|
}
|
|
target.Path = route.Path
|
|
// Forward inbound query string verbatim.
|
|
target.RawQuery = r.URL.RawQuery
|
|
|
|
method := route.Method
|
|
if method == "" {
|
|
method = r.Method
|
|
}
|
|
|
|
upstream, err := http.NewRequestWithContext(ctx, method, target.String(), r.Body)
|
|
if err != nil {
|
|
writeErr(w, http.StatusInternalServerError, "bad_request",
|
|
err.Error(), consumer, false)
|
|
return nil
|
|
}
|
|
// Copy through Content-Type, Content-Length and Accept (don't carry Host).
|
|
for _, h := range []string{"Content-Type", "Content-Length", "Accept", "Accept-Encoding"} {
|
|
if v := r.Header.Get(h); v != "" {
|
|
upstream.Header.Set(h, v)
|
|
}
|
|
}
|
|
|
|
resp, err := s.client.Do(upstream)
|
|
if err != nil {
|
|
writeErr(w, http.StatusBadGateway, "consumer_unreachable",
|
|
fmt.Sprintf("upstream %s: %v", target.Host, err), consumer, true)
|
|
return nil
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
// Stream response.
|
|
for k, vs := range resp.Header {
|
|
if strings.EqualFold(k, "Connection") || strings.EqualFold(k, "Transfer-Encoding") {
|
|
continue
|
|
}
|
|
for _, v := range vs {
|
|
w.Header().Add(k, v)
|
|
}
|
|
}
|
|
w.WriteHeader(resp.StatusCode)
|
|
_, _ = io.Copy(w, resp.Body)
|
|
return nil
|
|
}
|
|
|
|
// ───── audio proxy ────────────────────────────────────────────────────────
|
|
|
|
// handleAudio forwards GET /audio/<file> to the audio_proxy consumer (mvoice).
|
|
// wa.sh fetches the rendered .wav via this path after /v1/tts returns its URL.
|
|
func (s *Server) handleAudio(w http.ResponseWriter, r *http.Request) {
|
|
if s.cfg.AudioProxy == "" {
|
|
writeErr(w, http.StatusNotFound, "no_audio_proxy",
|
|
"audio_proxy is not configured", "", false)
|
|
return
|
|
}
|
|
cons, ok := s.cfg.Consumers[s.cfg.AudioProxy]
|
|
if !ok {
|
|
writeErr(w, http.StatusInternalServerError, "no_audio_proxy",
|
|
"audio_proxy points at unknown consumer", s.cfg.AudioProxy, false)
|
|
return
|
|
}
|
|
target, err := url.Parse(cons.URL)
|
|
if err != nil {
|
|
writeErr(w, http.StatusInternalServerError, "bad_consumer_url",
|
|
err.Error(), s.cfg.AudioProxy, false)
|
|
return
|
|
}
|
|
target.Path = r.URL.Path
|
|
target.RawQuery = r.URL.RawQuery
|
|
|
|
upstream, err := http.NewRequestWithContext(r.Context(), http.MethodGet, target.String(), nil)
|
|
if err != nil {
|
|
writeErr(w, http.StatusInternalServerError, "bad_request",
|
|
err.Error(), s.cfg.AudioProxy, false)
|
|
return
|
|
}
|
|
resp, err := s.client.Do(upstream)
|
|
if err != nil {
|
|
writeErr(w, http.StatusBadGateway, "consumer_unreachable",
|
|
fmt.Sprintf("upstream %s: %v", target.Host, err), s.cfg.AudioProxy, true)
|
|
return
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
for k, vs := range resp.Header {
|
|
for _, v := range vs {
|
|
w.Header().Add(k, v)
|
|
}
|
|
}
|
|
w.WriteHeader(resp.StatusCode)
|
|
_, _ = io.Copy(w, resp.Body)
|
|
}
|
|
|
|
// ───── status ─────────────────────────────────────────────────────────────
|
|
|
|
type statusResponse struct {
|
|
Listen string `json:"listen"`
|
|
Time time.Time `json:"time"`
|
|
GPU statusGPU `json:"gpu"`
|
|
Routing map[config.EndpointKind]string `json:"routing"`
|
|
Consumers []statusConsumer `json:"consumers"`
|
|
Scheduler scheduler.Stats `json:"scheduler"`
|
|
}
|
|
|
|
// statusGPU is the "gpu" section of the /v1/status document.
type statusGPU struct {
	// Memory figures in MiB. handleStatus takes TotalMiB and ReservedMiB from
	// the configuration (falling back to the sampled total when the configured
	// one is zero) and UsedMiB/FreeMiB from the latest GPU sample.
	TotalMiB    int `json:"total_mib"`
	UsedMiB     int `json:"used_mib"`
	FreeMiB     int `json:"free_mib"`
	ReservedMiB int `json:"reserved_mib"`

	// LastSample is the timestamp of the sample the figures come from; Err is
	// the sample's error text and is omitted from the JSON when empty.
	LastSample time.Time `json:"last_sample"`
	Err        string    `json:"err,omitempty"`
}
|
|
|
|
// statusConsumer is one entry of the "consumers" array in the /v1/status
// document. handleStatus fills it from two sources: the static consumer
// definition in the configuration and the registry's snapshot for that name.
type statusConsumer struct {
	// Identity, from the configuration.
	Name string `json:"name"`
	URL  string `json:"url"`

	// State, from the registry snapshot.
	Healthy        bool `json:"healthy"`
	Loaded         bool `json:"loaded"`
	GPUResidentMiB int  `json:"gpu_resident_mib"`

	// VRAMBudgetMiB is the configured vram_resident_mib of the consumer.
	VRAMBudgetMiB int `json:"vram_budget_mib"`

	// Request counters and timestamps, from the registry snapshot. The
	// timestamps are tagged omitzero and LastError is omitted when empty.
	Active        int       `json:"active"`
	TotalRequests int64     `json:"total_requests"`
	LastUsed      time.Time `json:"last_used,omitzero"`
	LastProbe     time.Time `json:"last_probe,omitzero"`
	LastError     string    `json:"last_error,omitempty"`

	// Scheduling hints, from the configuration.
	Priority       int      `json:"priority"`
	CanCoexistWith []string `json:"can_coexist_with"`
}
|
|
|
|
func (s *Server) handleStatus(w http.ResponseWriter, r *http.Request) {
|
|
sample := s.gpu.Last()
|
|
snap := s.reg.Snapshot()
|
|
|
|
resp := statusResponse{
|
|
Listen: s.cfg.Listen,
|
|
Time: time.Now(),
|
|
Routing: s.cfg.Routing,
|
|
GPU: statusGPU{
|
|
TotalMiB: s.cfg.GPU.TotalMiB,
|
|
UsedMiB: sample.UsedMiB,
|
|
FreeMiB: sample.FreeMiB,
|
|
ReservedMiB: s.cfg.GPU.ReservedMiB,
|
|
LastSample: sample.At,
|
|
Err: sample.Err,
|
|
},
|
|
Scheduler: s.sched.Stats(),
|
|
}
|
|
if resp.GPU.TotalMiB == 0 && sample.TotalMiB > 0 {
|
|
resp.GPU.TotalMiB = sample.TotalMiB
|
|
}
|
|
|
|
// Stable ordering by config-declared name.
|
|
names := make([]string, 0, len(s.cfg.Consumers))
|
|
for n := range s.cfg.Consumers {
|
|
names = append(names, n)
|
|
}
|
|
sortStrings(names)
|
|
for _, n := range names {
|
|
cons := s.cfg.Consumers[n]
|
|
st := snap[n]
|
|
resp.Consumers = append(resp.Consumers, statusConsumer{
|
|
Name: n,
|
|
URL: cons.URL,
|
|
Healthy: st.Healthy,
|
|
Loaded: st.Loaded,
|
|
GPUResidentMiB: st.GPUResidentMiB,
|
|
VRAMBudgetMiB: cons.VRAMResidentMiB,
|
|
Active: st.Active,
|
|
TotalRequests: st.TotalRequests,
|
|
LastUsed: st.LastUsed,
|
|
LastProbe: st.LastProbe,
|
|
LastError: st.LastError,
|
|
Priority: cons.Priority,
|
|
CanCoexistWith: cons.CanCoexistWith,
|
|
})
|
|
}
|
|
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_ = json.NewEncoder(w).Encode(resp)
|
|
}
|
|
|
|
func (s *Server) handleHealthz(w http.ResponseWriter, _ *http.Request) {
|
|
w.Header().Set("Content-Type", "application/json")
|
|
_, _ = w.Write([]byte(`{"status":"ok"}`))
|
|
}
|
|
|
|
func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) {
|
|
w.Header().Set("Content-Type", "text/plain")
|
|
_, _ = io.Copy(w, bytes.NewReader([]byte(
|
|
"mGPUmanager — see GET /v1/status for live state, POST /v1/{tts,stt,llm,image} for inference\n",
|
|
)))
|
|
}
|
|
|
|
// ───── helpers ────────────────────────────────────────────────────────────
|
|
|
|
// responseStarted is meant to tell whether headers have already been written,
// because from that point on the error envelope can no longer be sent. It is
// currently a placeholder that always reports false and does not inspect the
// writer: the proxy path writes headers only inside proxyRequest, and
// proxyRequest answers its own errors before that point.
func responseStarted(_ http.ResponseWriter) bool {
	const started = false
	return started
}
|
|
|
|
// sortStrings sorts s in place in ascending order. It is a small insertion
// sort kept local so this file does not need the "sort" package; the slices
// it is used on here are short.
func sortStrings(s []string) {
	for next := 1; next < len(s); next++ {
		pending := s[next]
		slot := next
		// Shift every larger element one place to the right, then drop the
		// pending value into the gap.
		for slot > 0 && s[slot-1] > pending {
			s[slot] = s[slot-1]
			slot--
		}
		s[slot] = pending
	}
}
|
|
|
|
// logMiddleware emits one structured request log per call.
|
|
func logMiddleware(logger *slog.Logger, next http.Handler) http.Handler {
|
|
if logger == nil {
|
|
return next
|
|
}
|
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
|
start := time.Now()
|
|
lw := &statusCapture{ResponseWriter: w, code: 200}
|
|
next.ServeHTTP(lw, r)
|
|
logger.Info("http",
|
|
"method", r.Method,
|
|
"path", r.URL.Path,
|
|
"status", lw.code,
|
|
"ms", time.Since(start).Milliseconds(),
|
|
)
|
|
})
|
|
}
|
|
|
|
// statusCapture wraps an http.ResponseWriter and remembers the status code of
// the response so that logMiddleware can log it after the handler returns.
type statusCapture struct {
	http.ResponseWriter
	code        int  // status to report; the creator initialises it to 200
	wroteHeader bool // true once the final status has been committed
}

// WriteHeader records code and forwards the call to the wrapped writer.
//
// Only the status that actually goes on the wire is kept: net/http ignores a
// second WriteHeader call after the final status was sent, so later calls
// must not overwrite the recorded value. Informational 1xx headers (other
// than 101) may still be followed by the final status and therefore do not
// commit the response.
func (s *statusCapture) WriteHeader(code int) {
	if !s.wroteHeader {
		s.code = code
		if code >= 200 || code == http.StatusSwitchingProtocols {
			s.wroteHeader = true
		}
	}
	s.ResponseWriter.WriteHeader(code)
}

// Write forwards p to the wrapped writer. A body write without a preceding
// final WriteHeader implicitly commits status 200, so that is what gets
// recorded.
func (s *statusCapture) Write(p []byte) (int, error) {
	if !s.wroteHeader {
		s.code = http.StatusOK
		s.wroteHeader = true
	}
	return s.ResponseWriter.Write(p)
}

// Unwrap returns the wrapped writer so that http.ResponseController can reach
// optional capabilities (such as flushing) of the underlying writer.
func (s *statusCapture) Unwrap() http.ResponseWriter {
	return s.ResponseWriter
}
|