// mGPUmanager/internal/server/server.go

// Package server is the HTTP façade of mGPUmanager.
//
// It exposes:
//   - POST /v1/tts, /v1/stt, /v1/llm, /v1/image — pass-through proxy to the
//     consumer named in config.Routing[kind].
//   - GET  /audio/*  — proxy to config.AudioProxy (mvoice's audio directory).
//   - GET  /v1/status — live snapshot of consumers + GPU + scheduler.
//   - GET  /healthz   — broker liveness (200 if process is up).
//
// Every proxy call goes through the Scheduler so that, in steps 4 and 5,
// queueing and eviction can be added without touching server.go.
package server

import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"io"
	"log/slog"
	"net/http"
	"net/url"
	"strings"
	"time"

	"mgit.msbls.de/m/mGPUmanager/internal/config"
	"mgit.msbls.de/m/mGPUmanager/internal/gpu"
	"mgit.msbls.de/m/mGPUmanager/internal/registry"
	"mgit.msbls.de/m/mGPUmanager/internal/scheduler"
)

// Server bundles the HTTP handlers + dependencies.
//
// Fields:
//   - cfg: broker configuration; handlers read routing, consumers, the audio
//     proxy name, the listen address and GPU settings from it.
//   - reg: consumer registry, queried for per-consumer state (Get, Snapshot).
//   - gpu: GPU poller; handleStatus reads its most recent sample via Last.
//   - sched: scheduler through which every /v1/<kind> proxy call is run.
//   - client: shared outbound HTTP client used for all upstream requests.
//   - logger: request logger handed to logMiddleware; may be nil, in which
//     case requests are not logged.
type Server struct {
	cfg    *config.Config
	reg    *registry.Registry
	gpu    *gpu.Poller
	sched  scheduler.Scheduler
	client *http.Client
	logger *slog.Logger
}

// New assembles a Server from its collaborators. The registry, GPU poller and
// scheduler stay under the caller's control; New only stores the references
// and creates the shared upstream HTTP client.
func New(cfg *config.Config, reg *registry.Registry, gpuPoller *gpu.Poller, sched scheduler.Scheduler, logger *slog.Logger) *Server {
	// One client for every upstream call. The 120s ceiling leaves headroom
	// for slow work: TTS can take 5-10s; image gen up to 60s.
	upstreamClient := &http.Client{Timeout: 120 * time.Second}

	srv := new(Server)
	srv.cfg = cfg
	srv.reg = reg
	srv.gpu = gpuPoller
	srv.sched = sched
	srv.client = upstreamClient
	srv.logger = logger
	return srv
}

// Handler builds the root request multiplexer, registers every broker route
// on it and wraps the result in the request-logging middleware. The caller is
// expected to mount the returned handler in its own http.Server.
func (s *Server) Handler() http.Handler {
	routes := []struct {
		pattern string
		handler http.HandlerFunc
	}{
		// Inference endpoints, one per endpoint kind.
		{"POST /v1/tts", s.handleEndpoint(config.KindTTS)},
		{"POST /v1/stt", s.handleEndpoint(config.KindSTT)},
		{"POST /v1/llm", s.handleEndpoint(config.KindLLM)},
		{"POST /v1/image", s.handleEndpoint(config.KindImage)},

		// Auxiliary endpoints.
		{"GET /audio/", s.handleAudio},
		{"GET /v1/status", s.handleStatus},
		{"GET /healthz", s.handleHealthz},
		{"GET /", s.handleRoot},
	}

	mux := http.NewServeMux()
	for _, rt := range routes {
		mux.HandleFunc(rt.pattern, rt.handler)
	}
	return logMiddleware(s.logger, mux)
}

// ───── error envelope ─────────────────────────────────────────────────────

// errorBody is the broker's structured error envelope. Every non-2xx response
// from mGPUmanager itself uses this shape. (Pass-through 4xx/5xx from
// consumers are forwarded verbatim so callers see the original payload.)
//
// Fields, as filled in by writeErr:
//   - Error: short machine-readable code such as "no_consumer" or
//     "consumer_unreachable".
//   - Message: human-readable detail.
//   - Consumer: name of the consumer involved; omitted from the JSON when
//     empty.
//   - Retryable: whether the handler marked the failure as worth retrying;
//     always present in the JSON.
type errorBody struct {
	Error     string `json:"error"`
	Message   string `json:"message"`
	Consumer  string `json:"consumer,omitempty"`
	Retryable bool   `json:"retryable"`
}

// writeErr answers the request with the broker's JSON error envelope.
//
// status is the HTTP status code to send; code, msg, consumer and retryable
// populate the corresponding errorBody fields. The Content-Type header is set
// to application/json before the status line is written.
func writeErr(w http.ResponseWriter, status int, code, msg, consumer string, retryable bool) {
	envelope := errorBody{
		Error:     code,
		Message:   msg,
		Consumer:  consumer,
		Retryable: retryable,
	}

	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)

	// The status line is already out, so an encoding or write failure here
	// cannot be reported to the client; it is deliberately dropped.
	enc := json.NewEncoder(w)
	_ = enc.Encode(envelope)
}

// ───── endpoint proxy ─────────────────────────────────────────────────────

// handleEndpoint returns the http.HandlerFunc for a /v1/<kind> endpoint.
//
// The returned handler:
//   - resolves the consumer routed for kind and that consumer's route for the
//     kind, answering 501 no_consumer when either is missing;
//   - answers 503 consumer_unreachable without contacting the consumer when
//     the registry records a failed last probe (a consumer that has never
//     been probed is let through);
//   - runs the proxy call through the scheduler and answers 500
//     scheduler_error only if the scheduler failed before the proxy callback
//     was entered.
func (s *Server) handleEndpoint(kind config.EndpointKind) http.HandlerFunc {
	return func(w http.ResponseWriter, r *http.Request) {
		consName, cons := s.cfg.ConsumerForKind(kind)
		if cons == nil {
			writeErr(w, http.StatusNotImplemented, "no_consumer",
				fmt.Sprintf("no consumer routes %s", kind), "", false)
			return
		}
		route, ok := cons.Paths[kind]
		if !ok {
			writeErr(w, http.StatusNotImplemented, "no_consumer",
				fmt.Sprintf("consumer %s lacks paths.%s", consName, kind), consName, false)
			return
		}

		// Refuse fast if the consumer is unhealthy (last probe failed), so the
		// caller gets an explicit error instead of silently falling back.
		st := s.reg.Get(consName)
		if !st.Healthy && !st.LastProbe.IsZero() {
			writeErr(w, http.StatusServiceUnavailable, "consumer_unreachable",
				fmt.Sprintf("consumer %s last probe failed: %s", consName, st.LastError),
				consName, true)
			return
		}

		// proxyRequest always writes a response of its own (the upstream's or
		// an error envelope) and always returns nil. Once the callback has
		// been entered the response belongs to it, and a late scheduler error
		// must not trigger a second WriteHeader or append an envelope to an
		// already-sent body.
		// NOTE(review): assumes Run does not return while fn is still running.
		proxied := false
		err := s.sched.Run(r.Context(), consName, func(ctx context.Context) error {
			proxied = true
			return s.proxyRequest(ctx, w, r, cons, route, consName)
		})
		if err != nil && !proxied && !responseStarted(w) {
			writeErr(w, http.StatusInternalServerError, "scheduler_error",
				err.Error(), consName, true)
		}
	}
}

// proxyRequest forwards the inbound HTTP request to a consumer route and
// streams the response back. Errors before the consumer responds are surfaced
// as the broker's structured error envelope; once the consumer has begun
// responding we stream its bytes through unchanged.
//
// ctx bounds the upstream call. cons supplies the consumer base URL, route
// the upstream path and optional method override, and consumer the name used
// in error envelopes. Every failure is reported to the client directly, so
// the returned error is always nil.
func (s *Server) proxyRequest(ctx context.Context, w http.ResponseWriter, r *http.Request, cons *config.Consumer, route config.Route, consumer string) error {
	target, err := url.Parse(cons.URL)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, "bad_consumer_url",
			err.Error(), consumer, false)
		return nil
	}
	// The configured route replaces any path on the consumer URL; the inbound
	// query string is forwarded verbatim.
	target.Path = route.Path
	target.RawQuery = r.URL.RawQuery

	// A route may pin the upstream method; otherwise mirror the inbound one.
	method := route.Method
	if method == "" {
		method = r.Method
	}

	upstream, err := http.NewRequestWithContext(ctx, method, target.String(), r.Body)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, "bad_request",
			err.Error(), consumer, false)
		return nil
	}
	// net/http ignores a Content-Length placed in Header on outgoing requests
	// and takes the length from Request.ContentLength instead. Without this
	// assignment the opaque inbound body would be sent with chunked transfer
	// encoding even when the caller declared its length.
	upstream.ContentLength = r.ContentLength
	// Copy through Content-Type, Accept and Accept-Encoding (don't carry Host).
	for _, h := range []string{"Content-Type", "Accept", "Accept-Encoding"} {
		if v := r.Header.Get(h); v != "" {
			upstream.Header.Set(h, v)
		}
	}

	resp, err := s.client.Do(upstream)
	if err != nil {
		writeErr(w, http.StatusBadGateway, "consumer_unreachable",
			fmt.Sprintf("upstream %s: %v", target.Host, err), consumer, true)
		return nil
	}
	defer resp.Body.Close()

	// Relay the consumer's headers, minus the hop-by-hop ones that describe
	// the broker<->consumer connection rather than the payload.
	for k, vs := range resp.Header {
		if strings.EqualFold(k, "Connection") || strings.EqualFold(k, "Transfer-Encoding") {
			continue
		}
		for _, v := range vs {
			w.Header().Add(k, v)
		}
	}
	w.WriteHeader(resp.StatusCode)
	// Best effort: the status line is already sent, so a mid-stream copy
	// failure cannot be turned into an error envelope.
	_, _ = io.Copy(w, resp.Body)
	return nil
}

// ───── audio proxy ────────────────────────────────────────────────────────

// handleAudio forwards GET /audio/<file> to the audio_proxy consumer (mvoice).
// wa.sh fetches the rendered .wav via this path after /v1/tts returns its URL.
//
// The inbound path and query string are reused unchanged on the consumer's
// base URL. Failures before the consumer answers are reported with the
// broker's error envelope: 404 when no audio proxy is configured, 500 when it
// names an unknown consumer or the URL/request cannot be built, and 502 when
// the upstream call fails. Otherwise the consumer's status, headers and body
// are relayed.
func (s *Server) handleAudio(w http.ResponseWriter, r *http.Request) {
	if s.cfg.AudioProxy == "" {
		writeErr(w, http.StatusNotFound, "no_audio_proxy",
			"audio_proxy is not configured", "", false)
		return
	}
	cons, ok := s.cfg.Consumers[s.cfg.AudioProxy]
	if !ok {
		writeErr(w, http.StatusInternalServerError, "no_audio_proxy",
			"audio_proxy points at unknown consumer", s.cfg.AudioProxy, false)
		return
	}
	target, err := url.Parse(cons.URL)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, "bad_consumer_url",
			err.Error(), s.cfg.AudioProxy, false)
		return
	}
	target.Path = r.URL.Path
	target.RawQuery = r.URL.RawQuery

	upstream, err := http.NewRequestWithContext(r.Context(), http.MethodGet, target.String(), nil)
	if err != nil {
		writeErr(w, http.StatusInternalServerError, "bad_request",
			err.Error(), s.cfg.AudioProxy, false)
		return
	}
	resp, err := s.client.Do(upstream)
	if err != nil {
		writeErr(w, http.StatusBadGateway, "consumer_unreachable",
			fmt.Sprintf("upstream %s: %v", target.Host, err), s.cfg.AudioProxy, true)
		return
	}
	defer resp.Body.Close()

	// Relay the consumer's headers, but drop the hop-by-hop ones: they
	// describe the broker<->consumer connection, not the one to our client.
	// This matches the filtering done for the /v1/<kind> proxy path.
	for k, vs := range resp.Header {
		if strings.EqualFold(k, "Connection") || strings.EqualFold(k, "Transfer-Encoding") {
			continue
		}
		for _, v := range vs {
			w.Header().Add(k, v)
		}
	}
	w.WriteHeader(resp.StatusCode)
	// Best effort: once the status line is out a copy failure cannot be
	// reported to the client.
	_, _ = io.Copy(w, resp.Body)
}

// ───── status ─────────────────────────────────────────────────────────────

// statusResponse is the JSON document served by GET /v1/status: the
// configured listen address, the time the snapshot was taken, GPU memory
// figures, the endpoint-kind routing table, one entry per configured
// consumer, and the scheduler's statistics.
type statusResponse struct {
	Listen    string                         `json:"listen"`
	Time      time.Time                      `json:"time"`
	GPU       statusGPU                      `json:"gpu"`
	Routing   map[config.EndpointKind]string `json:"routing"`
	Consumers []statusConsumer               `json:"consumers"`
	Scheduler scheduler.Stats                `json:"scheduler"`
}

// statusGPU is the "gpu" section of the /v1/status payload.
//
// As populated by handleStatus:
//   - TotalMiB: the configured GPU total, falling back to the poller sample's
//     total when the configured value is 0.
//   - UsedMiB, FreeMiB: taken from the most recent poller sample.
//   - ReservedMiB: the configured reservation.
//   - LastSample: timestamp of the most recent poller sample.
//   - Err: error string carried by that sample; omitted from the JSON when
//     empty.
type statusGPU struct {
	TotalMiB    int       `json:"total_mib"`
	UsedMiB     int       `json:"used_mib"`
	FreeMiB     int       `json:"free_mib"`
	ReservedMiB int       `json:"reserved_mib"`
	LastSample  time.Time `json:"last_sample"`
	Err         string    `json:"err,omitempty"`
}

// statusConsumer is one entry of the "consumers" array in the /v1/status
// payload.
//
// handleStatus fills Name, URL, VRAMBudgetMiB, Priority and CanCoexistWith
// from the consumer's configuration (VRAMBudgetMiB mirrors the configured
// VRAMResidentMiB) and copies the remaining fields from the registry
// snapshot. LastUsed and LastProbe carry the omitzero option and LastError
// the omitempty option, so unset values are left out of the JSON.
type statusConsumer struct {
	Name           string    `json:"name"`
	URL            string    `json:"url"`
	Healthy        bool      `json:"healthy"`
	Loaded         bool      `json:"loaded"`
	GPUResidentMiB int       `json:"gpu_resident_mib"`
	VRAMBudgetMiB  int       `json:"vram_budget_mib"`
	Active         int       `json:"active"`
	TotalRequests  int64     `json:"total_requests"`
	LastUsed       time.Time `json:"last_used,omitzero"`
	LastProbe      time.Time `json:"last_probe,omitzero"`
	LastError      string    `json:"last_error,omitempty"`
	Priority       int       `json:"priority"`
	CanCoexistWith []string  `json:"can_coexist_with"`
}

// handleStatus serves GET /v1/status: a JSON snapshot combining the static
// configuration (listen address, routing, per-consumer settings), the most
// recent GPU poller sample, the registry's per-consumer state and the
// scheduler's statistics. Consumers are listed in ascending name order, and
// the "consumers" field is always a JSON array, even when none are
// configured.
func (s *Server) handleStatus(w http.ResponseWriter, _ *http.Request) {
	sample := s.gpu.Last()
	snap := s.reg.Snapshot()

	resp := statusResponse{
		Listen:  s.cfg.Listen,
		Time:    time.Now(),
		Routing: s.cfg.Routing,
		GPU: statusGPU{
			TotalMiB:    s.cfg.GPU.TotalMiB,
			UsedMiB:     sample.UsedMiB,
			FreeMiB:     sample.FreeMiB,
			ReservedMiB: s.cfg.GPU.ReservedMiB,
			LastSample:  sample.At,
			Err:         sample.Err,
		},
		// Sized up front: the final length is known, and a non-nil slice
		// encodes as [] rather than null when there are no consumers.
		Consumers: make([]statusConsumer, 0, len(s.cfg.Consumers)),
		Scheduler: s.sched.Stats(),
	}
	// Prefer the configured total; fall back to what the poller measured.
	if resp.GPU.TotalMiB == 0 && sample.TotalMiB > 0 {
		resp.GPU.TotalMiB = sample.TotalMiB
	}

	// Map iteration order is random, so sort the names for stable output.
	names := make([]string, 0, len(s.cfg.Consumers))
	for n := range s.cfg.Consumers {
		names = append(names, n)
	}
	sortStrings(names)
	for _, n := range names {
		cons := s.cfg.Consumers[n]
		// A consumer missing from the snapshot yields the zero state.
		st := snap[n]
		resp.Consumers = append(resp.Consumers, statusConsumer{
			Name:           n,
			URL:            cons.URL,
			Healthy:        st.Healthy,
			Loaded:         st.Loaded,
			GPUResidentMiB: st.GPUResidentMiB,
			VRAMBudgetMiB:  cons.VRAMResidentMiB,
			Active:         st.Active,
			TotalRequests:  st.TotalRequests,
			LastUsed:       st.LastUsed,
			LastProbe:      st.LastProbe,
			LastError:      st.LastError,
			Priority:       cons.Priority,
			CanCoexistWith: cons.CanCoexistWith,
		})
	}

	w.Header().Set("Content-Type", "application/json")
	// Best effort: once encoding has started writing, a failure cannot be
	// reported to the client.
	_ = json.NewEncoder(w).Encode(resp)
}

// handleHealthz is the broker liveness probe. It answers with a fixed JSON
// body and the implicit 200 status, without consulting any consumer.
func (s *Server) handleHealthz(w http.ResponseWriter, _ *http.Request) {
	const body = `{"status":"ok"}`

	hdr := w.Header()
	hdr.Set("Content-Type", "application/json")
	// Nothing useful can be done if the client has already gone away.
	_, _ = io.WriteString(w, body)
}

// handleRoot answers with a one-line plain-text banner pointing at the status
// and inference endpoints.
func (s *Server) handleRoot(w http.ResponseWriter, _ *http.Request) {
	const banner = "mGPUmanager — see GET /v1/status for live state, POST /v1/{tts,stt,llm,image} for inference\n"

	w.Header().Set("Content-Type", "text/plain")
	// Write failures (client gone) are not actionable here.
	_, _ = bytes.NewBufferString(banner).WriteTo(w)
}

// ───── helpers ────────────────────────────────────────────────────────────

// responseStarted is meant to report whether response headers have already
// been sent, because after that point the handler can no longer switch to the
// error envelope. It is currently a placeholder that ignores its argument and
// always reports false: the proxy path writes headers only inside
// proxyRequest, which handles its own errors before that point.
func responseStarted(_ http.ResponseWriter) bool {
	const started = false
	return started
}

// sortStrings orders s ascending, in place, with a small insertion sort so
// this file does not need to import "sort" just for ordering. Intended for
// the short name lists built in this file.
func sortStrings(s []string) {
	for next := 1; next < len(s); next++ {
		// Lift the element out, shift every larger predecessor one slot to
		// the right, then drop the element into the gap.
		key := s[next]
		slot := next
		for slot > 0 && s[slot-1] > key {
			s[slot] = s[slot-1]
			slot--
		}
		s[slot] = key
	}
}

// logMiddleware wraps next so that each request produces exactly one
// structured "http" log record carrying the method, path, response status
// and elapsed milliseconds. With a nil logger the handler is returned
// unwrapped and nothing is logged.
func logMiddleware(logger *slog.Logger, next http.Handler) http.Handler {
	if logger == nil {
		return next
	}

	logged := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		// Start from 200: that is what the client receives when the handler
		// never calls WriteHeader explicitly.
		capture := &statusCapture{ResponseWriter: w, code: 200}

		next.ServeHTTP(capture, r)

		elapsed := time.Since(began).Milliseconds()
		logger.Info("http",
			"method", r.Method,
			"path", r.URL.Path,
			"status", capture.code,
			"ms", elapsed,
		)
	}
	return http.HandlerFunc(logged)
}

// statusCapture is an http.ResponseWriter wrapper that remembers the status
// code actually sent to the client, so it can be logged afterwards.
//
// code holds the recorded status; the creator is expected to initialise it to
// 200, the status net/http sends when a handler writes a body without calling
// WriteHeader. wrote latches once the final status is decided, so later
// (superfluous) WriteHeader calls cannot overwrite what the client received.
type statusCapture struct {
	http.ResponseWriter
	code  int
	wrote bool
}

// WriteHeader records code the first time a final (non-1xx) status is written
// and always forwards the call to the wrapped writer. Informational 1xx
// statuses are forwarded but not recorded, since a final status still follows.
func (s *statusCapture) WriteHeader(code int) {
	if !s.wrote && code >= 200 {
		s.code = code
		s.wrote = true
	}
	s.ResponseWriter.WriteHeader(code)
}

// Write forwards p to the wrapped writer. A first Write without a preceding
// WriteHeader commits the implicit 200 status, so the recorded code is
// latched here as well.
func (s *statusCapture) Write(p []byte) (int, error) {
	if !s.wrote {
		if s.code == 0 {
			s.code = http.StatusOK
		}
		s.wrote = true
	}
	return s.ResponseWriter.Write(p)
}

// Unwrap exposes the wrapped writer so http.ResponseController can reach
// optional capabilities (such as flushing) that this wrapper does not
// re-export.
func (s *statusCapture) Unwrap() http.ResponseWriter {
	return s.ResponseWriter
}