# paliad/.gitea/workflows/test.yaml

# Paliad CI gate (t-paliad-282 / m/paliad#114).
#
# Single workflow, two purposes:
#
#   - On every push to main or mai/**, and on every PR targeting main:
#     gate tier — build + unit + migration smoke. Red gate means no
#     further work and (on main) no deploy.
#   - On push to main with gate green: deploy step — calls the Dokploy
#     compose-deploy API for paliad's compose Zx147ycurfYagKRl_Zzyo, then
#     polls /health/ready until the new container reports 200.
#
# The deploy step REPLACES the previous Gitea-push → Dokploy webhook path
# (per m's Q11.4 pick: soft-launch with both alive for ~1 week, then
# disable the Dokploy auto-deploy toggle). Soft-launch leaves Dokploy's
# autoDeploy=true intact today — the workflow's deploy step is additive
# and idempotent (Dokploy's deploy is itself idempotent).
#
# Catches the three failure classes from 2026-05-25:
#
#   - brunel slot collision (~13:20) — TestMigrations_NoDuplicateSlot,
#     pure unit, no DB needed.
#   - hermes dropped-col refs (~16:05) — TestBootSmoke, applies all NEW
#     migrations (those not in the snapshot) end-to-end against a
#     scratch DB restored from internal/db/testdata/prod-snapshot.sql.
#   - mig 129 42501 ownership (~14:56→) — TestMigrations_EndToEndAsAppRole,
#     applies new migrations as the prod-shaped `postgres` role (which
#     is NOT a superuser on supabase/postgres — same shape as
#     youpc-supabase prod, see internal/db/testdata/README.md).
#
# Snapshot approach: dump paliad schema + applied_migrations rows from
# prod, commit them. CI restores → ApplyMigrations sees existing migs as
# applied, only runs NEW migs (the ones this PR adds). This sidesteps the
# fresh-DB idempotence requirement on historical migrations (some of
# which use raw COMMIT or pre-installed extensions and can't be replayed
# from scratch). To refresh: `make refresh-snapshot`.
#
# Design: docs/design-cicd-pre-deploy-gate-2026-05-25.md (cronus inventor
# shift, t-paliad-282).

name: Paliad CI gate

# Triggers: pushes to main and to any mai/** branch, plus pull requests
# whose base branch is main.
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
      - "mai/**"

# Toolchain versions shared across jobs. Kept quoted so YAML treats them
# as strings rather than floats.
env:
  BUN_VERSION: "1.2"
  GO_VERSION: "1.24"

jobs:
  # Gate job 1 — compile-only check. Runs `go build`, `go vet` and the
  # frontend `bun run build`, so breakage that a local build would have
  # caught (had the worker not skipped it before pushing) shows up here.
  # Fast (~60 s), so a red result surfaces immediately.
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Go toolchain, pinned through the workflow-level GO_VERSION.
      - uses: actions/setup-go@v5
        name: Set up Go
        with:
          cache: true
          go-version: ${{ env.GO_VERSION }}

      # Compile every package in the module.
      - run: go build ./...
        name: go build

      # Static checks over the same package set.
      - run: go vet ./...
        name: go vet

      # Bun toolchain, pinned through the workflow-level BUN_VERSION.
      - uses: oven-sh/setup-bun@v2
        name: Set up Bun
        with:
          bun-version: ${{ env.BUN_VERSION }}

      # Frontend: install exactly what the lockfile says, then build.
      - name: bun install + build
        run: |
          bun install --frozen-lockfile
          bun run build
        working-directory: frontend

  # Gate job 2 — Go test suite + migration smoke against snapshot-restored
  # scratch DB.
  #
  # The Postgres service container uses the same supabase/postgres image
  # as youpc-supabase prod. The CI scratch DB starts empty; a setup step
  # installs pg_trgm + restores the snapshot. After restore, paliad
  # schema is at HEAD-of-snapshot and applied_migrations covers every
  # migration up to (and including) the snapshot's max version.
  #
  # ApplyMigrations called in TestBootSmoke / TestMigrations_EndToEndAsAppRole
  # sees the snapshot's applied set, finds whatever NEW migrations this
  # PR added on top, and applies only those. The role-split smoke runs as
  # `postgres` (which is NOT a superuser on supabase/postgres, matching
  # the prod role topology) — any new migration that needs supabase_admin
  # privilege fails here as it would in prod.
  test-go:
    runs-on: ubuntu-latest

    services:
      # supabase/postgres baked-in auth schema + supabase role topology
      # matches youpc-supabase prod. `postgres` here is NOT a superuser
      # (verified live: \du postgres shows "Create role, Create DB,
      # Replication, Bypass RLS" — no Superuser). This is the prod-shaped
      # role the deploy uses.
      #
      # NOTE(review): every step below reaches this service via
      # `localhost:5432`. That relies on the job sharing the host network
      # with the published port; if the runner executes jobs inside their
      # own container network, the hostname would be `postgres` instead —
      # confirm against the runner configuration.
      postgres:
        image: supabase/postgres:15.8.1.060
        env:
          POSTGRES_PASSWORD: ci
          POSTGRES_DB: paliad_scratch
        ports:
          - "5432:5432"
        # The health probe goes over TCP (`-h localhost`) on purpose. The
        # image's entrypoint runs its init scripts against a temporary
        # server that only listens on the unix socket; a socket-based
        # pg_isready reports "ready" during that phase, before the real
        # server accepts the TCP connections the steps below make.
        options: >-
          --health-cmd "pg_isready -U postgres -h localhost"
          --health-interval 5s
          --health-timeout 5s
          --health-retries 30

    steps:
      - uses: actions/checkout@v4

      - name: Set up Go
        uses: actions/setup-go@v5
        with:
          go-version: ${{ env.GO_VERSION }}
          cache: true

      # psql is needed by the snapshot-restore step below. No sudo: the
      # command assumes the job already runs as root.
      - name: Install postgresql-client
        run: |
          apt-get update -qq && apt-get install -y -qq postgresql-client

      # Snapshot restore. Two prep steps as supabase_admin (the actual
      # superuser): GRANT CREATE so the `postgres` role can later create
      # schemas if a new mig needs it; install pg_trgm so the snapshot's
      # trigram indexes restore. Snapshot itself loads as `postgres`.
      # ON_ERROR_STOP=1 makes psql abort (non-zero exit) on the first
      # failing statement instead of continuing past it.
      - name: Provision + restore snapshot
        env:
          PGPASSWORD: ci
        run: |
          set -euo pipefail
          psql -h localhost -U supabase_admin -d paliad_scratch -v ON_ERROR_STOP=1 \
            -c "GRANT CREATE ON DATABASE paliad_scratch TO postgres;" \
            -c "CREATE EXTENSION IF NOT EXISTS pg_trgm;"
          psql -h localhost -U postgres -d paliad_scratch -v ON_ERROR_STOP=1 \
            -f internal/db/testdata/prod-snapshot.sql

      # Pre-flight: catches brunel slot collision in seconds, no DB
      # contact (still useful even though the test-go job has Postgres
      # running, because the failure mode is independent).
      - name: Migration coordination check
        run: go test -count=1 -run TestMigrations_NoDuplicateSlot ./internal/db/

      # Role-split end-to-end apply. Connects as `postgres` (NOT a
      # superuser on supabase/postgres) and runs ApplyMigrations against
      # the snapshot-restored DB. Existing migs are skipped (already in
      # applied_migrations); NEW migs in this PR apply here. If a new
      # migration assumes supabase_admin privilege, fails with the same
      # 42501 error class that took paliad.de offline on 2026-05-25.
      - name: Migration end-to-end (deploy role)
        env:
          TEST_APP_DATABASE_URL: postgres://postgres:ci@localhost:5432/paliad_scratch?sslmode=disable
        run: go test -count=1 -run TestMigrations_EndToEndAsAppRole ./internal/db/

      # Boot smoke. Confirms ApplyMigrations succeeds + applied set
      # matches on-disk set + /healthz returns 200 + /health/ready
      # returns 200 (the live-pool variant via TestHealthReady_Live).
      - name: Boot smoke + readiness
        env:
          TEST_DATABASE_URL: postgres://postgres:ci@localhost:5432/paliad_scratch?sslmode=disable
        run: go test -count=1 -run 'TestBootSmoke|TestHealthReady_Live' ./cmd/server/

      # Full Go test suite WITHOUT TEST_DATABASE_URL so live-DB service
      # tests skip (same shape as a developer laptop without a scratch
      # DB). Live-DB tests in internal/services/* will be activated by a
      # follow-up shift once the snapshot is verified stable across
      # multiple PRs — they need investigation against supabase/postgres
      # 15.8 (parameter type inference differs subtly from youpc-supabase).
      # Note the package set is ./internal/... and ./cmd/..., not the
      # whole module as the step name suggests.
      - name: go test ./... (pure + skip-on-no-DB)
        run: go test -count=1 ./internal/... ./cmd/...

  # Deploy job. Only runs on push to main and only after both gate jobs
  # are green. Calls Dokploy's compose.deploy with the paliad compose ID
  # (Zx147ycurfYagKRl_Zzyo) and polls /health/ready until it returns 200
  # or times out.
  #
  # Skipped on PR / feature branch pushes — those run the gate tier as
  # a status check but don't trigger a prod deploy. Dokploy's existing
  # autoDeploy=true webhook continues to fire during the soft-launch
  # window (per Q11.4); it can be disabled in the Dokploy UI once this
  # workflow has gated ≥5 successful green deploys.
  deploy:
    runs-on: ubuntu-latest
    needs: [build, test-go]
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'

    steps:
      # Fires the deploy. Fails the step when the secret is missing
      # (exit 2), when curl cannot complete the request, or when Dokploy
      # answers with a non-2xx status (exit 1) — without the status check
      # a rejected request (bad key, unknown compose ID, server error)
      # would leave this step green.
      - name: Trigger Dokploy compose deploy
        env:
          DOKPLOY_KEY: ${{ secrets.DOKPLOY_TOKEN }}
          DOKPLOY_API: http://100.99.98.201:3000/api/trpc
          COMPOSE_ID: Zx147ycurfYagKRl_Zzyo
        run: |
          set -euo pipefail
          if [ -z "${DOKPLOY_KEY:-}" ]; then
            echo "ERROR: DOKPLOY_TOKEN secret is not configured."
            echo "       Set the secret in Gitea repo settings before this step can deploy."
            exit 2
          fi
          echo "==> POST compose.deploy"
          # Body goes to a temp file and the status code to stdout, so the
          # response can be both logged and checked. Done by hand rather
          # than with --fail-with-body to stay compatible with older curl.
          response_file=$(mktemp)
          http_code=$(curl -sS --connect-timeout 5 --max-time 30 \
            -o "$response_file" -w '%{http_code}' \
            -X POST \
            -H "x-api-key: $DOKPLOY_KEY" \
            -H "Content-Type: application/json" \
            -d "{\"json\":{\"composeId\":\"$COMPOSE_ID\"}}" \
            "$DOKPLOY_API/compose.deploy")
          cat "$response_file"
          echo
          rm -f "$response_file"
          case "$http_code" in
            2??) ;;
            *)
              echo "ERROR: compose.deploy returned HTTP $http_code — deploy was not accepted."
              exit 1
              ;;
          esac

      # Polls the public readiness endpoint until it answers 200.
      #
      # NOTE(review): this only checks that *something* at paliad.de is
      # ready. If the previous container is still serving when polling
      # starts, the first poll can succeed before the new one is up —
      # confirm whether Dokploy's swap timing makes that possible, and
      # consider comparing a build identifier if it does.
      - name: Wait for /health/ready
        run: |
          set -euo pipefail
          echo "==> polling https://paliad.de/health/ready"
          # 60 polls, 5 s apart — paliad's cold-start is normally ≤30 s;
          # the longer budget covers slow image pulls + migration apply.
          for i in $(seq 1 60); do
            # On a transport failure curl has already printed "000" for
            # %{http_code} before exiting non-zero, so the fallback is
            # assigned outside the substitution rather than appended to it
            # (which would log "000000").
            status=$(curl -sS --connect-timeout 3 --max-time 5 \
              -o /dev/null -w '%{http_code}' \
              https://paliad.de/health/ready) || status="000"
            if [ "$status" = "200" ]; then
              echo "ready after ${i} poll(s)"
              exit 0
            fi
            echo "  [$i/60] status=$status — sleeping 5s"
            sleep 5
          done
          echo "ERROR: /health/ready did not return 200 within 5 minutes."
          echo "       The deploy fired but the new container is not serving."
          echo "       Investigate: ssh mlake 'docker logs --tail 50 compose-transmit-multi-byte-driver-v7jth9-web-1'"
          exit 1