Merge branch 'mai/artemis/issue-10-anti-ai-lint': Anti-AI-Lint im Build (#10)

2026-04-30 02:53:39 +02:00
parent d3a2bdce97 fdac496a6f
commit b12352473c
7 changed files with 547 additions and 6 deletions
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 FROM alpine:3.21 AS builder
-RUN apk add --no-cache bash yq coreutils findutils
+RUN apk add --no-cache bash yq coreutils findutils python3
 WORKDIR /src
 COPY . .
--- a/README.md
+++ b/README.md
@@ -31,10 +31,32 @@ build/           # Generated output (gitignored)
 ### Build
 ```bash
-./build.sh
+./build.sh              # build + anti-AI text lint
 ./build.sh --skip-lint  # build only (emergencies)
 ```
-Requires `yq` for YAML parsing. Outputs to `build/` directory.
+Requires `yq` for YAML parsing and `python3` for the lint step. Outputs to `build/`.
 ### Anti-AI text lint
 Every build runs `tools/anti-ai-lint.py` against `build/<domain>/index.html`,
 flagging text fingerprints typical of LLM-generated content (vocab and structure
 patterns from `tools/anti-ai-blacklist.yaml`). Severity `warn` prints a message;
 `fail` aborts the build.
 Whitelist a hit:
 - HTML comment in the affected page:
  `<!-- anti-ai-allow: revolutionär, em-dash-3-bullet -->`
 - Per-site override in `site.yaml`:
  ```yaml
  anti_ai_allow:
    - revolutionär
    - em-dash-3-bullet
  ```
 The blacklist source is `docs/geo-seo-guideline.md` §3.6. Test the linter with
 `tools/test-anti-ai-lint.sh`.
 ### Deploy
--- a/build.sh
+++ b/build.sh
@@ -5,6 +5,19 @@ set -euo pipefail
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 BUILD_DIR="$SCRIPT_DIR/build"
 skip_lint=0
 for arg in "$@"; do
    case "$arg" in
        --skip-lint) skip_lint=1 ;;
        -h|--help)
            echo "Usage: $0 [--skip-lint]"
            echo "  --skip-lint   Skip the anti-AI text lint step (emergencies only)."
            exit 0
            ;;
        *) echo "Unknown argument: $arg" >&2; exit 2 ;;
    esac
 done
 echo "=== Onepager Build ==="
 # Clean build directory
@@ -50,6 +63,14 @@ echo "[3/3] Copying shared assets..."
 cp -r "$SCRIPT_DIR/shared" "$BUILD_DIR/shared"
 echo "  -> shared/ copied"
-# 4. Report
+# 4. Anti-AI text lint
-echo "[4/4] Build complete"
+if [ "$skip_lint" -eq 1 ]; then
    echo "[4/4] Anti-AI lint skipped (--skip-lint)"
 elif ! command -v python3 >/dev/null 2>&1; then
    echo "[4/4] python3 not found — skipping anti-AI lint"
 else
    echo "[4/4] Anti-AI text lint..."
    python3 "$SCRIPT_DIR/tools/anti-ai-lint.py" "$BUILD_DIR"
 fi
 echo "=== Build complete: $count sites ==="
--- a/docs/geo-seo-guideline.md
+++ b/docs/geo-seo-guideline.md
@@ -248,7 +248,7 @@ Mehrere dieser Wörter im selben Absatz sind das stärkste Tell.
 **Praktische Umsetzung:**
- Lint-Skript im Build (`scripts/anti-ai-lint.sh`) das Vokabel-Blacklist über alle `index.html` und `*.md` läuft, mit Schwellenwert (z. B. mehr als 3 Marker pro 500 Wörter → Warnung).
+- Lint-Skript im Build: für onepager implementiert in `tools/anti-ai-lint.py` mit Vokabel-Blacklist in `tools/anti-ai-blacklist.yaml`. Läuft am Ende von `build.sh` über alle `build/<domain>/index.html`. Severity `warn` (Build geht durch) vs. `fail` (Build bricht ab). Whitelist via `<!-- anti-ai-allow: term -->`-Kommentar oder `anti_ai_allow:`-Liste in `site.yaml`. Notfall-Override: `build.sh --skip-lint`.
 - Bei AI-generierten Drafts: bewusst gegen die Blacklist redigieren.
 - Vor Veröffentlichung laut lesen. Wenn es klingt wie ein Pressemitteilungs-Bot, ist es einer.
--- a/tools/anti-ai-blacklist.yaml
+++ b/tools/anti-ai-blacklist.yaml
@@ -0,0 +1,97 @@
 # Anti-AI lint rules: textual fingerprints typical of LLM-generated content.
 #
 # Severity:
 #   warn — build proceeds, message printed
 #   fail — build aborts (exit 1) unless build.sh --skip-lint
 #
 # Whitelisting matches:
 #   In an HTML file:        <!-- anti-ai-allow: term -->
 #                            <!-- anti-ai-allow: term1, term2 -->
 #   Per site (site.yaml):    anti_ai_allow:
 #                              - leverage
 #                              - em-dash-3-bullet
 #
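 # Vocab matches are case-insensitive substring matches against the visible
 # text of the rendered HTML (script/style/comments stripped). Pattern matches
 # are regex (Python re), case-insensitive by default, against the same.
 # Both are applied to one text node at a time, so a phrase that is split by
 # inline markup (e.g. <em>) is not detected. Each term or pattern is reported
 # at most once per file.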
 #
 # Source: docs/geo-seo-guideline.md §3.6 (Wikipedia AI-content signals).
 # Phrase lists keyed by language, then by severity. The linter checks every
 # page against both the de and the en bucket, whatever the page's own language.
 vocab:
  # German phrases.
  de:
    # Reported as warnings; the build continues.
    warn:
      - "nahtlos"
      - "robust"
      - "umfassend"
      - "ganzheitlich"
      - "fungiert als"
      - "dient als Brücke"
      - "Symbiose"
      - "im Bereich der"
      - "in der heutigen schnelllebigen"
      - "ein Meilenstein"
      - "ein Beweis für"
      - "hat Spuren hinterlassen"
      - "Es ist wichtig zu erwähnen"
      - "Es ist wichtig zu beachten"
      - "bahnbrechend"
      - "revolutionär"
    # Any hit makes the linter exit 1.
    fail:
      - "in der sich entwickelnden Landschaft"
      - "Herausforderungen und Zukunftsaussichten"
      - "Herausforderungen und Perspektiven"
  # English phrases.
  en:
    # Reported as warnings; the build continues.
    warn:
      - "delve"
      - "tapestry"
      - "testament"
      - "intricate"
      - "garnered"
      - "bolstered"
      - "enduring"
      # Also listed under de; the linter reports a term only once per file.
      - "robust"
      - "comprehensive"
      - "meticulous"
      - "interplay"
      - "pivotal"
      - "underscore"
      - "moreover"
      - "furthermore"
      - "additionally"
      - "crucial"
      - "showcasing"
      - "highlighting"
      - "leverage"
      - "streamline"
      - "holistic"
      - "seamless"
      - "unleash"
      - "ecosystem"
      - "in the realm of"
      - "dive into"
      - "It's important to note that"
      - "It is important to note that"
      - "In this article, we'll"
    # Any hit makes the linter exit 1.
    fail:
      - "in today's evolving landscape"
      - "in the ever-evolving landscape"
      - "Challenges and Future Prospects"
 # Regex rules (Python re). Each entry needs a name (used for whitelisting and
 # in reports), a regex and a severity (warn or fail). Matching ignores case
 # unless the entry sets case_sensitive to true.
 patterns:
  - name: em-dash-3-bullet
    description: |
      Three "Word: text — Word: text — Word: …" segments in one block.
      Classic AI bullet pattern.
    regex: '(\w[\w\s]{0,30}:\s+[^—\n]{2,80}—\s*){2,}\w[\w\s]{0,30}:'
    severity: warn
  - name: not-only-but-also
    description: '"not only X, but also Y" / "nicht nur X, sondern auch Y" filler.'
    regex: '\b(?:not only|nicht nur)\b[^.,;\n]{1,80}\b(?:but also|sondern auch)\b'
    severity: warn
  - name: as-an-ai
    description: Leftover AI self-disclosure.
    # The optional German article ("ein"/"eine") must consume its trailing
    # whitespace, otherwise "als ein Sprachmodell" is never matched.
    regex: '\b(?:as an? (?:AI|language model)|als (?:eine?\s+)?(?:KI|Sprachmodell))\b'
    severity: fail
--- a/tools/anti-ai-lint.py
+++ b/tools/anti-ai-lint.py
@@ -0,0 +1,294 @@
 #!/usr/bin/env python3
 """anti-ai-lint — flag AI-text fingerprints in built sites.
 Reads tools/anti-ai-blacklist.yaml, walks build/<domain>/index.html, prints
 findings. Exits 1 if any finding has severity=fail, else 0.
 Usage:
    tools/anti-ai-lint.py [--blacklist PATH] [--sources sites/] [--quiet]
                          [--json] [BUILD_DIR]
 """
 import argparse
 import json
 import os
 import re
 import subprocess
 import sys
 from html.parser import HTMLParser
 def _ansi(code: str) -> str:
    return code if sys.stdout.isatty() else ""
 RED = _ansi("\033[31m")
 YELLOW = _ansi("\033[33m")
 GREEN = _ansi("\033[32m")
 DIM = _ansi("\033[2m")
 BOLD = _ansi("\033[1m")
 RESET = _ansi("\033[0m")
 class TextExtractor(HTMLParser):
    """Collect visible text and ``anti-ai-allow`` directives from an HTML page.
    After feeding a document:
      fragments  (line, text) for every non-blank text node outside SKIP_TAGS
      allows     entries from ``<!-- anti-ai-allow: ... -->`` comments, each
                 stored as written and lower-cased
      html_lang  primary subtag of ``<html lang="...">``, lower-cased, or None
    """
    # Text inside these elements is not visible copy and is never linted.
    SKIP_TAGS = {"script", "style", "noscript", "template"}
    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # Nesting depth of currently open SKIP_TAGS elements.
        self.skip_depth = 0
        self.fragments: list[tuple[int, str]] = []
        self.allows: set[str] = set()
        self.html_lang: str | None = None
    def handle_starttag(self, tag: str, attrs) -> None:
        """Record the page language from the first ``<html>`` tag and track skipped elements."""
        if tag == "html" and self.html_lang is None:
            for k, v in attrs:
                if k == "lang" and v:
                    # "de-DE" -> "de"
                    self.html_lang = v.lower().split("-")[0]
                    break
        if tag in self.SKIP_TAGS:
            self.skip_depth += 1
    def handle_startendtag(self, tag: str, attrs) -> None:
        """Ignore self-closing tags so they never change the skip depth."""
        # Self-closing — never enters skip depth, no data either.
        pass
    def handle_endtag(self, tag: str) -> None:
        """Leave a skipped element; stray closing tags cannot push the depth below zero."""
        if tag in self.SKIP_TAGS and self.skip_depth > 0:
            self.skip_depth -= 1
    def handle_data(self, data: str) -> None:
        """Store a non-blank text node together with the line it starts on."""
        if self.skip_depth == 0 and data.strip():
            line, _ = self.getpos()
            self.fragments.append((line, data))
    def handle_comment(self, data: str) -> None:
        """Parse ``anti-ai-allow: a, b c, d`` into allow-list entries.
        Entries are separated by commas. Each entry is stored as a whole phrase
        (inner whitespace collapsed to single spaces) so that multi-word
        blacklist terms such as "fungiert als" can be whitelisted. For
        compatibility with space-separated lists, every individual word of an
        entry is stored as well.
        """
        m = re.search(r"anti-ai-allow\s*:\s*(.+)", data, re.IGNORECASE)
        if not m:
            return
        for entry in m.group(1).split(","):
            phrase = " ".join(entry.split())
            if not phrase:
                continue
            # The phrase plus its single words; for a one-word entry these coincide.
            for item in (phrase, *phrase.split()):
                self.allows.add(item)
                self.allows.add(item.lower())
 def load_blacklist(path: str) -> dict:
    """Load the YAML blacklist by converting it to JSON with ``yq``.
    Args:
        path: location of the blacklist YAML file.
    Returns:
        The parsed top-level mapping; an empty dict when the file holds no
        document (yq prints nothing or ``null``).
    Exits the process with an error message when yq is missing, fails to
    parse the file, emits invalid JSON, or the top level is not a mapping.
    """
    try:
        out = subprocess.check_output(
            ["yq", "-o=json", path],
            stderr=subprocess.PIPE,
            text=True,
        )
    except FileNotFoundError:
        sys.exit("ERROR: yq not found in PATH (required to parse YAML blacklist)")
    except subprocess.CalledProcessError as e:
        sys.exit(f"ERROR: yq failed to parse {path}: {(e.stderr or '').strip()}")
    # An empty blacklist file is valid configuration: nothing to flag.
    if not out.strip():
        return {}
    try:
        data = json.loads(out)
    except json.JSONDecodeError as e:
        sys.exit(f"ERROR: yq produced invalid JSON for {path}: {e}")
    if data is None:
        return {}
    if not isinstance(data, dict):
        sys.exit(f"ERROR: blacklist {path} must be a mapping at the top level")
    return data
 def site_allow_yaml(site_yaml: str) -> list[str]:
    """Return the ``anti_ai_allow`` entries of a site.yaml, one per element.
    A missing file, or a yq run that exits non-zero, yields an empty list.
    Blank output lines are dropped and entries are stripped of surrounding
    whitespace.
    """
    if os.path.isfile(site_yaml):
        query = "(.anti_ai_allow // []) | .[]"
        try:
            listing = subprocess.check_output(
                ["yq", "-r", query, site_yaml],
                stderr=subprocess.DEVNULL,
                text=True,
            )
        except subprocess.CalledProcessError:
            # Best effort: an unreadable site.yaml simply contributes no allows.
            listing = ""
        stripped = (row.strip() for row in listing.splitlines())
        return [entry for entry in stripped if entry]
    return []
 def lint_file(html_path: str, blacklist: dict, extra_allows: list[str]) -> list[dict]:
    """Lint one rendered HTML file against the blacklist.
    Args:
        html_path: file to read (decoded as UTF-8, undecodable bytes replaced).
        blacklist: parsed blacklist with optional "vocab" and "patterns" keys.
        extra_allows: additional allow-list entries, e.g. from site.yaml.
    Returns:
        A list of finding dicts with "kind", "name", "severity", "line" and
        "snippet" (vocab findings also carry "lang"). Each vocab term and each
        pattern is reported at most once per file. Configuration problems
        (invalid or missing regex) and HTML parse errors are reported as
        warn-level findings with line 0.
    """
    with open(html_path, "r", encoding="utf-8", errors="replace") as f:
        raw = f.read()
    parser = TextExtractor()
    try:
        parser.feed(raw)
        parser.close()
    except Exception as e:
        # Malformed HTML — record a single warning and skip.
        return [{
            "kind": "parse",
            "name": "html-parse-error",
            "severity": "warn",
            "line": 0,
            "snippet": str(e)[:120],
        }]
    # Allow entries are kept as written and lower-cased, so lookups below are
    # effectively case-insensitive.
    allow_set = set(parser.allows)
    for tok in extra_allows:
        allow_set.add(tok)
        allow_set.add(tok.lower())
    findings: list[dict] = []
    seen: set[tuple[str, str]] = set()  # (kind, name) — one report per file
    # Lint vocab in BOTH languages — sites may carry data-en attributes
    # that surface translated text alongside the primary language.
    vocab = blacklist.get("vocab") or {}
    for lang in ("de", "en"):
        bucket = vocab.get(lang) or {}
        for severity in ("warn", "fail"):
            for term in bucket.get(severity) or []:
                term_lc = term.lower()
                key = ("vocab", term_lc)
                if key in seen:
                    continue
                if term in allow_set or term_lc in allow_set:
                    continue
                # Substring match per text node; the first hit is reported.
                for line_no, frag in parser.fragments:
                    if term_lc in frag.lower():
                        findings.append({
                            "kind": "vocab",
                            "lang": lang,
                            "name": term,
                            "severity": severity,
                            "line": line_no,
                            "snippet": frag.strip()[:120],
                        })
                        seen.add(key)
                        break
    # Patterns
    for pat in blacklist.get("patterns") or []:
        regex = pat.get("regex")
        # Unnamed patterns are identified by the start of their regex.
        name = pat.get("name") or (regex if isinstance(regex, str) else "")[:40]
        key = ("pattern", name)
        if key in seen:
            continue
        if name in allow_set or name.lower() in allow_set:
            continue
        if not isinstance(regex, str):
            # Report a broken entry like an invalid regex instead of crashing
            # the whole lint run with KeyError/TypeError.
            findings.append({
                "kind": "config",
                "name": name,
                "severity": "warn",
                "line": 0,
                "snippet": "missing or non-string regex",
            })
            continue
        flags = re.MULTILINE
        if not pat.get("case_sensitive"):
            flags |= re.IGNORECASE
        try:
            rx = re.compile(regex, flags)
        except re.error as e:
            findings.append({
                "kind": "config",
                "name": name,
                "severity": "warn",
                "line": 0,
                "snippet": f"invalid regex: {e}",
            })
            continue
        for line_no, frag in parser.fragments:
            m = rx.search(frag)
            if m:
                findings.append({
                    "kind": "pattern",
                    "name": name,
                    "severity": pat.get("severity", "warn"),
                    "line": line_no,
                    "snippet": (frag.strip()[:120] or m.group(0)[:120]),
                })
                seen.add(key)
                break
    return findings
 def main() -> int:
    """Command-line entry point: lint every ``<build_dir>/<site>/index.html``.
    Returns 2 when the build directory or the blacklist file is missing,
    1 when at least one fail-level finding was reported, otherwise 0.
    """
    tools_dir = os.path.dirname(os.path.abspath(__file__))
    repo_root = os.path.dirname(tools_dir)
    cli = argparse.ArgumentParser(description="Flag AI-text fingerprints in built sites.")
    cli.add_argument("build_dir", nargs="?", default=os.path.join(repo_root, "build"))
    cli.add_argument("--blacklist", default=os.path.join(tools_dir, "anti-ai-blacklist.yaml"))
    cli.add_argument("--sources", default=os.path.join(repo_root, "sites"),
                     help="sites/ root (for per-site site.yaml allow lists)")
    cli.add_argument("--quiet", action="store_true",
                     help="Suppress warnings; only show fails.")
    cli.add_argument("--json", action="store_true", help="Emit JSON report.")
    opts = cli.parse_args()
    if not os.path.isdir(opts.build_dir):
        print(f"ERROR: build dir not found: {opts.build_dir}", file=sys.stderr)
        return 2
    if not os.path.isfile(opts.blacklist):
        print(f"ERROR: blacklist not found: {opts.blacklist}", file=sys.stderr)
        return 2
    blacklist = load_blacklist(opts.blacklist)
    n_warn = 0
    n_fail = 0
    n_flagged = 0
    n_sites = 0
    report_sites: list[dict] = []
    for site in sorted(os.listdir(opts.build_dir)):
        page = os.path.join(opts.build_dir, site, "index.html")
        # Only directories that contain an index.html count as sites.
        if not os.path.isfile(page):
            continue
        n_sites += 1
        site_allows = site_allow_yaml(os.path.join(opts.sources, site, "site.yaml"))
        findings = lint_file(page, blacklist, site_allows)
        if not findings:
            continue
        fail_items = [item for item in findings if item["severity"] == "fail"]
        n_flagged += 1
        n_warn += sum(1 for item in findings if item["severity"] == "warn")
        n_fail += len(fail_items)
        if opts.json:
            report_sites.append({"site": site, "findings": findings})
            continue
        shown = fail_items if opts.quiet else findings
        if not shown:
            continue
        print(f"{BOLD}{site}{RESET}")
        for f in shown:
            # Anything that is not a fail is rendered with the warn tag.
            is_fail = f["severity"] == "fail"
            color = RED if is_fail else YELLOW
            tag = "FAIL" if is_fail else "warn"
            lang = f" ({f['lang']})" if "lang" in f else ""
            print(
                f"  {color}{tag}{RESET} {f['kind']}{lang}: "
                f"{BOLD}{f['name']}{RESET}  "
                f"{DIM}line {f['line']}: {f['snippet']}{RESET}"
            )
    if opts.json:
        summary = {
            "sites_total": n_sites,
            "sites_with_findings": n_flagged,
            "warn": n_warn,
            "fail": n_fail,
        }
        json.dump(
            {"summary": summary, "sites": report_sites},
            sys.stdout,
            indent=2,
            ensure_ascii=False,
        )
        print()
    else:
        if n_fail > 0:
            tag, color = "FAIL", RED
        elif n_warn > 0:
            tag, color = "WARN", YELLOW
        else:
            tag, color = "OK", GREEN
        print(
            f"\n{color}anti-ai-lint: {tag}{RESET} — "
            f"{n_flagged}/{n_sites} sites flagged "
            f"({n_fail} fail, {n_warn} warn)"
        )
    if n_fail > 0:
        return 1
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/tools/test-anti-ai-lint.sh
+++ b/tools/test-anti-ai-lint.sh
@@ -0,0 +1,107 @@
 #!/bin/bash
 # Self-test for tools/anti-ai-lint.py.
 # Builds a synthetic AI-text fixture in a temp dir, asserts the linter
 # flags it, then verifies whitelist comments suppress the hit.
 # Abort on errors, unset variables and failed pipeline stages.
 set -euo pipefail
 # The linter under test lives next to this script.
 SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
 LINT="$SCRIPT_DIR/anti-ai-lint.py"
 # Scratch build tree; removed again when the script exits.
 tmp=$(mktemp -d)
 trap 'rm -rf "$tmp"' EXIT
 # Synthetic site: fail-level phrases in <h1>/<h2>, warn-level vocab in the
 # paragraphs, the em-dash bullet pattern in the <li>, and "leverage" inside
 # <style>/<script>, where it must not be reported. The quoted 'HTML'
 # delimiter keeps the shell from expanding anything in the fixture.
 # Step [2] below inserts a line after line 4 of this file (the <title> line),
 # so keep the first four lines in place.
 mkdir -p "$tmp/build/synthetic-ai.test"
 cat > "$tmp/build/synthetic-ai.test/index.html" <<'HTML'
 <!DOCTYPE html>
 <html lang="en">
 <head>
  <title>Synthetic AI sample</title>
  <style>.foo { color: red; } /* leverage in CSS comment must be ignored */</style>
  <script>const x = "leverage"; // in JS, must be ignored</script>
 </head>
 <body>
  <h1>In today's evolving landscape</h1>
  <p>This is a comprehensive, robust, holistic solution that lets us leverage emerging trends.</p>
  <p>We delve into the intricate tapestry of AI to navigate this pivotal moment.</p>
  <h2>Challenges and Future Prospects</h2>
  <ul>
    <li>Effizienz: hoch — Skalierbarkeit: gut — Sicherheit: solide</li>
  </ul>
 </body>
 </html>
 HTML
 # Assert that a JSON report contains exactly one finding with the given name.
 #   $1  JSON report as printed by the linter with --json
 #   $2  finding name (vocab term or pattern name)
 # Returns non-zero and prints a message to stderr when the number of matching
 # findings, summed over all sites, is not exactly 1.
 expect_finding() {
    # expect_finding <json> <name>
    python3 -c '
 import json, sys
 data = json.loads(sys.argv[1])
 target = sys.argv[2]
 hits = [f for site in data["sites"] for f in site["findings"] if f["name"] == target]
 if len(hits) != 1:
    print(f"expected exactly 1 finding for {target!r}, got {len(hits)}", file=sys.stderr)
    sys.exit(1)
 ' "$1" "$2"
 }
 # Assert that a JSON report contains no finding with the given name.
 #   $1  JSON report as printed by the linter with --json
 #   $2  finding name (vocab term or pattern name)
 # Returns non-zero and prints the offending findings to stderr otherwise.
 expect_no_finding() {
    # expect_no_finding <json> <name>
    python3 -c '
 import json, sys
 data = json.loads(sys.argv[1])
 target = sys.argv[2]
 hits = [f for site in data["sites"] for f in site["findings"] if f["name"] == target]
 if hits:
    print(f"unexpected finding for {target!r}: {hits}", file=sys.stderr)
    sys.exit(1)
 ' "$1" "$2"
 }
 # Step 1: the untouched fixture must produce exit code 1 and one finding for
 # each expected term. "&& rc=0 || rc=$?" captures the exit status without
 # letting "set -e" abort the script on the expected non-zero exit.
 echo "[1] expecting FAIL on synthetic AI fixture..."
 report=$(python3 "$LINT" --json "$tmp/build" 2>/dev/null) && rc=0 || rc=$?
 if [ "$rc" -ne 1 ]; then
    echo "FAIL: expected exit 1, got $rc" >&2
    echo "$report" >&2
    exit 1
 fi
 for term in "in today's evolving landscape" "Challenges and Future Prospects" \
            "leverage" "comprehensive" "delve" "em-dash-3-bullet"; do
    expect_finding "$report" "$term" || exit 1
 done
 echo "  OK"
 # Step 2: insert an allow comment after line 4 of the fixture (the <title>
 # line) and check that the listed warn-level hits disappear.
 # NOTE(review): "sed -i" without a backup suffix and the one-line "4a\text"
 # form look GNU/BusyBox-specific; confirm before running this on BSD/macOS sed.
 echo "[2] expecting whitelist comment to suppress hits..."
 sed -i '4a\  <!-- anti-ai-allow: leverage, comprehensive, delve, em-dash-3-bullet -->' \
    "$tmp/build/synthetic-ai.test/index.html"
 # The fail-level phrases are still present, so a non-zero exit is expected here.
 report=$(python3 "$LINT" --json "$tmp/build" 2>/dev/null) || true
 for term in "leverage" "comprehensive" "delve" "em-dash-3-bullet"; do
    expect_no_finding "$report" "$term" || exit 1
 done
 # fail-level "in today's evolving landscape" should still be reported
 expect_finding "$report" "in today's evolving landscape" || exit 1
 echo "  OK"
 # Step 3: same fixture through the plain-text output path; exit code must be 1.
 echo "[3] expecting fail-level hit still triggers exit 1..."
 python3 "$LINT" "$tmp/build" >/dev/null 2>&1 && rc=0 || rc=$?
 if [ "$rc" -ne 1 ]; then
    echo "FAIL: expected exit 1, got $rc" >&2
    exit 1
 fi
 echo "  OK"
 # Step 4: replace the fixture with a page free of blacklist terms; exit code
 # must be 0. The fixture file is removed first and its whole directory a few
 # lines later, so the first rm is redundant with the rm -rf.
 echo "[4] expecting clean exit on neutral fixture..."
 rm "$tmp/build/synthetic-ai.test/index.html"
 mkdir -p "$tmp/build/clean.test"
 echo '<!DOCTYPE html><html lang="de"><body><p>Ein einfacher Satz ohne KI-Vokabular.</p></body></html>' \
    > "$tmp/build/clean.test/index.html"
 rm -rf "$tmp/build/synthetic-ai.test"
 out=$(python3 "$LINT" "$tmp/build" 2>&1) && rc=0 || rc=$?
 if [ "$rc" -ne 0 ]; then
    echo "FAIL: clean fixture should exit 0, got $rc" >&2
    echo "$out"
    exit 1
 fi
 echo "  OK"
 echo
 echo "all tests passed"