diff --git a/Dockerfile b/Dockerfile index 18b1006..67f8ef8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM alpine:3.21 AS builder -RUN apk add --no-cache bash yq coreutils findutils +RUN apk add --no-cache bash yq coreutils findutils python3 WORKDIR /src COPY . . diff --git a/README.md b/README.md index ba03bcd..9b8857f 100644 --- a/README.md +++ b/README.md @@ -31,10 +31,32 @@ build/ # Generated output (gitignored) ### Build ```bash -./build.sh +./build.sh # build + anti-AI text lint +./build.sh --skip-lint # build only (emergencies) ``` -Requires `yq` for YAML parsing. Outputs to `build/` directory. +Requires `yq` for YAML parsing and `python3` for the lint step. Outputs to `build/`. + +### Anti-AI text lint + +Every build runs `tools/anti-ai-lint.py` against `build//index.html`, +flagging text fingerprints typical of LLM-generated content (vocab and structure +patterns from `tools/anti-ai-blacklist.yaml`). Severity `warn` prints a message; +`fail` aborts the build. + +Whitelist a hit: + +- HTML comment in the affected page: + `` +- Per-site override in `site.yaml`: + ```yaml + anti_ai_allow: + - revolutionär + - em-dash-3-bullet + ``` + +The blacklist source is `docs/geo-seo-guideline.md` §3.6. Test the linter with +`tools/test-anti-ai-lint.sh`. ### Deploy diff --git a/build.sh b/build.sh index c1d2bf4..d22dd6b 100755 --- a/build.sh +++ b/build.sh @@ -5,6 +5,19 @@ set -euo pipefail SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) BUILD_DIR="$SCRIPT_DIR/build" +skip_lint=0 +for arg in "$@"; do + case "$arg" in + --skip-lint) skip_lint=1 ;; + -h|--help) + echo "Usage: $0 [--skip-lint]" + echo " --skip-lint Skip the anti-AI text lint step (emergencies only)." + exit 0 + ;; + *) echo "Unknown argument: $arg" >&2; exit 2 ;; + esac +done + echo "=== Onepager Build ===" # Clean build directory @@ -50,6 +63,14 @@ echo "[3/3] Copying shared assets..." cp -r "$SCRIPT_DIR/shared" "$BUILD_DIR/shared" echo " -> shared/ copied" -# 4. 
Report -echo "[4/4] Build complete" +# 4. Anti-AI text lint +if [ "$skip_lint" -eq 1 ]; then + echo "[4/4] Anti-AI lint skipped (--skip-lint)" +elif ! command -v python3 >/dev/null 2>&1; then + echo "[4/4] python3 not found — skipping anti-AI lint" +else + echo "[4/4] Anti-AI text lint..." + python3 "$SCRIPT_DIR/tools/anti-ai-lint.py" "$BUILD_DIR" +fi + echo "=== Build complete: $count sites ===" diff --git a/docs/geo-seo-guideline.md b/docs/geo-seo-guideline.md index 56fe8e1..58ed01a 100644 --- a/docs/geo-seo-guideline.md +++ b/docs/geo-seo-guideline.md @@ -248,7 +248,7 @@ Mehrere dieser Wörter im selben Absatz sind das stärkste Tell. **Praktische Umsetzung:** -- Lint-Skript im Build (`scripts/anti-ai-lint.sh`) das Vokabel-Blacklist über alle `index.html` und `*.md` läuft, mit Schwellenwert (z. B. mehr als 3 Marker pro 500 Wörter → Warnung). +- Lint-Skript im Build: für onepager implementiert in `tools/anti-ai-lint.py` mit Vokabel-Blacklist in `tools/anti-ai-blacklist.yaml`. Läuft am Ende von `build.sh` über alle `build//index.html`. Severity `warn` (Build geht durch) vs. `fail` (Build bricht ab). Whitelist via ``-Kommentar oder `anti_ai_allow:`-Liste in `site.yaml`. Notfall-Override: `build.sh --skip-lint`. - Bei AI-generierten Drafts: bewusst gegen die Blacklist redigieren. - Vor Veröffentlichung laut lesen. Wenn es klingt wie ein Pressemitteilungs-Bot, ist es einer. diff --git a/tools/anti-ai-blacklist.yaml b/tools/anti-ai-blacklist.yaml new file mode 100644 index 0000000..6918fe0 --- /dev/null +++ b/tools/anti-ai-blacklist.yaml @@ -0,0 +1,97 @@ +# Anti-AI lint rules: textual fingerprints typical of LLM-generated content. 
+# +# Severity: +# warn — build proceeds, message printed +# fail — build aborts (exit 1) unless build.sh --skip-lint +# +# Whitelisting matches: +# In an HTML file: +# +# Per site (site.yaml): anti_ai_allow: +# - leverage +# - em-dash-3-bullet +# +# Vocab matches are case-insensitive substring matches against the visible +# text of the rendered HTML (script/style/comments stripped). Pattern matches +# are regex (Python re), case-insensitive by default, against the same. +# +# Source: docs/geo-seo-guideline.md §3.6 (Wikipedia AI-content signals). + +vocab: + de: + warn: + - "nahtlos" + - "robust" + - "umfassend" + - "ganzheitlich" + - "fungiert als" + - "dient als Brücke" + - "Symbiose" + - "im Bereich der" + - "in der heutigen schnelllebigen" + - "ein Meilenstein" + - "ein Beweis für" + - "hat Spuren hinterlassen" + - "Es ist wichtig zu erwähnen" + - "Es ist wichtig zu beachten" + - "bahnbrechend" + - "revolutionär" + fail: + - "in der sich entwickelnden Landschaft" + - "Herausforderungen und Zukunftsaussichten" + - "Herausforderungen und Perspektiven" + + en: + warn: + - "delve" + - "tapestry" + - "testament" + - "intricate" + - "garnered" + - "bolstered" + - "enduring" + - "robust" + - "comprehensive" + - "meticulous" + - "interplay" + - "pivotal" + - "underscore" + - "moreover" + - "furthermore" + - "additionally" + - "crucial" + - "showcasing" + - "highlighting" + - "leverage" + - "streamline" + - "holistic" + - "seamless" + - "unleash" + - "ecosystem" + - "in the realm of" + - "dive into" + - "It's important to note that" + - "It is important to note that" + - "In this article, we'll" + fail: + - "in today's evolving landscape" + - "in the ever-evolving landscape" + - "Challenges and Future Prospects" + +patterns: + - name: em-dash-3-bullet + description: | + Three "Word: text — Word: text — Word: …" segments in one block. + Classic AI bullet pattern. 
+ regex: '(\w[\w\s]{0,30}:\s+[^—\n]{2,80}—\s*){2,}\w[\w\s]{0,30}:' + severity: warn + + - name: not-only-but-also + description: '"not only X, but also Y" / "nicht nur X, sondern auch Y" filler.' + regex: '\b(?:not only|nicht nur)\b[^.,;\n]{1,80}\b(?:but also|sondern auch)\b' + severity: warn + + - name: as-an-ai + description: Leftover AI self-disclosure. + regex: '\b(?:as an? (?:AI|language model)|als (?:eine?\s+)?(?:KI|Sprachmodell))\b' + severity: fail diff --git a/tools/anti-ai-lint.py b/tools/anti-ai-lint.py new file mode 100755 index 0000000..fa0b82a --- /dev/null +++ b/tools/anti-ai-lint.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +"""anti-ai-lint — flag AI-text fingerprints in built sites. + +Reads tools/anti-ai-blacklist.yaml, walks build/<site>/index.html, prints +findings. Exits 1 if any finding has severity=fail, else 0. + +Usage: + tools/anti-ai-lint.py [--blacklist PATH] [--sources sites/] [--quiet] + [--json] [BUILD_DIR] +""" + +import argparse +import json +import os +import re +import subprocess +import sys +from html.parser import HTMLParser + + +def _ansi(code: str) -> str: + return code if sys.stdout.isatty() else "" + + +RED = _ansi("\033[31m") +YELLOW = _ansi("\033[33m") +GREEN = _ansi("\033[32m") +DIM = _ansi("\033[2m") +BOLD = _ansi("\033[1m") +RESET = _ansi("\033[0m") + + +class TextExtractor(HTMLParser): + """Extract visible text and per-site allow directives.""" + + SKIP_TAGS = {"script", "style", "noscript", "template"} + + def __init__(self) -> None: + super().__init__(convert_charrefs=True) + self.skip_depth = 0 + self.fragments: list[tuple[int, str]] = [] + self.allows: set[str] = set() + self.html_lang: str | None = None + + def handle_starttag(self, tag: str, attrs) -> None: + if tag == "html" and self.html_lang is None: + for k, v in attrs: + if k == "lang" and v: + self.html_lang = v.lower().split("-")[0] + break + if tag in self.SKIP_TAGS: + self.skip_depth += 1 + + def handle_startendtag(self, tag: str, attrs) -> None: + # 
Self-closing — never enters skip depth, no data either. + pass + + def handle_endtag(self, tag: str) -> None: + if tag in self.SKIP_TAGS and self.skip_depth > 0: + self.skip_depth -= 1 + + def handle_data(self, data: str) -> None: + if self.skip_depth == 0 and data.strip(): + line, _ = self.getpos() + self.fragments.append((line, data)) + + def handle_comment(self, data: str) -> None: + m = re.search(r"anti-ai-allow\s*:\s*(.+)", data, re.IGNORECASE) + if m: + for token in re.split(r"[,\s]+", m.group(1)) + m.group(1).split(","): + token = token.strip() + if token: + self.allows.add(token) + self.allows.add(token.lower()) + + +def load_blacklist(path: str) -> dict: + """Convert YAML to JSON via yq, parse with stdlib json.""" + try: + out = subprocess.check_output( + ["yq", "-o=json", path], + stderr=subprocess.PIPE, + text=True, + ) + except FileNotFoundError: + sys.exit("ERROR: yq not found in PATH (required to parse YAML blacklist)") + except subprocess.CalledProcessError as e: + sys.exit(f"ERROR: yq failed to parse {path}: {e.stderr.strip()}") + return json.loads(out) or {} + + +def site_allow_yaml(site_yaml: str) -> list[str]: + if not os.path.isfile(site_yaml): + return [] + try: + out = subprocess.check_output( + ["yq", "-r", "(.anti_ai_allow // []) | .[]", site_yaml], + stderr=subprocess.DEVNULL, + text=True, + ) + except subprocess.CalledProcessError: + return [] + return [line.strip() for line in out.splitlines() if line.strip()] + + +def lint_file(html_path: str, blacklist: dict, extra_allows: list[str]) -> list[dict]: + with open(html_path, "r", encoding="utf-8", errors="replace") as f: + raw = f.read() + + parser = TextExtractor() + try: + parser.feed(raw) + parser.close() + except Exception as e: + # 
+ return [{ + "kind": "parse", + "name": "html-parse-error", + "severity": "warn", + "line": 0, + "snippet": str(e)[:120], + }] + + allow_set = set(parser.allows) + for tok in extra_allows: + allow_set.add(tok) + allow_set.add(tok.lower()) + + findings: list[dict] = [] + seen: set[tuple[str, str]] = set() # (kind, name) — one report per file + + # Lint vocab in BOTH languages — sites may carry data-en attributes + # that surface translated text alongside the primary language. + vocab = blacklist.get("vocab") or {} + for lang in ("de", "en"): + bucket = vocab.get(lang) or {} + for severity in ("warn", "fail"): + for term in bucket.get(severity) or []: + key = ("vocab", term.lower()) + if key in seen: + continue + if term in allow_set or term.lower() in allow_set: + continue + term_lc = term.lower() + for line_no, frag in parser.fragments: + if term_lc in frag.lower(): + findings.append({ + "kind": "vocab", + "lang": lang, + "name": term, + "severity": severity, + "line": line_no, + "snippet": frag.strip()[:120], + }) + seen.add(key) + break + + # Patterns + for pat in blacklist.get("patterns") or []: + name = pat.get("name") or pat.get("regex", "")[:40] + key = ("pattern", name) + if key in seen: + continue + if name in allow_set or name.lower() in allow_set: + continue + flags = re.MULTILINE + if not pat.get("case_sensitive"): + flags |= re.IGNORECASE + try: + rx = re.compile(pat["regex"], flags) + except re.error as e: + findings.append({ + "kind": "config", + "name": name, + "severity": "warn", + "line": 0, + "snippet": f"invalid regex: {e}", + }) + continue + for line_no, frag in parser.fragments: + m = rx.search(frag) + if m: + findings.append({ + "kind": "pattern", + "name": name, + "severity": pat.get("severity", "warn"), + "line": line_no, + "snippet": (frag.strip()[:120] or m.group(0)[:120]), + }) + seen.add(key) + break + + return findings + + +def main() -> int: + here = os.path.dirname(os.path.abspath(__file__)) + repo = os.path.dirname(here) + + ap = 
argparse.ArgumentParser(description="Flag AI-text fingerprints in built sites.") + ap.add_argument("build_dir", nargs="?", default=os.path.join(repo, "build")) + ap.add_argument("--blacklist", default=os.path.join(here, "anti-ai-blacklist.yaml")) + ap.add_argument("--sources", default=os.path.join(repo, "sites"), + help="sites/ root (for per-site site.yaml allow lists)") + ap.add_argument("--quiet", action="store_true", + help="Suppress warnings; only show fails.") + ap.add_argument("--json", action="store_true", help="Emit JSON report.") + args = ap.parse_args() + + if not os.path.isdir(args.build_dir): + print(f"ERROR: build dir not found: {args.build_dir}", file=sys.stderr) + return 2 + if not os.path.isfile(args.blacklist): + print(f"ERROR: blacklist not found: {args.blacklist}", file=sys.stderr) + return 2 + + blacklist = load_blacklist(args.blacklist) + + total_warn = 0 + total_fail = 0 + sites_with_findings = 0 + sites_total = 0 + json_sites: list[dict] = [] + + for entry in sorted(os.listdir(args.build_dir)): + site_dir = os.path.join(args.build_dir, entry) + html = os.path.join(site_dir, "index.html") + if not os.path.isfile(html): + continue + sites_total += 1 + + site_yaml = os.path.join(args.sources, entry, "site.yaml") + extra_allows = site_allow_yaml(site_yaml) + + findings = lint_file(html, blacklist, extra_allows) + warns = [f for f in findings if f["severity"] == "warn"] + fails = [f for f in findings if f["severity"] == "fail"] + + if findings: + sites_with_findings += 1 + total_warn += len(warns) + total_fail += len(fails) + if args.json: + json_sites.append({"site": entry, "findings": findings}) + else: + visible = fails if args.quiet else findings + if visible: + print(f"{BOLD}{entry}{RESET}") + for f in visible: + if f["severity"] == "fail": + color, tag = RED, "FAIL" + else: + color, tag = YELLOW, "warn" + lang = f" ({f['lang']})" if "lang" in f else "" + print( + f" {color}{tag}{RESET} {f['kind']}{lang}: " + f"{BOLD}{f['name']}{RESET} " + 
f"{DIM}line {f['line']}: {f['snippet']}{RESET}" + ) + + if args.json: + json.dump( + { + "summary": { + "sites_total": sites_total, + "sites_with_findings": sites_with_findings, + "warn": total_warn, + "fail": total_fail, + }, + "sites": json_sites, + }, + sys.stdout, + indent=2, + ensure_ascii=False, + ) + print() + else: + if total_fail > 0: + tag, color = "FAIL", RED + elif total_warn > 0: + tag, color = "WARN", YELLOW + else: + tag, color = "OK", GREEN + print( + f"\n{color}anti-ai-lint: {tag}{RESET} — " + f"{sites_with_findings}/{sites_total} sites flagged " + f"({total_fail} fail, {total_warn} warn)" + ) + + return 1 if total_fail > 0 else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tools/test-anti-ai-lint.sh b/tools/test-anti-ai-lint.sh new file mode 100755 index 0000000..e862487 --- /dev/null +++ b/tools/test-anti-ai-lint.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# Self-test for tools/anti-ai-lint.py. +# Builds a synthetic AI-text fixture in a temp dir, asserts the linter +# flags it, then verifies whitelist comments suppress the hit. +set -euo pipefail + +SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd) +LINT="$SCRIPT_DIR/anti-ai-lint.py" + +tmp=$(mktemp -d) +trap 'rm -rf "$tmp"' EXIT + +mkdir -p "$tmp/build/synthetic-ai.test" + +cat > "$tmp/build/synthetic-ai.test/index.html" <<'HTML' + + + + Synthetic AI sample + + + + +

In today's evolving landscape

+

This is a comprehensive, robust, holistic solution that lets us leverage emerging trends.

+

We delve into the intricate tapestry of AI to navigate this pivotal moment.

+

Challenges and Future Prospects

+
    +
  • Effizienz: hoch — Skalierbarkeit: gut — Sicherheit: solide
  • +
+ + +HTML + +expect_finding() { + # expect_finding <report-json> <finding-name> + python3 -c ' +import json, sys +data = json.loads(sys.argv[1]) +target = sys.argv[2] +hits = [f for site in data["sites"] for f in site["findings"] if f["name"] == target] +if len(hits) != 1: + print(f"expected exactly 1 finding for {target!r}, got {len(hits)}", file=sys.stderr) + sys.exit(1) +' "$1" "$2" +} + +expect_no_finding() { + python3 -c ' +import json, sys +data = json.loads(sys.argv[1]) +target = sys.argv[2] +hits = [f for site in data["sites"] for f in site["findings"] if f["name"] == target] +if hits: + print(f"unexpected finding for {target!r}: {hits}", file=sys.stderr) + sys.exit(1) +' "$1" "$2" +} + +echo "[1] expecting FAIL on synthetic AI fixture..." +report=$(python3 "$LINT" --json "$tmp/build" 2>/dev/null) && rc=0 || rc=$? +if [ "$rc" -ne 1 ]; then + echo "FAIL: expected exit 1, got $rc" >&2 + echo "$report" >&2 + exit 1 +fi +for term in "in today's evolving landscape" "Challenges and Future Prospects" \ + "leverage" "comprehensive" "delve" "em-dash-3-bullet"; do + expect_finding "$report" "$term" || exit 1 +done +echo " OK" + +echo "[2] expecting whitelist comment to suppress hits..." +sed -i '4a\ <!-- anti-ai-allow: leverage, comprehensive, delve, em-dash-3-bullet -->' \ + "$tmp/build/synthetic-ai.test/index.html" +report=$(python3 "$LINT" --json "$tmp/build" 2>/dev/null) || true +for term in "leverage" "comprehensive" "delve" "em-dash-3-bullet"; do + expect_no_finding "$report" "$term" || exit 1 +done +# fail-level "in today's evolving landscape" should still be reported +expect_finding "$report" "in today's evolving landscape" || exit 1 +echo " OK" + +echo "[3] expecting fail-level hit still triggers exit 1..." +python3 "$LINT" "$tmp/build" >/dev/null 2>&1 && rc=0 || rc=$? +if [ "$rc" -ne 1 ]; then + echo "FAIL: expected exit 1, got $rc" >&2 + exit 1 +fi +echo " OK" + +echo "[4] expecting clean exit on neutral fixture..." +rm "$tmp/build/synthetic-ai.test/index.html" +mkdir -p "$tmp/build/clean.test" +echo '

Ein einfacher Satz ohne KI-Vokabular.

' \ + > "$tmp/build/clean.test/index.html" +rm -rf "$tmp/build/synthetic-ai.test" +out=$(python3 "$LINT" "$tmp/build" 2>&1) && rc=0 || rc=$? +if [ "$rc" -ne 0 ]; then + echo "FAIL: clean fixture should exit 0, got $rc" >&2 + echo "$out" + exit 1 +fi +echo " OK" + +echo +echo "all tests passed"