Merge: t-paliad-349 docforge slice 2 — composer + Carrier to pkg/docforge/docx (m/paliad#157)
Some checks failed
Paliad CI gate / build (push) Has been cancelled
Paliad CI gate / test-go (push) Has been cancelled
Paliad CI gate / deploy (push) Has been cancelled

This commit is contained in:
mAi
2026-05-29 15:10:33 +02:00
4 changed files with 692 additions and 580 deletions

View File

@@ -57,9 +57,3 @@ func ConvertDotmToDocx(dotmBytes []byte) ([]byte, error) { return docx.ConvertDo
// SanitiseSubmissionFileName returns s cleaned for use inside a download
// filename. It is a thin forwarder: all of the work (stripping path
// separators / quotes, ASCII-folding DE umlauts) happens in
// docx.SanitiseSubmissionFileName.
func SanitiseSubmissionFileName(s string) string {
	cleaned := docx.SanitiseSubmissionFileName(s)
	return cleaned
}
// xmlAttrEscape is a package-local alias for docx.XMLAttrEscape. It lets
// submission_compose.go's hyperlink-rels inserts use the walker's exact
// attribute escaping without importing the docx package directly.
// Retires when the composer splice folds into pkg/docforge/docx (slice 2).
func xmlAttrEscape(s string) string {
	escaped := docx.XMLAttrEscape(s)
	return escaped
}

View File

@@ -1,93 +1,73 @@
package services
// Composer render pipeline — t-paliad-313 Slice B (design doc §9.1 +
// §9.2). Assembles a base .docx and a draft's section rows into a
// merged .docx ready for export.
// Composer wrapper — bridges paliad's submission draft model
// (SubmissionSection + SubmissionBase) to the format-neutral docforge
// .docx composer (pkg/docforge/docx), extracted in slice 2 of the
// docforge train (t-paliad-349 / m/paliad#157).
//
// Pipeline (high-level):
// The full splice/assembly pipeline now lives in pkg/docforge/docx
// (compose.go): macro pre-pass, anchor-pair splicing, append-before-sectPr,
// hyperlink-rels patching, zip repack, and the final placeholder pass. This
// wrapper does the one thing the engine must not know about — mapping
// paliad's DB row types onto the neutral docx.Section / docx.Carrier
// inputs. Behaviour is byte-identical to the pre-extraction composer; the
// in-package compose_test still drives this wrapper end-to-end.
//
// 1. ConvertDotmToDocx pre-pass on the base bytes (idempotent on .docx).
// 2. Locate `word/document.xml` inside the zip; pull the body XML.
// 3. For each section in the draft (order_index ASC, included=true):
// render content_md_<lang> → OOXML via RenderMarkdownToOOXML using
// base.section_spec.stylemap.paragraph.
// 4. Splice the rendered OOXML into the base body. Two splice modes:
// - Anchor mode: when the body carries `{{#section:KEY}}` /
// `{{/section:KEY}}` marker pairs, replace the slot's content
// (including the anchor paragraphs themselves) with the rendered
// section.
// - Append mode: when no anchor pair is found for a section, the
// rendered OOXML appends at the end of the body, just before any
// `<w:sectPr>` element. Sections with `included=false` are
// dropped silently.
// 5. Strip any leftover unmatched anchor paragraphs.
// 6. Re-pack the document.xml into the zip, leaving every other part
// untouched.
// 7. Run the v1 SubmissionRenderer placeholder pass over the assembly
// so `{{path}}` placeholders inside section content (and inside
// the base's untouched chrome) get substituted by the merged bag.
// Cross-run merge in pass 2 handles autocorrect-fragmented
// placeholders the same as v1.
//
// Result: a fully-merged .docx. No new third-party Go dep — reuses
// archive/zip + the existing SubmissionRenderer.
// Slice note: the paragraph-level neutral document model (Document / Block
// / Slot) the PRD §3.2 sketches lands in slice 6, where the authoring
// importer and the format exporters actually consume it. Building it now,
// ahead of any consumer, would be speculative and would put the
// byte-identical guarantee at risk for no gain (PRD §4 B3 principle:
// extractions earn their keep this cycle).
import (
"archive/zip"
"bytes"
"context"
"fmt"
"io"
"regexp"
"sort"
"strings"
"time"
"mgit.msbls.de/m/paliad/pkg/docforge/docx"
)
// SubmissionComposer assembles base + sections into a final .docx.
// Stateless; safe for concurrent use.
// SubmissionComposer assembles a base + a draft's sections into a final
// .docx. Stateless; safe for concurrent use.
type SubmissionComposer struct {
renderer *SubmissionRenderer
inner *docx.Composer
}
// NewSubmissionComposer wires the composer. The renderer is required —
// a nil renderer is a programmer error and the composer panics at
// NewSubmissionComposer wires the composer. The renderer is required — a
// nil renderer is a programmer error and the composer panics at
// construction.
func NewSubmissionComposer(renderer *SubmissionRenderer) *SubmissionComposer {
if renderer == nil {
panic("submission composer: renderer required")
}
return &SubmissionComposer{renderer: renderer}
return &SubmissionComposer{inner: docx.NewComposer(renderer)}
}
// ComposeOptions carries the per-call composition inputs.
// ComposeOptions carries the per-call composition inputs in paliad's own
// terms (SubmissionSection rows + the SubmissionBase chrome).
type ComposeOptions struct {
// Sections are the draft's section rows in display order. The
// composer renders included sections; excluded rows are dropped.
// Caller is responsible for visibility — by the time the composer
// runs, the section rows have already been gated through
// SubmissionDraftService.Get + can_see_project.
// Sections are the draft's section rows in display order. Included
// sections render; excluded rows are dropped. The caller is
// responsible for visibility — by the time the composer runs the rows
// have already been gated through SubmissionDraftService.Get +
// can_see_project.
Sections []SubmissionSection
// Base supplies the document chrome (.docx body host) plus the
// stylemap for the MD walker. Must not be nil.
// Base supplies the document chrome plus the stylemap for the MD
// walker. Must not be nil.
Base *SubmissionBase
// BaseBytes is the raw .docx bytes for the base. Typically fetched
// BaseBytes is the raw .docx bytes for the base, typically fetched
// from Gitea via the existing template cache.
BaseBytes []byte
// Lang ('de' or 'en') selects which content_md_* column the
// composer reads per section. Defaults to 'de' if empty.
// Lang ('de' or 'en') selects which content_md_* column the composer
// reads per section. Defaults to 'de' if empty.
Lang string
// Vars is the merged placeholder bag the v1 renderer pass
// substitutes after the composer assembly. Passed straight through
// to SubmissionRenderer.Render.
// Vars is the merged placeholder bag the renderer pass substitutes
// after assembly.
Vars PlaceholderMap
// Missing translates an unbound placeholder key into the marker
// the lawyer sees in Word. Passed straight to the renderer.
// Missing translates an unbound placeholder key into the marker the
// lawyer sees in Word.
Missing MissingPlaceholderFn
}
@@ -96,512 +76,24 @@ func (c *SubmissionComposer) Compose(ctx context.Context, opts ComposeOptions) (
if opts.Base == nil {
return nil, fmt.Errorf("submission compose: base required")
}
_ = ctx // reserved for cancellation propagation in later slices
sections := opts.Sections
// Pre-pass: strip macros so the base reads as a plain .docx zip.
cleanBytes, err := ConvertDotmToDocx(opts.BaseBytes)
if err != nil {
return nil, fmt.Errorf("submission compose: convert base: %w", err)
}
// Locate + extract word/document.xml so we can splice in-place.
documentXML, otherParts, err := splitBaseZip(cleanBytes)
if err != nil {
return nil, err
}
// Per-compose hyperlink allocator. Each unique URL gets a fresh
// rId outside the base's existing namespace. The post-pass
// (patchDocumentXMLRels) writes the matching Relationship rows
// before the zip is repacked. Slice D adds inline `[label](url)`
// hyperlink support.
linkAlloc := newComposerLinkAllocator()
// Build the rendered-section map: section_key → OOXML span.
stylemap := opts.Base.SectionSpec.Stylemap
rendered := make(map[string]string, len(sections))
keptSections := make([]SubmissionSection, 0, len(sections))
for _, sec := range sections {
if !sec.Included {
continue
secs := make([]docx.Section, len(opts.Sections))
for i, s := range opts.Sections {
secs[i] = docx.Section{
Key: s.SectionKey,
OrderIndex: s.OrderIndex,
Included: s.Included,
ContentMDDE: s.ContentMDDE,
ContentMDEN: s.ContentMDEN,
}
md := sec.ContentMDDE
if strings.EqualFold(opts.Lang, "en") {
md = sec.ContentMDEN
}
rendered[sec.SectionKey] = RenderMarkdownToOOXMLWithStyles(md, stylemap, linkAlloc.Alloc)
keptSections = append(keptSections, sec)
}
// Stable order — already sorted ascending by ListForDraft, but
// belt-and-braces in case the caller swaps the ordering policy
// later.
sort.SliceStable(keptSections, func(i, j int) bool {
return keptSections[i].OrderIndex < keptSections[j].OrderIndex
return c.inner.Compose(ctx, docx.ComposeOptions{
Sections: secs,
Carrier: docx.Carrier{
Bytes: opts.BaseBytes,
Stylemap: opts.Base.SectionSpec.Stylemap,
},
Lang: opts.Lang,
Vars: opts.Vars,
Missing: opts.Missing,
})
assembledBody := spliceSections(documentXML, rendered, keptSections, sections)
// Slice D hyperlink patch: when the walker emitted hyperlink rIds
// for inline `[label](url)` links, the base's
// word/_rels/document.xml.rels needs matching <Relationship>
// entries so Word can resolve the rIds. Mutates one zip part in
// otherParts (or appends if missing).
if linkAlloc.HasLinks() {
updatedParts, err := patchDocumentXMLRels(otherParts, linkAlloc.Pairs())
if err != nil {
return nil, err
}
otherParts = updatedParts
}
// Re-pack into a zip with the assembled document.xml. All other
// parts (styles, fonts, headers, footers, theme, settings) pass
// through bit-for-bit at their original mtime + compression.
repacked, err := repackBaseZip(otherParts, assembledBody)
if err != nil {
return nil, err
}
// Final pass: substitute placeholders against the merged bag. The
// existing renderer handles cross-run fragmentation, the `{{rule.X}}`
// alias contract, and the missing-marker emission. Reusing it
// guarantees v1's placeholder grammar stays intact inside section
// content + base chrome.
merged, err := c.renderer.Render(repacked, opts.Vars, opts.Missing)
if err != nil {
return nil, fmt.Errorf("submission compose: placeholder pass: %w", err)
}
return merged, nil
}
// ─────────────────────────────────────────────────────────────────────
// Section splicing
// ─────────────────────────────────────────────────────────────────────
// Anchor markers as they appear inside a <w:t> text node. We don't
// need a full XML parse — finding the marker text inside the body is
// sufficient because:
// - {{ and }} are never legitimate document content (placeholders
// follow the same convention everywhere else in paliad).
// - The anchor key grammar [A-Za-z0-9_]+ rules out any HTML/XML
// special characters.
// - Each anchor lives in exactly one <w:t>...</w:t>, which lives in
// exactly one <w:r>...</w:r>, which lives in exactly one
// <w:p>...</w:p>. We expand from the marker outward to find the
// enclosing <w:p> span and drop the entire paragraph as part of
// the splice.
//
// RE2 has no lookahead, so the "find enclosing <w:p>" logic is
// implemented as manual byte-index search around the marker hit
// (paragraphSpanAround below) rather than a single regex pattern.
const (
// anchorOpenPrefix is the text that starts an opening marker,
// i.e. everything before KEY in {{#section:KEY}}.
anchorOpenPrefix = "{{#section:"
// anchorClosePrefix is the text that starts a closing marker,
// i.e. everything before KEY in {{/section:KEY}}.
anchorClosePrefix = "{{/section:"
// anchorSuffix terminates both marker kinds.
anchorSuffix = "}}"
)
// anchorKeyRegex validates that the captured anchor key is a clean
// identifier: one or more ASCII letters, digits, or underscores, anchored
// at both ends. Keys that include other characters (which can't actually
// appear in our authored .docx) are treated as no match.
var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`)
// anchorPair records the byte span of one matched anchor pair inside
// the body — from the start of the opening anchor's <w:p> element
// through the end of the closing anchor's </w:p>. Offsets are byte
// indexes into the body string the pair was found in.
type anchorPair struct {
// key is the section key shared by the opening and closing markers.
key string
openStart int // start of <w:p> for the opening anchor
closeEnd int // index just past </w:p> for the closing anchor
}
// findAllAnchorPairs locates every matched {{#section:KEY}} /
// {{/section:KEY}} pair in body and returns them ordered by the start of
// the opening anchor's paragraph.
//
// The scan runs once per marker kind, then merges the hits by paragraph
// start and pairs each closing marker with the most recent still-open
// marker carrying the same key. A repeated opening marker for a key
// replaces the earlier one; a closing marker without a pending open is
// ignored; opens that never close are dropped.
//
// NOTE(review): nothing here checks that spans of differently-keyed
// pairs do not interleave — callers rely on well-formed bases for that.
func findAllAnchorPairs(body string) []anchorPair {
	// anchorHit is one marker occurrence together with the span of the
	// paragraph that encloses it.
	type anchorHit struct {
		key      string
		from, to int
		opening  bool
	}
	var hits []anchorHit

	scan := func(prefix string, opening bool) {
		for pos := 0; ; {
			rel := strings.Index(body[pos:], prefix)
			if rel < 0 {
				return
			}
			hitStart := pos + rel
			keyStart := hitStart + len(prefix)
			keyLen := strings.Index(body[keyStart:], anchorSuffix)
			if keyLen < 0 {
				// No terminator anywhere after this prefix, so no later
				// prefix can be terminated either.
				return
			}
			key := body[keyStart : keyStart+keyLen]
			if !anchorKeyRegex.MatchString(key) {
				// Not a clean identifier: resume right after the prefix.
				pos = keyStart
				continue
			}
			hitEnd := keyStart + keyLen + len(anchorSuffix)
			from, to, found := paragraphSpanAround(body, hitStart, hitEnd)
			if !found {
				pos = hitEnd
				continue
			}
			hits = append(hits, anchorHit{key: key, from: from, to: to, opening: opening})
			// Skip the remainder of the enclosing paragraph.
			pos = to
		}
	}
	scan(anchorOpenPrefix, true)
	scan(anchorClosePrefix, false)

	// Merge both scans into body order. The sort is stable, so an open
	// and a close sharing one paragraph keep the open first.
	sort.SliceStable(hits, func(a, b int) bool {
		return hits[a].from < hits[b].from
	})

	var pairs []anchorPair
	pending := map[string]anchorHit{}
	for _, hit := range hits {
		if hit.opening {
			pending[hit.key] = hit
			continue
		}
		opened, seen := pending[hit.key]
		if !seen {
			continue
		}
		pairs = append(pairs, anchorPair{
			key:       hit.key,
			openStart: opened.from,
			closeEnd:  hit.to,
		})
		delete(pending, hit.key)
	}
	return pairs
}
// paragraphSpanAround returns the byte span [start, end) of the
// `<w:p>...</w:p>` element around the byte range [markerStart, markerEnd).
//
// The start is the nearest `<w:p` before markerStart that is followed by
// ' ', '>' or '/' (so `<w:pPr>`, `<w:pStyle>` and friends are skipped);
// the end is just past the first `</w:p>` at or after markerEnd. This
// relies on <w:p> not nesting around the marker.
//
// The third result is false when no such opening tag or closing tag
// exists, when the opening tag is never terminated by '>', or when the
// marker range does not lie inside body. The last case used to panic:
// the tag-name check indexed body[idx+4] under an `idx+4 <= len(body)`
// guard, which is one past the end when body stops right after `<w:p`.
func paragraphSpanAround(body string, markerStart, markerEnd int) (int, int, bool) {
	const (
		paraOpen  = "<w:p"
		paraClose = "</w:p>"
	)

	// A marker range outside body cannot sit inside a paragraph; report
	// "not found" instead of panicking on the slice expressions below.
	if markerStart < 0 || markerEnd < markerStart || markerEnd > len(body) {
		return 0, 0, false
	}

	// Walk backwards over every `<w:p` prefix until one is a real
	// paragraph opening rather than a longer w:p-prefixed tag name.
	pStart := -1
	for cursor := markerStart; cursor > 0 && pStart < 0; {
		idx := strings.LastIndex(body[:cursor], paraOpen)
		if idx < 0 {
			break
		}
		// The byte after the prefix decides the tag name; it must exist.
		if next := idx + len(paraOpen); next < len(body) {
			switch body[next] {
			case ' ', '>', '/':
				// <w:p ...> or <w:p>; not <w:pPr>.
				if strings.IndexByte(body[idx:], '>') < 0 {
					return 0, 0, false
				}
				pStart = idx
			}
		}
		cursor = idx
	}
	if pStart < 0 {
		return 0, 0, false
	}

	// The next </w:p> after the marker closes the paragraph.
	rel := strings.Index(body[markerEnd:], paraClose)
	if rel < 0 {
		return 0, 0, false
	}
	return pStart, markerEnd + rel + len(paraClose), true
}
// spliceSections assembles the final document.xml body.
//
// Every matched anchor pair in documentXML is cut out, anchor paragraphs
// included. When the pair's key belongs to a kept section the rendered
// OOXML for that key takes its place; for any other key (an excluded
// section, or a key unknown to this draft) the slot is simply removed so
// the base's chrome stays unbroken. Kept sections that had no anchor are
// concatenated in the order of kept and inserted via appendBeforeSectPr.
//
// rendered maps section key → OOXML. all is the complete section list;
// it no longer influences the result because excluded and unknown keys
// are handled identically.
func spliceSections(documentXML []byte, rendered map[string]string, kept []SubmissionSection, all []SubmissionSection) []byte {
	body := string(documentXML)

	keptKeys := make(map[string]struct{}, len(kept))
	for _, sec := range kept {
		keptKeys[sec.SectionKey] = struct{}{}
	}

	// Process pairs back-to-front so each replacement leaves the offsets
	// of the pairs still to be handled untouched.
	pairs := findAllAnchorPairs(body)
	sort.SliceStable(pairs, func(a, b int) bool {
		return pairs[a].openStart > pairs[b].openStart
	})

	spliced := map[string]bool{}
	for _, pair := range pairs {
		var fill string
		if _, isKept := keptKeys[pair.key]; isKept {
			fill = rendered[pair.key]
			spliced[pair.key] = true
		}
		body = body[:pair.openStart] + fill + body[pair.closeEnd:]
	}

	// Whatever was not placed through an anchor goes before sectPr.
	var trailing strings.Builder
	for _, sec := range kept {
		if !spliced[sec.SectionKey] {
			trailing.WriteString(rendered[sec.SectionKey])
		}
	}
	if trailing.Len() > 0 {
		body = appendBeforeSectPr(body, trailing.String())
	}
	return []byte(body)
}
// sectPrRegex matches the start of a <w:sectPr> element. The trailing \b
// keeps it from matching longer names such as <w:sectPrChange>.
var sectPrRegex = regexp.MustCompile(`<w:sectPr\b`)

// appendBeforeSectPr inserts content immediately before the body-level
// `<w:sectPr` element, i.e. the one that follows the last paragraph of
// the body. Word documents conventionally close the body with a sectPr
// describing page setup; sections must land before that element so they
// show up on the actual pages.
//
// A document with section breaks also carries `<w:sectPr>` elements
// nested inside a paragraph's `<w:pPr>`. Those always precede their own
// paragraph's `</w:p>`, so the search starts after the last `</w:p>` in
// body; splicing in front of a nested one would place paragraphs inside
// paragraph properties and corrupt the document.
//
// When no body-level sectPr exists the content goes before the last
// `</w:body>`, or at the very end of body if that is missing too.
func appendBeforeSectPr(body, content string) string {
	const paraClose = "</w:p>"

	// Skip everything up to and including the last paragraph so nested,
	// paragraph-level sectPr elements are never considered.
	searchFrom := 0
	if last := strings.LastIndex(body, paraClose); last >= 0 {
		searchFrom = last + len(paraClose)
	}

	if loc := sectPrRegex.FindStringIndex(body[searchFrom:]); loc != nil {
		at := searchFrom + loc[0]
		return body[:at] + content + body[at:]
	}

	// No body-level sectPr → append before `</w:body>` if present, else
	// at the very end.
	if idx := strings.LastIndex(body, "</w:body>"); idx >= 0 {
		return body[:idx] + content + body[idx:]
	}
	return body + content
}
// ─────────────────────────────────────────────────────────────────────
// Zip plumbing
// ─────────────────────────────────────────────────────────────────────
// baseZipPart captures one zip entry we kept aside while extracting
// document.xml. splitBaseZip produces these and repackBaseZip writes
// them back out in the same order.
type baseZipPart struct {
name string // entry path inside the archive, e.g. "word/document.xml"
method uint16 // compression method copied from the entry's zip header
modTime int64 // wall seconds; converted back to time.Time on repack
body []byte // entry contents; nil for the word/document.xml placeholder
}
// splitBaseZip opens cleanBytes as a zip archive and separates
// word/document.xml from the rest.
//
// It returns the document.xml contents plus one baseZipPart per archive
// entry, in archive order. The document.xml entry stays in the list as a
// body-less placeholder so the repack step can keep its position, method
// and timestamp. An error is returned when the archive cannot be opened,
// an entry cannot be read, or word/document.xml is absent.
func splitBaseZip(cleanBytes []byte) ([]byte, []baseZipPart, error) {
	const documentPart = "word/document.xml"

	archive, err := zip.NewReader(bytes.NewReader(cleanBytes), int64(len(cleanBytes)))
	if err != nil {
		return nil, nil, fmt.Errorf("submission compose: open base zip: %w", err)
	}

	var documentXML []byte
	parts := make([]baseZipPart, 0, len(archive.File))
	for _, entry := range archive.File {
		content, readErr := readZipEntry(entry)
		if readErr != nil {
			return nil, nil, fmt.Errorf("submission compose: read %s: %w", entry.Name, readErr)
		}
		part := baseZipPart{
			name:    entry.Name,
			method:  entry.Method,
			modTime: entry.Modified.Unix(),
			body:    content,
		}
		if entry.Name == documentPart {
			// Hand the body to the caller; keep only a placeholder here.
			documentXML = content
			part.body = nil
		}
		parts = append(parts, part)
	}

	if documentXML == nil {
		return nil, nil, fmt.Errorf("submission compose: base zip missing word/document.xml")
	}
	return documentXML, parts, nil
}
// repackBaseZip writes parts into a fresh zip archive, in order, and
// returns the archive bytes. The entry named word/document.xml receives
// assembledBody; every other entry is written with its stored body.
//
// Each entry keeps its recorded compression method. Its modification
// time is restored from the stored Unix seconds when that value is
// positive and left unset otherwise.
func repackBaseZip(parts []baseZipPart, assembledBody []byte) ([]byte, error) {
	var archive bytes.Buffer
	writer := zip.NewWriter(&archive)

	for i := range parts {
		part := &parts[i]

		header := &zip.FileHeader{Name: part.name, Method: part.method}
		if part.modTime > 0 {
			header.Modified = time.Unix(part.modTime, 0)
		}
		entry, err := writer.CreateHeader(header)
		if err != nil {
			return nil, fmt.Errorf("submission compose: write header %s: %w", part.name, err)
		}

		payload := part.body
		if part.name == "word/document.xml" {
			payload = assembledBody
		}
		if _, err := entry.Write(payload); err != nil {
			return nil, fmt.Errorf("submission compose: write body %s: %w", part.name, err)
		}
	}

	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("submission compose: finalise zip: %w", err)
	}
	return archive.Bytes(), nil
}
// readZipEntry opens one archive entry and returns its full,
// decompressed contents. The entry reader is closed before returning.
// Errors from opening or reading the entry are returned unwrapped.
func readZipEntry(f *zip.File) ([]byte, error) {
	entry, openErr := f.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer entry.Close()

	content, readErr := io.ReadAll(entry)
	return content, readErr
}
// ─────────────────────────────────────────────────────────────────────
// Slice D — hyperlink wiring
// ─────────────────────────────────────────────────────────────────────
// composerLinkAllocator issues relationship ids for inline hyperlink
// targets found by the MD walker. A URL is assigned an id the first time
// it is seen and keeps it for the rest of the compose, so repeated links
// to one URL share a single Relationship. Ids carry the "rIdComposer"
// prefix to stay clear of the ids the base already uses.
type composerLinkAllocator struct {
	next  int               // number of ids issued so far
	byURL map[string]string // URL → issued id
	order []string          // URLs in first-seen order
}

// newComposerLinkAllocator returns an allocator with no ids issued.
func newComposerLinkAllocator() *composerLinkAllocator {
	return &composerLinkAllocator{byURL: make(map[string]string)}
}

// Alloc returns the id for url, issuing "rIdComposer<N>" (N counting up
// from 1) when the URL has not been seen before.
func (a *composerLinkAllocator) Alloc(url string) string {
	rid, known := a.byURL[url]
	if !known {
		a.next++
		rid = fmt.Sprintf("rIdComposer%d", a.next)
		a.byURL[url] = rid
		a.order = append(a.order, url)
	}
	return rid
}

// HasLinks reports whether at least one id has been issued.
func (a *composerLinkAllocator) HasLinks() bool {
	return len(a.order) != 0
}

// Pairs returns one {id, URL} entry per distinct URL, in first-seen
// order. The document.xml.rels patcher turns these into <Relationship>
// elements.
func (a *composerLinkAllocator) Pairs() [][2]string {
	pairs := make([][2]string, len(a.order))
	for i, url := range a.order {
		pairs[i] = [2]string{a.byURL[url], url}
	}
	return pairs
}
// patchDocumentXMLRels adds one external hyperlink <Relationship> per
// (rId, URL) pair to the word/_rels/document.xml.rels entry in parts.
//
// Pairs whose `Id="<rId>"` text already occurs in the rels body are
// skipped; if nothing remains to add, parts is returned unchanged. When
// the rels entry exists its body is replaced in place. When it does not,
// a minimal Relationships document is built and appended as a new
// deflate-compressed part stamped with the current time.
//
// The returned slice must replace the caller's reference, since the
// append case can reallocate. An error is returned when the rels body
// has no closing </Relationships> tag to insert in front of.
func patchDocumentXMLRels(parts []baseZipPart, pairs [][2]string) ([]baseZipPart, error) {
	const (
		path          = "word/_rels/document.xml.rels"
		hyperlinkType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
		closingTag    = "</Relationships>"
	)

	relsAt := -1
	for i := range parts {
		if parts[i].name == path {
			relsAt = i
			break
		}
	}

	// Start from the existing rels body, or from an empty wrapper.
	body := `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>` +
		`<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"></Relationships>`
	if relsAt >= 0 {
		body = string(parts[relsAt].body)
	}

	var additions strings.Builder
	for _, pair := range pairs {
		rid, target := pair[0], pair[1]
		if strings.Contains(body, `Id="`+rid+`"`) {
			continue
		}
		additions.WriteString(`<Relationship Id="`)
		additions.WriteString(xmlAttrEscape(rid))
		additions.WriteString(`" Type="`)
		additions.WriteString(hyperlinkType)
		additions.WriteString(`" Target="`)
		additions.WriteString(xmlAttrEscape(target))
		additions.WriteString(`" TargetMode="External"/>`)
	}
	if additions.Len() == 0 {
		return parts, nil
	}

	cut := strings.LastIndex(body, closingTag)
	if cut < 0 {
		return parts, fmt.Errorf("submission compose: malformed document.xml.rels (no closing tag)")
	}
	patched := []byte(body[:cut] + additions.String() + body[cut:])

	if relsAt < 0 {
		return append(parts, baseZipPart{
			name:    path,
			method:  zip.Deflate,
			modTime: time.Now().Unix(),
			body:    patched,
		}), nil
	}
	parts[relsAt].body = patched
	return parts, nil
}

View File

@@ -0,0 +1,634 @@
package docx
// Composer render pipeline — t-paliad-313 Slice B (design doc §9.1 +
// §9.2). Assembles a base .docx and a draft's section rows into a
// merged .docx ready for export.
//
// Pipeline (high-level):
//
// 1. ConvertDotmToDocx pre-pass on the base bytes (idempotent on .docx).
// 2. Locate `word/document.xml` inside the zip; pull the body XML.
// 3. For each section in the draft (order_index ASC, included=true):
// render content_md_<lang> → OOXML via RenderMarkdownToOOXMLWithStyles
// using the carrier's Stylemap.
// 4. Splice the rendered OOXML into the base body. Two splice modes:
// - Anchor mode: when the body carries `{{#section:KEY}}` /
// `{{/section:KEY}}` marker pairs, replace the slot's content
// (including the anchor paragraphs themselves) with the rendered
// section.
// - Append mode: when no anchor pair is found for a section, the
// rendered OOXML appends at the end of the body, just before any
// `<w:sectPr>` element. Sections with `included=false` are
// dropped silently.
// 5. Strip any leftover unmatched anchor paragraphs.
// 6. Re-pack the document.xml into the zip, leaving every other part
// untouched.
// 7. Run the v1 SubmissionRenderer placeholder pass over the assembly
// so `{{path}}` placeholders inside section content (and inside
// the base's untouched chrome) get substituted by the merged bag.
// Cross-run merge in pass 2 handles autocorrect-fragmented
// placeholders the same as v1.
//
// Result: a fully-merged .docx. No new third-party Go dep — reuses
// archive/zip + the existing SubmissionRenderer.
import (
"archive/zip"
"bytes"
"context"
"fmt"
"io"
"regexp"
"sort"
"strings"
"time"
)
// Composer turns a carrier document plus a list of sections into a
// final .docx. Its only state is the renderer used for the closing
// placeholder pass, which Compose never reassigns.
type Composer struct {
	renderer *SubmissionRenderer
}

// NewComposer builds a Composer around renderer. Passing a nil renderer
// is treated as a programmer error and panics immediately rather than
// failing later inside Compose.
func NewComposer(renderer *SubmissionRenderer) *Composer {
	if renderer != nil {
		return &Composer{renderer: renderer}
	}
	panic("submission composer: renderer required")
}
// Carrier is the opaque base document the composer splices rendered
// content into. Its bytes are preserved verbatim outside the regions the
// splice touches — the {{#section:KEY}} anchor paragraphs and the
// {{placeholder}} tokens — so the firm's letterhead, styles, headers, and
// footers survive a compose byte-for-byte. This is the docforge "carrier"
// for the .docx format: the lossless host for editable content.
type Carrier struct {
// Bytes is the raw base .docx. May be a .dotm/.docm/.dotx; Compose
// runs ConvertDotmToDocx on it first (idempotent on a plain .docx).
// Compose returns an error when that conversion fails.
Bytes []byte
// Stylemap maps a logical block kind (paragraph, heading_1/2/3,
// list_bullet, list_numbered, blockquote) to the Word paragraph
// style name the base defines for it. Drives the Markdown walker's
// <w:pStyle>. Missing entries fall back to the "paragraph" style.
// Compose hands the map unchanged to RenderMarkdownToOOXMLWithStyles
// for every included section.
Stylemap map[string]string
}
// Section is one editable content block the composer renders and splices.
// It is the format-neutral input the docforge engine consumes; the
// consuming application maps its own row type onto it (paliad maps
// SubmissionSection → Section).
type Section struct {
// Key matches a {{#section:KEY}} anchor in the carrier, or — when no
// anchor matches — marks an append-mode section. Compose also uses it
// as the key of its rendered-output map, so two included sections
// sharing a Key overwrite each other's rendering.
Key string
// OrderIndex sets append-mode ordering (ascending). Compose
// stable-sorts the included sections by this value, so ties keep
// their input order.
OrderIndex int
// Included=false drops the section entirely: it is skipped before
// rendering.
Included bool
// ContentMDDE / ContentMDEN are the bilingual Markdown sources; Lang
// selects which one renders.
ContentMDDE string
ContentMDEN string
}
// ComposeOptions carries the per-call composition inputs.
type ComposeOptions struct {
// Sections are the draft's section rows in display order. The
// composer renders included sections; excluded rows are dropped.
// Caller is responsible for visibility — by the time the composer
// runs, the section rows have already been gated by the caller.
Sections []Section
// Carrier is the base .docx chrome plus its stylemap. Required.
Carrier Carrier
// Lang selects which Markdown source the composer reads per section.
// A case-insensitive "en" selects ContentMDEN; every other value,
// including the empty string, selects ContentMDDE.
Lang string
// Vars is the merged placeholder bag the v1 renderer pass
// substitutes after the composer assembly. Passed straight through
// to SubmissionRenderer.Render.
Vars PlaceholderMap
// Missing translates an unbound placeholder key into the marker
// the lawyer sees in Word. Passed straight to the renderer.
Missing MissingPlaceholderFn
}
// Compose runs the whole pipeline over opts and returns the merged
// .docx bytes:
//
//  1. convert the carrier bytes with ConvertDotmToDocx;
//  2. split the archive into document.xml and the remaining parts;
//  3. render the Markdown of every included section, in input order,
//     allocating hyperlink ids as links are encountered;
//  4. stable-sort the included sections by OrderIndex and splice them
//     into the document body;
//  5. add hyperlink relationships to the rels part if any were issued;
//  6. repack the archive;
//  7. run the renderer's placeholder pass with opts.Vars / opts.Missing.
//
// Errors from steps 1 and 7 are wrapped with a "submission compose:"
// prefix; errors from the zip helpers are returned as they come. ctx is
// accepted but not yet consulted.
func (c *Composer) Compose(ctx context.Context, opts ComposeOptions) ([]byte, error) {
	_ = ctx // reserved for cancellation propagation in later slices

	// Strip macros so the carrier reads as a plain .docx zip.
	base, err := ConvertDotmToDocx(opts.Carrier.Bytes)
	if err != nil {
		return nil, fmt.Errorf("submission compose: convert base: %w", err)
	}

	documentXML, parts, err := splitBaseZip(base)
	if err != nil {
		return nil, err
	}

	// One allocator per compose: every distinct URL gets its own id,
	// and the matching Relationship rows are written further down.
	links := newComposerLinkAllocator()

	useEnglish := strings.EqualFold(opts.Lang, "en")
	rendered := make(map[string]string, len(opts.Sections))
	included := make([]Section, 0, len(opts.Sections))
	for _, sec := range opts.Sections {
		if !sec.Included {
			continue
		}
		source := sec.ContentMDDE
		if useEnglish {
			source = sec.ContentMDEN
		}
		rendered[sec.Key] = RenderMarkdownToOOXMLWithStyles(source, opts.Carrier.Stylemap, links.Alloc)
		included = append(included, sec)
	}

	// Callers normally pass sections already ordered; sorting here keeps
	// append-mode output stable even if they do not.
	sort.SliceStable(included, func(a, b int) bool {
		return included[a].OrderIndex < included[b].OrderIndex
	})

	body := spliceSections(documentXML, rendered, included, opts.Sections)

	// Inline links need Relationship entries so Word can resolve them.
	if links.HasLinks() {
		patchedParts, patchErr := patchDocumentXMLRels(parts, links.Pairs())
		if patchErr != nil {
			return nil, patchErr
		}
		parts = patchedParts
	}

	repacked, err := repackBaseZip(parts, body)
	if err != nil {
		return nil, err
	}

	// The placeholder pass runs last so it covers both the spliced
	// section content and the carrier's own chrome.
	out, err := c.renderer.Render(repacked, opts.Vars, opts.Missing)
	if err != nil {
		return nil, fmt.Errorf("submission compose: placeholder pass: %w", err)
	}
	return out, nil
}
// ─────────────────────────────────────────────────────────────────────
// Section splicing
// ─────────────────────────────────────────────────────────────────────
// Anchor markers as they appear inside a <w:t> text node. We don't
// need a full XML parse — finding the marker text inside the body is
// sufficient because:
// - {{ and }} are never legitimate document content (placeholders
// follow the same convention everywhere else in paliad).
// - The anchor key grammar [A-Za-z0-9_]+ rules out any HTML/XML
// special characters.
// - Each anchor lives in exactly one <w:t>...<w:t>, which lives in
// exactly one <w:r>...</w:r>, which lives in exactly one
// <w:p>...</w:p>. We expand from the marker outward to find the
// enclosing <w:p> span and drop the entire paragraph as part of
// the splice.
//
// RE2 has no lookahead, so the "find enclosing <w:p>" logic is
// implemented as manual byte-index search around the marker hit
// (anchorParagraphSpan below) rather than a single regex pattern.
const (
anchorOpenPrefix = "{{#section:"
anchorClosePrefix = "{{/section:"
anchorSuffix = "}}"
)
// anchorKeyRegex validates that the captured anchor key is a clean
// identifier: one or more ASCII letters, digits, or underscores, and
// nothing else. Keys that include other characters (which can't actually
// appear in our authored .docx) are treated as no match.
var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`)
// anchorPair records the byte span of one matched anchor pair inside
// the body — from the start of the opening anchor's <w:p> element
// through the end of the closing anchor's </w:p>. Offsets index into the
// body string the pair was discovered in and become stale as soon as
// that string is edited.
type anchorPair struct {
	key       string // section key shared by the opening and closing anchor
	openStart int    // start of <w:p> for the opening anchor
	closeEnd  int    // index just past </w:p> for the closing anchor
}
// findAllAnchorPairs scans the body for matched open/close anchor
// pairs. Unbalanced markers (open without close, or vice versa) are
// dropped from the result. Returns pairs in body-order; each pair's
// span is non-overlapping.
//
// Non-overlap is enforced rather than assumed: callers splice the spans
// by byte offset, so a nested or interleaved pair would leave a stale
// offset behind and corrupt the body. When spans do overlap (malformed
// base), the pair that opens first wins and every pair overlapping it is
// dropped.
//
// Returns nil when the body contains no matched pair.
func findAllAnchorPairs(body string) []anchorPair {
	type marker struct {
		key       string
		paraStart int
		paraEnd   int
		isOpen    bool
	}

	var markers []marker
	// collect records every well-formed marker that starts with prefix,
	// together with the span of its enclosing paragraph.
	collect := func(prefix string, isOpen bool) {
		offset := 0
		for {
			idx := strings.Index(body[offset:], prefix)
			if idx < 0 {
				return
			}
			start := offset + idx
			keyStart := start + len(prefix)
			suffixIdx := strings.Index(body[keyStart:], anchorSuffix)
			if suffixIdx < 0 {
				// No terminator anywhere after this prefix, so no later
				// prefix can be terminated either.
				return
			}
			key := body[keyStart : keyStart+suffixIdx]
			if !anchorKeyRegex.MatchString(key) {
				// Not an anchor; resume right after the prefix so a real
				// anchor further along is still found.
				offset = keyStart
				continue
			}
			markerEnd := keyStart + suffixIdx + len(anchorSuffix)
			pStart, pEnd, ok := paragraphSpanAround(body, start, markerEnd)
			if !ok {
				offset = markerEnd
				continue
			}
			markers = append(markers, marker{key: key, paraStart: pStart, paraEnd: pEnd, isOpen: isOpen})
			// Skip the rest of the paragraph: one anchor per paragraph.
			offset = pEnd
		}
	}
	collect(anchorOpenPrefix, true)
	collect(anchorClosePrefix, false)

	// Walk markers in body-order, matching each open with the next
	// close that carries the same key. The sort is stable so an open and
	// a close sharing one paragraph keep the open first.
	sort.SliceStable(markers, func(i, j int) bool {
		return markers[i].paraStart < markers[j].paraStart
	})
	var matched []anchorPair
	pendingOpen := map[string]marker{}
	for _, m := range markers {
		if m.isOpen {
			// A repeated open for the same key supersedes the earlier one.
			pendingOpen[m.key] = m
			continue
		}
		o, ok := pendingOpen[m.key]
		if !ok {
			continue
		}
		matched = append(matched, anchorPair{
			key:       m.key,
			openStart: o.paraStart,
			closeEnd:  m.paraEnd,
		})
		delete(pendingOpen, m.key)
	}

	// Pairs were appended in close-order; put them in open-order and drop
	// any pair that starts before the previously kept pair has ended.
	sort.SliceStable(matched, func(i, j int) bool {
		return matched[i].openStart < matched[j].openStart
	})
	pairs := matched[:0]
	lastEnd := 0
	for _, p := range matched {
		if p.openStart < lastEnd {
			continue
		}
		pairs = append(pairs, p)
		lastEnd = p.closeEnd
	}
	return pairs
}
// paragraphSpanAround returns the byte span of the smallest `<w:p>...</w:p>`
// element that fully contains the byte range [markerStart, markerEnd).
//
// The first two results are the index of the paragraph's `<w:p` and the
// index just past its `</w:p>`. The third result is false — and the
// offsets are zero — when the range does not sit inside a single
// paragraph: the offsets are out of bounds, no paragraph opens before the
// marker, the nearest paragraph tag is self-closing or already closed
// before the marker, or no `</w:p>` follows the marker.
func paragraphSpanAround(body string, markerStart, markerEnd int) (int, int, bool) {
	const (
		paraOpen  = "<w:p"
		paraClose = "</w:p>"
	)
	// Reject offsets that would make the slicing below panic.
	if markerStart < 0 || markerEnd < markerStart || markerEnd > len(body) {
		return 0, 0, false
	}

	// Walk backwards to the nearest paragraph tag. <w:p> doesn't nest, so
	// the nearest one before markerStart is the only candidate for the
	// enclosing paragraph.
	pStart, tagEnd := -1, -1
	for cursor := markerStart; cursor > 0 && pStart < 0; {
		idx := strings.LastIndex(body[:cursor], paraOpen)
		if idx < 0 {
			break
		}
		cursor = idx
		next := idx + len(paraOpen)
		if next >= len(body) {
			// "<w:p" is the very end of the body: nothing to inspect.
			continue
		}
		// Confirm this is a paragraph tag, not a different w:p-prefixed
		// tag (e.g. <w:pPr>): the name must end right here.
		switch body[next] {
		case ' ', '\t', '\n', '\r', '>', '/':
			gt := strings.IndexByte(body[idx:], '>')
			if gt < 0 {
				return 0, 0, false
			}
			pStart, tagEnd = idx, idx+gt
		}
	}
	if pStart < 0 {
		return 0, 0, false
	}
	// The tag must end before the marker and must not be self-closing
	// (<w:p/> has no content, so it cannot contain the marker).
	if tagEnd >= markerStart || body[tagEnd-1] == '/' {
		return 0, 0, false
	}
	// A </w:p> between the tag and the marker means that paragraph ended
	// before the marker: the marker is not inside it.
	if strings.Contains(body[tagEnd:markerStart], paraClose) {
		return 0, 0, false
	}

	// Walk forward to find the matching </w:p>. <w:p> doesn't nest so
	// the next </w:p> after the marker is the close.
	endIdx := strings.Index(body[markerEnd:], paraClose)
	if endIdx < 0 {
		return 0, 0, false
	}
	return pStart, markerEnd + endIdx + len(paraClose), true
}
// spliceSections replaces anchor slots with rendered sections and
// appends any unanchored sections before sectPr. Returns the assembled
// document.xml body.
//
// documentXML is the base's word/document.xml. rendered maps a section
// key to its rendered body XML. kept lists the sections to emit.
//
// Every anchor slot found in the body is consumed: a slot whose key
// belongs to a kept section is replaced by that section's rendered XML;
// any other slot (an excluded section, or a key unknown to this draft) is
// removed outright so the base's chrome stays unbroken. Because both
// non-kept cases resolve to the same action, all is not consulted; it is
// retained so the signature stays stable for callers.
//
// Kept sections with no slot are appended before sectPr in the order they
// appear in kept (presumably order_index ASC — the caller owns the sort).
func spliceSections(documentXML []byte, rendered map[string]string, kept []Section, all []Section) []byte {
	body := string(documentXML)

	keptKeys := make(map[string]struct{}, len(kept))
	for i := range kept {
		keptKeys[kept[i].Key] = struct{}{}
	}

	// Walk pairs in REVERSE body-order so slice mutations don't shift
	// the offsets of pairs still to be processed.
	pairs := findAllAnchorPairs(body)
	sort.SliceStable(pairs, func(i, j int) bool {
		return pairs[i].openStart > pairs[j].openStart
	})

	anchored := make(map[string]bool, len(pairs))
	for _, p := range pairs {
		replacement := "" // non-kept slot: drop it entirely
		if _, ok := keptKeys[p.key]; ok {
			replacement = rendered[p.key]
			anchored[p.key] = true
		}
		body = body[:p.openStart] + replacement + body[p.closeEnd:]
	}

	// Append the kept sections that found no slot, preserving kept order.
	var unanchored strings.Builder
	for i := range kept {
		if key := kept[i].Key; !anchored[key] {
			unanchored.WriteString(rendered[key])
		}
	}
	if unanchored.Len() > 0 {
		body = appendBeforeSectPr(body, unanchored.String())
	}
	return []byte(body)
}
// sectPrRegex matches the start of a `<w:sectPr` element. The trailing
// \b stops it matching longer element names such as `<w:sectPrChange`.
var sectPrRegex = regexp.MustCompile(`<w:sectPr\b`)

// appendBeforeSectPr inserts content immediately before the body-level
// `<w:sectPr` element, or before `</w:body>` if there is none, or at the
// very end if that is missing too. Word documents conventionally close
// the body with a sectPr describing page setup; we want to land sections
// before that element so they show up on the actual pages.
//
// The body-level sectPr is taken to be the first `<w:sectPr` after the
// last `</w:p>`. Picking the first `<w:sectPr` in the whole body would be
// wrong for a multi-section base: section breaks store their sectPr
// inside a paragraph's <w:pPr>, and inserting there would put block
// content inside paragraph properties. Anchoring on the last `</w:p>`
// also keeps the insert ahead of the outer element when a tracked change
// nests a second sectPr inside <w:sectPrChange>.
func appendBeforeSectPr(body, content string) string {
	const paraClose = "</w:p>"

	// Only look for sectPr after the final paragraph has closed.
	searchFrom := 0
	if last := strings.LastIndex(body, paraClose); last >= 0 {
		searchFrom = last + len(paraClose)
	}
	if loc := sectPrRegex.FindStringIndex(body[searchFrom:]); loc != nil {
		at := searchFrom + loc[0]
		return body[:at] + content + body[at:]
	}

	// No body-level sectPr → append before `</w:body>` if present, else
	// at the very end.
	if idx := strings.LastIndex(body, "</w:body>"); idx >= 0 {
		return body[:idx] + content + body[idx:]
	}
	return body + content
}
// ─────────────────────────────────────────────────────────────────────
// Zip plumbing
// ─────────────────────────────────────────────────────────────────────
// baseZipPart captures one zip entry we kept aside while extracting
// document.xml. It carries just enough of the original header to write
// the entry back out on repack.
type baseZipPart struct {
	name    string // entry path inside the archive, e.g. "word/styles.xml"
	method  uint16 // zip compression method copied from the source entry
	modTime int64  // wall seconds; converted back to time.Time on repack
	body    []byte // entry contents; nil for the word/document.xml placeholder
}
// splitBaseZip opens cleanBytes as a zip archive and separates
// word/document.xml from everything else.
//
// It returns the document.xml contents plus one baseZipPart per archive
// entry, in archive order. document.xml keeps its slot in that list as a
// body-less placeholder so a later repack can put the assembled body back
// at the same position. An error is returned when the archive cannot be
// opened, an entry cannot be read, or document.xml is absent.
func splitBaseZip(cleanBytes []byte) ([]byte, []baseZipPart, error) {
	archive, err := zip.NewReader(bytes.NewReader(cleanBytes), int64(len(cleanBytes)))
	if err != nil {
		return nil, nil, fmt.Errorf("submission compose: open base zip: %w", err)
	}

	var (
		documentXML []byte
		parts       = make([]baseZipPart, 0, len(archive.File))
	)
	for _, entry := range archive.File {
		payload, readErr := readZipEntry(entry)
		if readErr != nil {
			return nil, nil, fmt.Errorf("submission compose: read %s: %w", entry.Name, readErr)
		}
		part := baseZipPart{
			name:    entry.Name,
			method:  entry.Method,
			modTime: entry.Modified.Unix(),
			body:    payload,
		}
		if entry.Name == "word/document.xml" {
			// Hand the body back separately; leave only a placeholder here.
			documentXML = payload
			part.body = nil
		}
		parts = append(parts, part)
	}

	if documentXML == nil {
		return nil, nil, fmt.Errorf("submission compose: base zip missing word/document.xml")
	}
	return documentXML, parts, nil
}
// repackBaseZip writes parts back into a fresh zip archive, in order,
// substituting assembledBody for the contents of word/document.xml.
// Every other entry is written with its recorded name, compression
// method and body; the modification time is restored only when the
// recorded value is positive. Returns the archive bytes, or an error if
// any header, body, or the final central directory cannot be written.
func repackBaseZip(parts []baseZipPart, assembledBody []byte) ([]byte, error) {
	var archive bytes.Buffer
	writer := zip.NewWriter(&archive)

	for i := range parts {
		part := &parts[i]

		header := zip.FileHeader{Name: part.name, Method: part.method}
		if part.modTime > 0 {
			header.Modified = time.Unix(part.modTime, 0)
		}
		entry, err := writer.CreateHeader(&header)
		if err != nil {
			return nil, fmt.Errorf("submission compose: write header %s: %w", part.name, err)
		}

		payload := part.body
		if part.name == "word/document.xml" {
			payload = assembledBody
		}
		if _, err := entry.Write(payload); err != nil {
			return nil, fmt.Errorf("submission compose: write body %s: %w", part.name, err)
		}
	}

	if err := writer.Close(); err != nil {
		return nil, fmt.Errorf("submission compose: finalise zip: %w", err)
	}
	return archive.Bytes(), nil
}
// readZipEntry opens a single zip entry and returns its full
// decompressed contents. The entry's reader is closed before returning.
func readZipEntry(f *zip.File) ([]byte, error) {
	entry, openErr := f.Open()
	if openErr != nil {
		return nil, openErr
	}
	defer entry.Close()

	contents, readErr := io.ReadAll(entry)
	return contents, readErr
}
// ─────────────────────────────────────────────────────────────────────
// Slice D — hyperlink wiring
// ─────────────────────────────────────────────────────────────────────
// composerLinkAllocator assigns relationship ids to the inline hyperlink
// targets discovered by the MD walker. A URL is given an id the first
// time it is seen and the same id on every later request, so repeated
// links to one URL share a single Relationship. Ids are of the form
// "rIdComposer<N>" with N counting up from 1; the distinctive prefix is
// meant to keep them clear of the ids the base already uses.
type composerLinkAllocator struct {
	next  int               // number of ids handed out so far
	byURL map[string]string // URL → allocated rId
	order []string          // URLs in first-seen order
}

// newComposerLinkAllocator returns an allocator with no ids handed out.
func newComposerLinkAllocator() *composerLinkAllocator {
	alloc := new(composerLinkAllocator)
	alloc.byURL = make(map[string]string)
	return alloc
}

// Alloc returns the rId for url, minting a new one if url has not been
// seen before.
func (a *composerLinkAllocator) Alloc(url string) string {
	existing, seen := a.byURL[url]
	if seen {
		return existing
	}
	a.next++
	fresh := fmt.Sprintf("rIdComposer%d", a.next)
	a.order = append(a.order, url)
	a.byURL[url] = fresh
	return fresh
}

// HasLinks reports whether at least one rId has been allocated.
func (a *composerLinkAllocator) HasLinks() bool {
	return len(a.order) != 0
}

// Pairs returns one {rId, URL} entry per allocated URL, in first-seen
// order. The document.xml.rels patcher turns each entry into a
// <Relationship> element. The result is empty but non-nil when nothing
// has been allocated.
func (a *composerLinkAllocator) Pairs() [][2]string {
	out := make([][2]string, len(a.order))
	for i, target := range a.order {
		out[i] = [2]string{a.byURL[target], target}
	}
	return out
}
// patchDocumentXMLRels adds the given {rId, URL} pairs to the
// word/_rels/document.xml.rels entry in parts as external hyperlink
// relationships, inserting them just before the closing
// </Relationships> tag.
//
// When parts has no rels entry, a new one is appended, built from a
// minimal empty Relationships document and stamped with the current
// time. When it has one, that entry's body is overwritten in place.
//
// A pair is skipped when the rels body already contains an attribute
// `Id="<rId>"`. NOTE(review): only the id is compared, not the target, so
// a base that already carries the same id for a different URL keeps its
// original target — confirm that is acceptable for re-composed bases.
//
// Returns the (possibly extended) parts slice; callers must keep the
// returned value because appending may reallocate. An error is returned
// when the rels body has no closing </Relationships> tag.
func patchDocumentXMLRels(parts []baseZipPart, pairs [][2]string) ([]baseZipPart, error) {
	const (
		path          = "word/_rels/document.xml.rels"
		hyperlinkType = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
		closingTag    = "</Relationships>"
	)

	// Locate the existing rels entry, if any.
	relsIdx := -1
	for i := range parts {
		if parts[i].name == path {
			relsIdx = i
			break
		}
	}

	// Start from the existing body, or from an empty wrapper.
	body := `<?xml version="1.0" encoding="UTF-8" standalone="yes"?>` +
		`<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"></Relationships>`
	if relsIdx >= 0 {
		body = string(parts[relsIdx].body)
	}

	var additions strings.Builder
	for _, pair := range pairs {
		rid, target := pair[0], pair[1]
		if strings.Contains(body, `Id="`+rid+`"`) {
			continue
		}
		additions.WriteString(`<Relationship Id="` + xmlAttrEscape(rid) +
			`" Type="` + hyperlinkType +
			`" Target="` + xmlAttrEscape(target) +
			`" TargetMode="External"/>`)
	}
	if additions.Len() == 0 {
		return parts, nil
	}

	at := strings.LastIndex(body, closingTag)
	if at < 0 {
		return parts, fmt.Errorf("submission compose: malformed document.xml.rels (no closing tag)")
	}
	patched := []byte(body[:at] + additions.String() + body[at:])

	if relsIdx < 0 {
		return append(parts, baseZipPart{
			name:    path,
			method:  zip.Deflate,
			modTime: time.Now().Unix(),
			body:    patched,
		}), nil
	}
	parts[relsIdx].body = patched
	return parts, nil
}

View File

@@ -492,14 +492,6 @@ func xmlTextEscape(s string) string {
return s
}
// XMLAttrEscape is the exported form of xmlAttrEscape, used by the
// paliad-side composer (submission_compose.go) when it builds hyperlink
// relationship inserts. It exists so the composer can reuse the exact
// attribute-escaping the walker applies without reaching across the
// package boundary for an unexported helper. Slice 2 folds the
// composer's splice into this package, after which the wrapper retires.
//
// It forwards s to xmlAttrEscape unchanged and returns that result.
func XMLAttrEscape(s string) string { return xmlAttrEscape(s) }
// xmlAttrEscape escapes for safe insertion into an attribute value
// (e.g. `<w:pStyle w:val="…"/>`).
func xmlAttrEscape(s string) string {