diff --git a/pkg/docforge/docx/compose.go b/pkg/docforge/docx/compose.go
index a343d3a..4beb704 100644
--- a/pkg/docforge/docx/compose.go
+++ b/pkg/docforge/docx/compose.go
@@ -240,7 +240,7 @@ var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`)
// the body — from the start of the opening anchor's element
// through the end of the closing anchor's .
type anchorPair struct {
- key string
+ key string
openStart int // start of for the opening anchor
closeEnd int // index just past for the closing anchor
}
@@ -251,10 +251,10 @@ type anchorPair struct {
// span is non-overlapping.
func findAllAnchorPairs(body string) []anchorPair {
type marker struct {
- key string
+ key string
paraStart int
paraEnd int
- isOpen bool
+ isOpen bool
}
var markers []marker
diff --git a/pkg/docforge/docx/exporter.go b/pkg/docforge/docx/exporter.go
new file mode 100644
index 0000000..67c3c3c
--- /dev/null
+++ b/pkg/docforge/docx/exporter.go
@@ -0,0 +1,39 @@
+package docx
+
+import "mgit.msbls.de/m/paliad/pkg/docforge"
+
+// Exporter is the .docx implementation of docforge.Exporter — it renders a
+// neutral Document to OOXML body markup (t-paliad-349 slice 8). The
+// stylemap (block kind → Word paragraph style) and the optional hyperlink
+// allocator are baked in at construction, so RenderBody matches the
+// interface's format-neutral signature.
+//
+// This is the seam a future PDF/HTML exporter slots into: implement
+// docforge.Exporter, no engine change. The submission composer can render
+// section content through this exporter instead of calling
+// RenderDocumentToOOXML directly once a second format exists.
+type Exporter struct {
+ Stylemap map[string]string // block kind → Word paragraph style name
+ Links HyperlinkAllocator // optional; nil renders link labels as plain text
+}
+
+// compile-time conformance.
+var _ docforge.Exporter = Exporter{}
+
+// NewExporter returns a .docx Exporter carrying the given stylemap and hyperlink allocator (links may be nil).
+func NewExporter(stylemap map[string]string, links HyperlinkAllocator) Exporter {
+ return Exporter{Stylemap: stylemap, Links: links}
+}
+
+// Format returns the short format id for this exporter: "docx".
+func (Exporter) Format() string { return "docx" }
+
+// MIMEType returns the MIME type of the assembled .docx container, not of the body fragment RenderBody produces.
+func (Exporter) MIMEType() string {
+ return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+}
+
+// RenderBody renders the Document to OOXML paragraph markup via RenderDocumentToOOXML; the returned error is always nil.
+func (e Exporter) RenderBody(doc docforge.Document) ([]byte, error) {
+ return []byte(RenderDocumentToOOXML(doc, e.Stylemap, e.Links)), nil
+}
diff --git a/pkg/docforge/docx/exporter_test.go b/pkg/docforge/docx/exporter_test.go
new file mode 100644
index 0000000..ad5a48e
--- /dev/null
+++ b/pkg/docforge/docx/exporter_test.go
@@ -0,0 +1,34 @@
+package docx
+
+import (
+ "strings"
+ "testing"
+
+ "mgit.msbls.de/m/paliad/pkg/docforge"
+ "mgit.msbls.de/m/paliad/pkg/docforge/markdown"
+)
+
+func TestExporter_RenderBodyMatchesWalker(t *testing.T) {
+ exp := NewExporter(map[string]string{"paragraph": "Body"}, nil)
+ if exp.Format() != "docx" {
+ t.Errorf("Format = %q; want docx", exp.Format())
+ }
+ if !strings.Contains(exp.MIMEType(), "wordprocessingml.document") {
+ t.Errorf("MIMEType = %q", exp.MIMEType())
+ }
+
+ md := "Hello **world**\n\n- item" // one paragraph with bold text, then one bullet item
+ // The Exporter must produce exactly what the walker entry point does
+ // for the same input (both go through markdown.Import → RenderDocumentToOOXML).
+ body, err := exp.RenderBody(markdown.Import(md))
+ if err != nil {
+ t.Fatalf("RenderBody: %v", err)
+ }
+ want := RenderMarkdownToOOXMLWithStyles(md, map[string]string{"paragraph": "Body"}, nil)
+ if string(body) != want {
+ t.Errorf("RenderBody mismatch:\n got %q\nwant %q", body, want)
+ }
+}
+
+// Compile-time check that Exporter satisfies docforge.Exporter (duplicates the one in exporter.go).
+var _ docforge.Exporter = Exporter{}
diff --git a/pkg/docforge/docx/markdown.go b/pkg/docforge/docx/markdown.go
index 812b01a..d2585ac 100644
--- a/pkg/docforge/docx/markdown.go
+++ b/pkg/docforge/docx/markdown.go
@@ -1,249 +1,78 @@
package docx
-// Markdown → OOXML walker for Composer section content (t-paliad-313
-// Slice B, design doc §9.2).
+// Markdown → OOXML rendering for Composer section content (t-paliad-313
+// Slice B/D; restructured in t-paliad-349 slice 8).
//
-// Scope per the head's Slice B brief: paragraphs + inline bold/italic
-// only. Headings, lists, blockquote, links land in Slice D's rich-prose
-// pass. This walker is intentionally minimal — every Markdown construct
-// it doesn't recognise is rendered as a plain paragraph so the lawyer's
-// prose round-trips losslessly even when they hit Markdown the walker
-// doesn't yet understand.
+// Parsing now lives in pkg/docforge/markdown, which produces the neutral
+// docforge.Document. This file renders that Document into OOXML paragraph
+// elements (<w:p>…</w:p>) ready to splice into a .docx body. There is one
+// Markdown parser for docforge; this is the .docx exporter for its model.
//
-// The output uses the base's stylemap.paragraph entry for the
-// on each paragraph so the styling matches the base's
-// typography (HLpat-Body-B0 on the HLC base, Normal on the neutral
-// base, etc.).
-//
-// Placeholders ({{path.dot.notation}}) are preserved verbatim — they
-// pass through the walker untouched and get substituted by the v1
-// SubmissionRenderer's placeholder pass after the composer assembly.
-//
-// Grammar supported:
-//
-// - Blank line → paragraph break
-// - `**bold**` → …
-// - `*italic*` or `_italic_` → …
-// - Otherwise → plain text run
+// Output uses the base's stylemap entry for each block kind on the
+// <w:pStyle>, so styling matches the base's typography (HLpat-Body-B0 on
+// the HLC base, Normal on the neutral base, etc.). Placeholders ({{key}})
+// ride through as literal run text and are substituted by the placeholder
+// pass after assembly.
import (
- "fmt"
+ "strconv"
"strings"
+
+ "mgit.msbls.de/m/paliad/pkg/docforge"
+ "mgit.msbls.de/m/paliad/pkg/docforge/markdown"
)
-// HyperlinkAllocator hands the walker a `rId` for each external URL
-// it encounters in `[label](url)` inline links. The composer's
-// post-pass uses these allocations to mutate
-// `word/_rels/document.xml.rels` so the emitted `` elements resolve correctly. Pass nil to drop links to
-// plain text (the label survives, the URL doesn't render).
-//
-// t-paliad-316 Slice D.
+// HyperlinkAllocator hands the renderer a `rId` for each external URL it
+// encounters in `[label](url)` inline links. The composer's post-pass uses
+// these allocations to mutate `word/_rels/document.xml.rels` so the emitted
+// `<w:hyperlink r:id="…">` elements resolve. Pass nil to drop links to
+// plain text (the label survives, the URL doesn't render). t-paliad-316.
type HyperlinkAllocator func(url string) string
-// RenderMarkdownToOOXML renders the given Markdown source into OOXML
-// paragraph elements (`…`), suitable for splicing into a
-// .docx body. Each paragraph carries ``
-// when paragraphStyle is non-empty.
-//
-// Slice B shipped paragraphs + bold/italic. Slice D extends to
-// headings (h1/h2/h3), bullet/numbered lists, blockquote, and inline
-// hyperlinks via the optional HyperlinkAllocator.
-//
-// stylemap supplies the paragraph-style names for each kind:
-// stylemap["paragraph"] — default body
-// stylemap["heading_1/2/3"] — heading levels
-// stylemap["list_bullet"] — bullet list paragraph style
-// stylemap["list_numbered"] — numbered list paragraph style
-// stylemap["blockquote"] — blockquote
-// Missing entries fall back to the "paragraph" style.
-//
-// Empty input renders one empty paragraph so the splice site is
-// well-formed even when the lawyer hasn't typed anything in this
-// section.
+// RenderMarkdownToOOXML renders Markdown into OOXML paragraphs with a
+// single paragraph style. Slice B back-compat wrapper.
func RenderMarkdownToOOXML(md, paragraphStyle string) string {
return RenderMarkdownToOOXMLWithStyles(md, map[string]string{"paragraph": paragraphStyle}, nil)
}
-// RenderMarkdownToOOXMLWithStyles is the full Slice-D-aware entry
-// point. Slice B's RenderMarkdownToOOXML is a wrapper for back-compat.
+// RenderMarkdownToOOXMLWithStyles parses Markdown into a docforge.Document
+// and renders it to OOXML. stylemap maps each block kind (paragraph,
+// heading_1/2/3, list_bullet, list_numbered, blockquote) to a Word
+// paragraph style; missing entries fall back to the "paragraph" style.
func RenderMarkdownToOOXMLWithStyles(md string, stylemap map[string]string, links HyperlinkAllocator) string {
+ return RenderDocumentToOOXML(markdown.Import(md), stylemap, links)
+}
+
+// RenderDocumentToOOXML renders a neutral Document to OOXML paragraphs —
+// the .docx side of the docforge importer→model→exporter pipeline. Any
+// Document (Markdown today, a foreign-doc importer later) renders the same
+// way.
+func RenderDocumentToOOXML(doc docforge.Document, stylemap map[string]string, links HyperlinkAllocator) string {
defaultStyle := stylemap["paragraph"]
- if md == "" {
- return emptyParagraph(defaultStyle)
- }
- blocks := splitMarkdownBlocks(md)
- if len(blocks) == 0 {
- return emptyParagraph(defaultStyle)
- }
// Numbered-list counter resets on every non-numbered block so
- // "1. A\n2. B\n\n1. C" renders as 1./2./1. (the lawyer's input
- // determined the ordinal, the walker just renders).
- numberedCounter := 0
+ // "1. A\n2. B\n\n1. C" renders 1./2./1. — the input determined the
+ // ordinal, the renderer just emits it.
+ numbered := 0
var b strings.Builder
- for _, blk := range blocks {
- style := stylemap[blk.styleKey]
+ for _, blk := range doc.Blocks {
+ style := stylemap[string(blk.Kind)]
if style == "" {
style = defaultStyle
}
- if blk.styleKey == "list_numbered" {
- numberedCounter++
+ if blk.Kind == docforge.KindListNumbered {
+ numbered++
} else {
- numberedCounter = 0
+ numbered = 0
}
- b.WriteString(renderBlockParagraph(blk, style, links, numberedCounter))
+ b.WriteString(renderBlock(blk, style, links, numbered))
}
return b.String()
}
-// mdBlock is one rendered paragraph: a kind (paragraph / heading_*
-// / list_bullet / list_numbered / blockquote) and the inline content
-// text. List markers, heading hashes, blockquote `> ` etc. are
-// stripped from the text before storage.
-type mdBlock struct {
- styleKey string // "paragraph" | "heading_1" | "heading_2" | "heading_3" | "list_bullet" | "list_numbered" | "blockquote"
- text string
-}
-
-// splitMarkdownBlocks parses the source into a sequence of blocks,
-// detecting heading / list / blockquote prefixes line-by-line. Blank
-// lines split paragraph runs (same semantics as splitMarkdownParagraphs)
-// but each line is also tagged with its block kind.
-//
-// Lines that look like block markers don't merge with their neighbours
-// even across blank lines — every list / heading / blockquote line is
-// its own block in the output. A run of unmarked lines collapses into
-// one "paragraph" block (so soft line breaks inside a paragraph still
-// concatenate).
-//
-// CRLF normalised to LF before parsing.
-func splitMarkdownBlocks(md string) []mdBlock {
- normalised := strings.ReplaceAll(md, "\r\n", "\n")
- lines := strings.Split(normalised, "\n")
- var blocks []mdBlock
- var pendingPara []string
- blankRun := 0
-
- flushPara := func() {
- if len(pendingPara) > 0 {
- blocks = append(blocks, mdBlock{styleKey: "paragraph", text: strings.Join(pendingPara, "\n")})
- pendingPara = nil
- }
- }
-
- for _, raw := range lines {
- line := raw
- if strings.TrimSpace(line) == "" {
- if len(pendingPara) > 0 {
- flushPara()
- blankRun = 1
- continue
- }
- blankRun++
- continue
- }
- // Detect heading / list / blockquote markers BEFORE we accumulate
- // into the paragraph buffer.
- kind, payload, ok := detectBlockMarker(line)
- if ok {
- flushPara()
- // Emit spacing paragraphs equivalent to (blankRun - 1) extra.
- for i := 1; i < blankRun; i++ {
- blocks = append(blocks, mdBlock{styleKey: "paragraph", text: ""})
- }
- blankRun = 0
- blocks = append(blocks, mdBlock{styleKey: kind, text: payload})
- continue
- }
- // Plain paragraph line.
- if len(pendingPara) == 0 {
- // Starting a new paragraph after a blank run — emit
- // (blankRun-1) extra empty paragraphs for vertical spacing.
- for i := 1; i < blankRun; i++ {
- blocks = append(blocks, mdBlock{styleKey: "paragraph", text: ""})
- }
- }
- blankRun = 0
- pendingPara = append(pendingPara, line)
- }
- flushPara()
- return blocks
-}
-
-// detectBlockMarker classifies a single line. Returns (styleKey,
-// payload-with-marker-stripped, true) for recognised markers; false
-// for plain paragraph lines.
-//
-// Recognised markers (Slice D):
-// # Heading → heading_1
-// ## Heading → heading_2
-// ### Heading → heading_3
-// - item / * item → list_bullet
-// 1. item / 2. item ... → list_numbered (any positive integer)
-// > quote → blockquote
-//
-// Leading whitespace inside the line is tolerated up to 3 spaces (per
-// CommonMark) so the lawyer's contentEditable indentation doesn't
-// hide the marker.
-func detectBlockMarker(line string) (string, string, bool) {
- trimmed := strings.TrimLeft(line, " ")
- // Cap to 3 spaces of leading indent — beyond that, treat as a
- // regular paragraph line (matches CommonMark).
- if len(line)-len(trimmed) > 3 {
- return "", "", false
- }
- if strings.HasPrefix(trimmed, "### ") {
- return "heading_3", strings.TrimSpace(trimmed[4:]), true
- }
- if strings.HasPrefix(trimmed, "## ") {
- return "heading_2", strings.TrimSpace(trimmed[3:]), true
- }
- if strings.HasPrefix(trimmed, "# ") {
- return "heading_1", strings.TrimSpace(trimmed[2:]), true
- }
- if strings.HasPrefix(trimmed, "> ") {
- return "blockquote", strings.TrimSpace(trimmed[2:]), true
- }
- if strings.HasPrefix(trimmed, "- ") || strings.HasPrefix(trimmed, "* ") {
- return "list_bullet", strings.TrimSpace(trimmed[2:]), true
- }
- // Numbered: "N. " where N is one or more digits.
- if i := indexOfNumberedMarker(trimmed); i > 0 {
- return "list_numbered", strings.TrimSpace(trimmed[i:]), true
- }
- return "", "", false
-}
-
-// indexOfNumberedMarker checks for "N. " or "N) " at the start of the
-// trimmed line; returns the byte index just past the marker, or -1 if
-// no marker present.
-func indexOfNumberedMarker(s string) int {
- i := 0
- for i < len(s) && s[i] >= '0' && s[i] <= '9' {
- i++
- }
- if i == 0 {
- return -1
- }
- if i >= len(s) {
- return -1
- }
- if s[i] != '.' && s[i] != ')' {
- return -1
- }
- if i+1 >= len(s) || s[i+1] != ' ' {
- return -1
- }
- return i + 2
-}
-
-// renderBlockParagraph emits one `` for a block. List blocks
-// keep the same paragraph style as a default paragraph (the Slice D
-// design's contract — list styles come from the base's stylemap and
-// Word's numbering.xml is honoured by adding a leading bullet/number
-// prefix in the rendered text). This keeps the composer free of
-// numbering.xml mutations.
-func renderBlockParagraph(blk mdBlock, paragraphStyle string, links HyperlinkAllocator, numberedOrdinal int) string {
+// renderBlock emits one <w:p> for a block. List blocks get a visible
+// "• " / "N. " prefix run (the base stylemap handles indentation if it
+// defines a list style; the prefix at least surfaces the structure).
+func renderBlock(blk docforge.Block, paragraphStyle string, links HyperlinkAllocator, numberedOrdinal int) string {
var b strings.Builder
b.WriteString(``)
if paragraphStyle != "" {
@@ -251,110 +80,61 @@ func renderBlockParagraph(blk mdBlock, paragraphStyle string, links HyperlinkAll
b.WriteString(xmlAttrEscape(paragraphStyle))
b.WriteString(`"/>`)
}
- if blk.text == "" {
- b.WriteString(``)
- b.WriteString(``)
+ // An empty block is an intentional empty paragraph: one empty run.
+ if len(blk.Spans) == 0 {
+ b.WriteString(``)
return b.String()
}
- text := blk.text
- // List blocks emit a visible "• " / "N. " prefix run. The
- // stylemap entry handles paragraph indentation if the base
- // defines a list paragraph style; otherwise the prefix at least
- // surfaces the structure in plain Word. Lawyers who want Word's
- // auto-numbering reapply a list style post-export.
- switch blk.styleKey {
- case "list_bullet":
+ switch blk.Kind {
+ case docforge.KindListBullet:
b.WriteString(`• `)
- case "list_numbered":
+ case docforge.KindListNumbered:
ordinal := numberedOrdinal
if ordinal <= 0 {
ordinal = 1
}
b.WriteString(``)
- b.WriteString(fmt.Sprintf("%d. ", ordinal))
- b.WriteString(``)
+ b.WriteString(strconv.Itoa(ordinal))
+ b.WriteString(`. `)
}
- for _, run := range parseInlineRuns(text, links) {
- b.WriteString(run)
+ for _, span := range blk.Spans {
+ b.WriteString(renderInlineSpan(span, links))
}
b.WriteString(``)
return b.String()
}
-// parseInlineRuns extracts inline spans + hyperlink runs and serialises
-// each to OOXML. Hyperlinks become `…runs…`
-// where RID comes from the HyperlinkAllocator.
-func parseInlineRuns(text string, links HyperlinkAllocator) []string {
- // Phase 1: find all hyperlink spans `[label](url)` and split the
- // text around them.
- type segment struct {
- text string
- isLink bool
- url string
- }
- var segs []segment
- rest := text
- for {
- idx := strings.Index(rest, "[")
- if idx < 0 {
- if rest != "" {
- segs = append(segs, segment{text: rest})
- }
- break
- }
- // Find matching closing bracket, then a "(" right after.
- closeBracket := strings.Index(rest[idx:], "](")
- if closeBracket < 0 {
- segs = append(segs, segment{text: rest})
- break
- }
- closeParen := strings.Index(rest[idx+closeBracket:], ")")
- if closeParen < 0 {
- segs = append(segs, segment{text: rest})
- break
- }
- // idx = start of "["
- // idx+closeBracket = position of "]"
- // idx+closeBracket+1 = position of "("
- // idx+closeBracket+closeParen = position of ")"
- label := rest[idx+1 : idx+closeBracket]
- url := rest[idx+closeBracket+2 : idx+closeBracket+closeParen]
- if idx > 0 {
- segs = append(segs, segment{text: rest[:idx]})
- }
- segs = append(segs, segment{text: label, isLink: true, url: url})
- rest = rest[idx+closeBracket+closeParen+1:]
- }
-
- var runs []string
- for _, seg := range segs {
- if seg.isLink && links != nil {
- rid := links(seg.url)
- if rid != "" {
+// renderInlineSpan emits one span. A hyperlink span (Link != "") becomes a
+// <w:hyperlink r:id="…"> wrapping its children when an allocator yields a
+// rId; otherwise the label children render as plain runs (URL dropped).
+func renderInlineSpan(span docforge.InlineSpan, links HyperlinkAllocator) string {
+ if span.Link != "" {
+ if links != nil {
+ if rid := links(span.Link); rid != "" {
var hb strings.Builder
hb.WriteString(``)
- for _, span := range parseInlineSpans(seg.text) {
- hb.WriteString(renderRunWithLinkStyle(span))
+ for _, child := range span.Children {
+ hb.WriteString(renderRunWithLinkStyle(child))
}
hb.WriteString(``)
- runs = append(runs, hb.String())
- continue
+ return hb.String()
}
}
- for _, span := range parseInlineSpans(seg.text) {
- runs = append(runs, renderRun(span))
+ // No allocator / no rId — render the label as plain runs.
+ var fb strings.Builder
+ for _, child := range span.Children {
+ fb.WriteString(renderRun(child))
}
+ return fb.String()
}
- return runs
+ return renderRun(span)
}
-// renderRunWithLinkStyle emits a hyperlink child run. Same B/I support
-// as renderRun, but additionally tags the run with the "Hyperlink"
-// character style (Word's built-in) so the link renders in the
-// document's hyperlink colour + underline.
-func renderRunWithLinkStyle(span inlineSpan) string {
+// renderRunWithLinkStyle emits a hyperlink child run with Word's built-in
+// "Hyperlink" character style (colour + underline), plus B/I.
+func renderRunWithLinkStyle(span docforge.InlineSpan) string {
var b strings.Builder
b.WriteString(``)
if span.Bold {
@@ -369,85 +149,8 @@ func renderRunWithLinkStyle(span inlineSpan) string {
return b.String()
}
-// inlineSpan is one piece of inline content: a text payload plus
-// formatting flags. Bold and italic are independent — `***both***`
-// produces one span with both flags set.
-type inlineSpan struct {
- Text string
- Bold bool
- Italic bool
-}
-
-// parseInlineSpans tokenises Markdown inline formatting into runs of
-// (text, bold, italic). The grammar is intentionally narrow:
-//
-// - `**…**` → bold
-// - `__…__` → bold (Markdown alternate)
-// - `*…*` → italic
-// - `_…_` → italic (Markdown alternate)
-// - Anything else flows through as plain text.
-//
-// Unbalanced delimiters fall through as literal characters — the
-// walker never errors on malformed Markdown. Nested formatting (e.g.
-// `**bold *bold-italic* bold**`) toggles flags as it walks.
-func parseInlineSpans(text string) []inlineSpan {
- var out []inlineSpan
- var cur strings.Builder
- bold := false
- italic := false
- flush := func() {
- if cur.Len() == 0 {
- return
- }
- out = append(out, inlineSpan{Text: cur.String(), Bold: bold, Italic: italic})
- cur.Reset()
- }
- i := 0
- n := len(text)
- for i < n {
- // Preserve {{...}} placeholders verbatim. Underscores and
- // other Markdown-significant chars inside a placeholder key
- // (e.g. {{project.case_number}}) must not be interpreted as
- // bold/italic delimiters — otherwise the key gets stripped of
- // its underscores and the v1 placeholder pass looks up the
- // wrong key, surfacing [KEIN WERT: project.casenumber] in the
- // preview.
- if i+1 < n && text[i] == '{' && text[i+1] == '{' {
- rel := strings.Index(text[i+2:], "}}")
- if rel >= 0 {
- end := i + 2 + rel + 2
- cur.WriteString(text[i:end])
- i = end
- continue
- }
- // Unmatched {{ — fall through to plain character handling.
- }
- // Bold delimiters first (longer match wins over italic).
- if i+1 < n && (text[i:i+2] == "**" || text[i:i+2] == "__") {
- flush()
- bold = !bold
- i += 2
- continue
- }
- if text[i] == '*' || text[i] == '_' {
- flush()
- italic = !italic
- i++
- continue
- }
- cur.WriteByte(text[i])
- i++
- }
- flush()
- if len(out) == 0 {
- out = append(out, inlineSpan{Text: ""})
- }
- return out
-}
-
-// renderRun emits one `` element for an inline span. Empty text
-// spans render as empty runs (Word accepts them; they're harmless).
-func renderRun(span inlineSpan) string {
+// renderRun emits one <w:r> for a plain (text/bold/italic) span.
+func renderRun(span docforge.InlineSpan) string {
var b strings.Builder
b.WriteString(``)
if span.Bold || span.Italic {
@@ -466,34 +169,16 @@ func renderRun(span inlineSpan) string {
return b.String()
}
-// emptyParagraph returns one empty `` with the given style. Used
-// when a section's content_md is empty so the splice site stays
-// well-formed.
-func emptyParagraph(paragraphStyle string) string {
- var b strings.Builder
- b.WriteString(``)
- if paragraphStyle != "" {
- b.WriteString(``)
- }
- b.WriteString(``)
- return b.String()
-}
-
-// xmlTextEscape escapes the five XML-significant characters for safe
-// insertion into content. & first to avoid double-encoding.
+// xmlTextEscape escapes the XML-significant characters for <w:t> content.
+// Quotes/apostrophes are legal in element text — not escaped.
func xmlTextEscape(s string) string {
s = strings.ReplaceAll(s, "&", "&")
s = strings.ReplaceAll(s, "<", "<")
s = strings.ReplaceAll(s, ">", ">")
- // Quotes and apostrophes are legal inside element text content;
- // no need to escape them here.
return s
}
-// xmlAttrEscape escapes for safe insertion into an attribute value
-// (e.g. ``).
+// xmlAttrEscape escapes for an attribute value (e.g. <w:pStyle w:val="…"/>).
func xmlAttrEscape(s string) string {
s = strings.ReplaceAll(s, "&", "&")
s = strings.ReplaceAll(s, "<", "<")
diff --git a/pkg/docforge/docx/markdown_test.go b/pkg/docforge/docx/markdown_test.go
index 3212c23..b7c6af2 100644
--- a/pkg/docforge/docx/markdown_test.go
+++ b/pkg/docforge/docx/markdown_test.go
@@ -112,46 +112,6 @@ func TestRenderMarkdownToOOXML_PlaceholderUnderscoresPreserved(t *testing.T) {
}
}
-func TestParseInlineSpans_PlaceholderWithUnderscoresIsLiteral(t *testing.T) {
- // Direct guard on the inline scanner. {{project.case_number}} must
- // emit as a single non-italic span containing the full placeholder.
- spans := parseInlineSpans("{{project.case_number}}")
- if len(spans) != 1 {
- t.Fatalf("expected 1 span; got %d (%+v)", len(spans), spans)
- }
- if spans[0].Italic || spans[0].Bold {
- t.Errorf("placeholder must not be italic/bold; got %+v", spans[0])
- }
- if spans[0].Text != "{{project.case_number}}" {
- t.Errorf("placeholder text corrupted: got %q", spans[0].Text)
- }
-}
-
-func TestParseInlineSpans_ItalicAroundPlaceholder(t *testing.T) {
- // Italic delimiters outside a placeholder still work; the placeholder
- // itself stays literal even when it sits between italics.
- spans := parseInlineSpans("_before_ {{x.y_z}} _after_")
- var saw struct {
- italicBefore bool
- placeholder bool
- italicAfter bool
- }
- for _, s := range spans {
- if s.Italic && s.Text == "before" {
- saw.italicBefore = true
- }
- if !s.Italic && !s.Bold && strings.Contains(s.Text, "{{x.y_z}}") {
- saw.placeholder = true
- }
- if s.Italic && s.Text == "after" {
- saw.italicAfter = true
- }
- }
- if !saw.italicBefore || !saw.placeholder || !saw.italicAfter {
- t.Errorf("expected italic/placeholder/italic structure; got %+v", spans)
- }
-}
-
// extractPlaceholders pulls every {{...}} occurrence out of a Markdown
// source. Tiny helper, only used by the regression test above.
func extractPlaceholders(s string) []string {
@@ -196,39 +156,6 @@ func TestRenderMarkdownToOOXML_CRLFNormalisation(t *testing.T) {
}
}
-func TestParseInlineSpans_Plain(t *testing.T) {
- spans := parseInlineSpans("hello world")
- if len(spans) != 1 || spans[0].Bold || spans[0].Italic || spans[0].Text != "hello world" {
- t.Errorf("expected single plain span; got %+v", spans)
- }
-}
-
-func TestParseInlineSpans_UnderscoreItalic(t *testing.T) {
- spans := parseInlineSpans("_emph_")
- var italicHits int
- for _, s := range spans {
- if s.Italic && s.Text == "emph" {
- italicHits++
- }
- }
- if italicHits != 1 {
- t.Errorf("expected one italic 'emph' span; got %+v", spans)
- }
-}
-
-func TestParseInlineSpans_UnderscoreBold(t *testing.T) {
- spans := parseInlineSpans("__strong__")
- var boldHits int
- for _, s := range spans {
- if s.Bold && s.Text == "strong" {
- boldHits++
- }
- }
- if boldHits != 1 {
- t.Errorf("expected one bold 'strong' span; got %+v", spans)
- }
-}
-
// ─────────────────────────────────────────────────────────────────────
// Slice D — rich-prose constructs
// ─────────────────────────────────────────────────────────────────────
@@ -349,35 +276,3 @@ func TestRenderMarkdownToOOXML_HyperlinkNilAllocatorFallsBackToPlain(t *testing.
t.Errorf("hyperlink emitted without allocator: %q", out)
}
}
-
-func TestDetectBlockMarker(t *testing.T) {
- cases := []struct {
- in string
- kind string
- want string
- ok bool
- }{
- {"# A", "heading_1", "A", true},
- {"## B", "heading_2", "B", true},
- {"### C", "heading_3", "C", true},
- {" # indented", "heading_1", "indented", true}, // up to 3 spaces tolerated
- {" # too-deep", "", "", false}, // 4 spaces → not a heading
- {"- bullet", "list_bullet", "bullet", true},
- {"* star", "list_bullet", "star", true},
- {"1. one", "list_numbered", "one", true},
- {"42. forty-two", "list_numbered", "forty-two", true},
- {"1) paren", "list_numbered", "paren", true},
- {"1.no-space", "", "", false}, // ordinal needs trailing space
- {"> quote", "blockquote", "quote", true},
- {"plain", "", "", false},
- {"#nospace", "", "", false}, // heading needs space after hash
- }
- for _, tc := range cases {
- t.Run(tc.in, func(t *testing.T) {
- kind, payload, ok := detectBlockMarker(tc.in)
- if ok != tc.ok || kind != tc.kind || payload != tc.want {
- t.Errorf("detectBlockMarker(%q) = (%q,%q,%v); want (%q,%q,%v)", tc.in, kind, payload, ok, tc.kind, tc.want, tc.ok)
- }
- })
- }
-}
diff --git a/pkg/docforge/exporter.go b/pkg/docforge/exporter.go
new file mode 100644
index 0000000..fd09030
--- /dev/null
+++ b/pkg/docforge/exporter.go
@@ -0,0 +1,22 @@
+package docforge
+
+// Exporter renders a neutral Document into a target format's body markup.
+// docforge owns the interface; each format adapter implements it (the
+// .docx adapter in pkg/docforge/docx today; .pdf/.html/.md are future
+// siblings — PRD §4 B4: interface now, docx-only impl). Format-specific
+// configuration (a stylemap, a hyperlink allocator for .docx) is baked into
+// the concrete exporter at construction, so the interface stays
+// format-neutral.
+//
+// "Body markup" is the renderable content fragment, not a complete file —
+// for .docx it is the OOXML paragraph markup the composer splices into a carrier.
+// Container concerns (MIME type, packaging) are described by Format /
+// MIMEType and handled by the assembling layer.
+type Exporter interface {
+ // Format is the short format id, e.g. "docx".
+ Format() string
+ // MIMEType is the container MIME type the assembled document carries.
+ MIMEType() string
+ // RenderBody renders the document to the format's body markup.
+ RenderBody(doc Document) ([]byte, error)
+}
diff --git a/pkg/docforge/markdown/importer.go b/pkg/docforge/markdown/importer.go
new file mode 100644
index 0000000..a97781d
--- /dev/null
+++ b/pkg/docforge/markdown/importer.go
@@ -0,0 +1,230 @@
+// Package markdown imports Markdown source into the neutral
+// docforge.Document model (PRD §3.2 / §4 P4 — Markdown is the primary
+// input format). It is the single Markdown parser for docforge: the .docx
+// renderer consumes the Document this produces, so block-splitting and
+// inline tokenisation live here, not in the format adapter.
+//
+// Grammar (intentionally narrow — unrecognised syntax flows through as a
+// plain paragraph, so lawyer prose never errors):
+//
+//	blank line              → paragraph break
+//	# / ## / ### Heading    → heading_1 / 2 / 3
+//	- item / * item         → bullet list item
+//	N. item / N) item       → numbered list item
+//	> quote                 → blockquote
+//	**x** / __x__           → bold
+//	*x* / _x_               → italic
+//	[label](url)            → hyperlink
+//	{{key}}                 → preserved verbatim (substituted downstream)
+package markdown
+
+import (
+ "strings"
+
+ "mgit.msbls.de/m/paliad/pkg/docforge"
+)
+
+// Import converts Markdown source into the neutral Document model. Source with
+// no blocks (empty or blank-only) yields one paragraph block with nil Spans.
+func Import(md string) docforge.Document {
+	raw := splitBlocks(md)
+	if len(raw) == 0 {
+		empty := docforge.Block{Kind: docforge.KindParagraph}
+		return docforge.Document{Blocks: []docforge.Block{empty}}
+	}
+	converted := make([]docforge.Block, len(raw))
+	for i := range raw {
+		converted[i].Kind = docforge.BlockKind(raw[i].kind)
+		// Blocks without text keep nil Spans; only non-empty text is
+		// run through the inline parser.
+		if text := raw[i].text; text != "" {
+			converted[i].Spans = parseInline(text)
+		}
+	}
+	return docforge.Document{Blocks: converted}
+}
+
+// rawBlock is one block as produced by splitBlocks: its kind and its text
+// with any block marker removed. Import casts kind to docforge.BlockKind.
+type rawBlock struct {
+ kind string
+ text string
+}
+
+// splitBlocks breaks Markdown source into raw blocks, line by line, after
+// rewriting "\r\n" to "\n". A line detectBlockMarker recognises becomes its
+// own block. Consecutive other non-blank lines are gathered into a single
+// "paragraph" block, joined with "\n". In a run of blank lines the first only
+// separates; each further one adds an empty "paragraph" block ahead of the
+// next content line. Blank lines with no content after them add nothing.
+func splitBlocks(md string) []rawBlock {
+	var blocks []rawBlock
+	var para []string // lines of the paragraph currently being gathered
+	blanks := 0       // blank lines seen since the last content line
+	// closePara emits the gathered paragraph, if any, as a single block.
+	closePara := func() {
+		if len(para) == 0 {
+			return
+		}
+		blocks = append(blocks, rawBlock{kind: "paragraph", text: strings.Join(para, "\n")})
+		para = nil
+	}
+	// emitGap adds one empty paragraph per blank line beyond the first.
+	emitGap := func() {
+		for n := blanks - 1; n > 0; n-- {
+			blocks = append(blocks, rawBlock{kind: "paragraph"})
+		}
+	}
+	for _, line := range strings.Split(strings.ReplaceAll(md, "\r\n", "\n"), "\n") {
+		if strings.TrimSpace(line) == "" {
+			// Directly after paragraph text, the run restarts at one.
+			if len(para) > 0 {
+				closePara()
+				blanks = 0
+			}
+			blanks++
+			continue
+		}
+		kind, payload, marked := detectBlockMarker(line)
+		if marked || len(para) == 0 {
+			closePara()
+			emitGap()
+		}
+		blanks = 0
+		if marked {
+			blocks = append(blocks, rawBlock{kind: kind, text: payload})
+			continue
+		}
+		para = append(para, line)
+	}
+	closePara()
+	return blocks
+}
+
+// detectBlockMarker reports the block kind and trimmed payload of a marked
+// line. Lines indented by more than 3 spaces, or unmarked, return ok=false.
+func detectBlockMarker(line string) (kind, payload string, ok bool) {
+	body := strings.TrimLeft(line, " ")
+	if indent := len(line) - len(body); indent > 3 {
+		return "", "", false
+	}
+	for _, p := range [...]struct{ marker, kind string }{
+		{"### ", "heading_3"},
+		{"## ", "heading_2"},
+		{"# ", "heading_1"},
+		{"> ", "blockquote"},
+		{"- ", "list_bullet"},
+		{"* ", "list_bullet"},
+	} {
+		if strings.HasPrefix(body, p.marker) {
+			return p.kind, strings.TrimSpace(body[len(p.marker):]), true
+		}
+	}
+	if end := indexOfNumberedMarker(body); end > 0 {
+		return "list_numbered", strings.TrimSpace(body[end:]), true
+	}
+	return "", "", false
+}
+
+// indexOfNumberedMarker looks for a leading run of ASCII digits followed by
+// "." or ")" and then a space. It returns the offset of the first byte after
+// that space, or -1 if s does not begin that way.
+func indexOfNumberedMarker(s string) int {
+	digits := 0
+	for digits < len(s) && '0' <= s[digits] && s[digits] <= '9' {
+		digits++
+	}
+	// Need at least one digit plus two more bytes: delimiter and space.
+	if digits == 0 || digits+2 > len(s) {
+		return -1
+	}
+	delim, after := s[digits], s[digits+1]
+	if (delim != '.' && delim != ')') || after != ' ' {
+		return -1
+	}
+	return digits + 2
+}
+
+// parseInline turns block text into inline spans. Each "[label](url)" match
+// becomes one span with Link set to the url and the parsed label as Children;
+// text ahead of a match goes through parseSpans. A match runs from the first
+// "[" to the next "](" and from there to the next ")".
+func parseInline(text string) []docforge.InlineSpan {
+	var spans []docforge.InlineSpan
+	for remaining := text; remaining != ""; {
+		open, sep, end := strings.Index(remaining, "["), -1, -1
+		if open >= 0 {
+			if rel := strings.Index(remaining[open:], "]("); rel >= 0 {
+				sep = open + rel
+			}
+		}
+		if sep >= 0 {
+			if rel := strings.Index(remaining[sep:], ")"); rel >= 0 {
+				end = sep + rel
+			}
+		}
+		// Without a complete link ahead, the whole tail is ordinary spans
+		// and scanning stops.
+		if end < 0 {
+			return append(spans, parseSpans(remaining)...)
+		}
+		if open > 0 {
+			spans = append(spans, parseSpans(remaining[:open])...)
+		}
+		spans = append(spans, docforge.InlineSpan{
+			Link:     remaining[sep+2 : end],
+			Children: parseSpans(remaining[open+1 : sep]),
+		})
+		remaining = remaining[end+1:]
+	}
+	return spans
+}
+
+// parseSpans splits text into runs by emphasis state: "**" or "__" toggles
+// bold, a single "*" or "_" toggles italic, and the delimiters themselves are
+// dropped. A "{{...}}" placeholder that has its closing "}}" is copied through
+// untouched. Text that produces no run at all yields one empty span.
+func parseSpans(text string) []docforge.InlineSpan {
+	var (
+		spans        []docforge.InlineSpan
+		pending      strings.Builder
+		bold, italic bool
+	)
+	// emit closes the pending run, tagging it with the current emphasis.
+	emit := func() {
+		if pending.Len() > 0 {
+			spans = append(spans, docforge.InlineSpan{Text: pending.String(), Bold: bold, Italic: italic})
+			pending.Reset()
+		}
+	}
+	for pos := 0; pos < len(text); {
+		tail := text[pos:]
+		closeAt := -1
+		if strings.HasPrefix(tail, "{{") {
+			closeAt = strings.Index(tail[2:], "}}")
+		}
+		switch {
+		case closeAt >= 0:
+			// Copy a closed placeholder whole so delimiters in it stay literal.
+			stop := closeAt + 4
+			pending.WriteString(tail[:stop])
+			pos += stop
+		case strings.HasPrefix(tail, "**"), strings.HasPrefix(tail, "__"):
+			emit()
+			bold = !bold
+			pos += 2
+		case tail[0] == '*' || tail[0] == '_':
+			emit()
+			italic = !italic
+			pos++
+		default:
+			pending.WriteByte(tail[0])
+			pos++
+		}
+	}
+	emit()
+	if len(spans) == 0 {
+		spans = append(spans, docforge.InlineSpan{Text: ""})
+	}
+	return spans
+}
diff --git a/pkg/docforge/markdown/importer_test.go b/pkg/docforge/markdown/importer_test.go
new file mode 100644
index 0000000..372e168
--- /dev/null
+++ b/pkg/docforge/markdown/importer_test.go
@@ -0,0 +1,145 @@
+package markdown
+
+import (
+ "strings"
+ "testing"
+)
+
+// Inline-span + block-marker tests, relocated from the docx walker when
+// parsing moved here (t-paliad-349 slice 8). parseSpans is the inline
+// tokeniser; detectBlockMarker classifies a line.
+
+func TestParseSpans_PlaceholderWithUnderscoresIsLiteral(t *testing.T) {
+	// Regression guard for b78a984: underscores in a placeholder key are text.
+	got := parseSpans("{{project.case_number}}")
+	if n := len(got); n != 1 {
+		t.Fatalf("expected 1 span; got %d (%+v)", n, got)
+	}
+	only := got[0]
+	if only.Bold || only.Italic {
+		t.Errorf("placeholder must not be italic/bold; got %+v", only)
+	}
+	if only.Text != "{{project.case_number}}" {
+		t.Errorf("placeholder text corrupted: got %q", only.Text)
+	}
+}
+
+// TestParseSpans_ItalicAroundPlaceholder checks that italic runs on both
+// sides of a placeholder are found while the placeholder itself, despite its
+// underscore, comes out neither italic nor bold.
+func TestParseSpans_ItalicAroundPlaceholder(t *testing.T) {
+	got := parseSpans("_before_ {{x.y_z}} _after_")
+	var italicBefore, plainPlaceholder, italicAfter bool
+	for _, span := range got {
+		// The three conditions exclude one another, so at most one of
+		// them fires for any given span.
+		switch {
+		case span.Italic && span.Text == "before":
+			italicBefore = true
+		case span.Italic && span.Text == "after":
+			italicAfter = true
+		case !span.Italic && !span.Bold && strings.Contains(span.Text, "{{x.y_z}}"):
+			plainPlaceholder = true
+		}
+	}
+	if !(italicBefore && plainPlaceholder && italicAfter) {
+		t.Errorf("expected italic/placeholder/italic structure; got %+v", got)
+	}
+}
+
+// TestParseSpans_Plain checks that undelimited text is one unstyled span.
+func TestParseSpans_Plain(t *testing.T) {
+	if got := parseSpans("hello world"); !(len(got) == 1 && !got[0].Bold && !got[0].Italic && got[0].Text == "hello world") {
+		t.Errorf("expected single plain span; got %+v", got)
+	}
+}
+
+func TestParseSpans_UnderscoreItalic(t *testing.T) {
+ spans := parseSpans("_emph_")
+ var italicHits int
+ for _, s := range spans {
+ if s.Italic && s.Text == "emph" {
+ italicHits++
+ }
+ }
+ if italicHits != 1 {
+ t.Errorf("expected one italic 'emph' span; got %+v", spans)
+ }
+}
+
+func TestParseSpans_UnderscoreBold(t *testing.T) {
+ spans := parseSpans("__strong__")
+ var boldHits int
+ for _, s := range spans {
+ if s.Bold && s.Text == "strong" {
+ boldHits++
+ }
+ }
+ if boldHits != 1 {
+ t.Errorf("expected one bold 'strong' span; got %+v", spans)
+ }
+}
+
+// TestDetectBlockMarker covers every marker kind plus the near-misses:
+// indentation past three spaces and markers lacking their trailing space.
+func TestDetectBlockMarker(t *testing.T) {
+	cases := []struct {
+		in, kind, want string
+		ok             bool
+	}{
+		{"# A", "heading_1", "A", true},
+		{"## B", "heading_2", "B", true},
+		{"### C", "heading_3", "C", true},
+		{"   # indented", "heading_1", "indented", true}, // exactly 3 leading spaces: still a heading
+		{"    # too-deep", "", "", false},                // 4 leading spaces: not a heading
+		{"- bullet", "list_bullet", "bullet", true},
+		{"* star", "list_bullet", "star", true},
+		{"1. one", "list_numbered", "one", true},
+		{"42. forty-two", "list_numbered", "forty-two", true},
+		{"1) paren", "list_numbered", "paren", true},
+		{"1.no-space", "", "", false}, // ordinal needs trailing space
+		{"> quote", "blockquote", "quote", true},
+		{"plain", "", "", false},
+		{"#nospace", "", "", false}, // heading needs space after hash
+	}
+	for _, tc := range cases {
+		t.Run(tc.in, func(t *testing.T) {
+			kind, payload, ok := detectBlockMarker(tc.in)
+			if ok != tc.ok || kind != tc.kind || payload != tc.want {
+				t.Errorf("detectBlockMarker(%q) = (%q,%q,%v); want (%q,%q,%v)", tc.in, kind, payload, ok, tc.kind, tc.want, tc.ok)
+			}
+		})
+	}
+}
+
+// TestImport_Document spot-checks the neutral Document the importer
+// produces — block kinds, the link-span shape, and placeholder pass-through.
+func TestImport_Document(t *testing.T) {
+ doc := Import("# Title\n\nBody **bold** and [label](http://x).\n\n- item")
+ if len(doc.Blocks) != 3 {
+ t.Fatalf("blocks = %d; want 3 (%+v)", len(doc.Blocks), doc.Blocks)
+ }
+ if doc.Blocks[0].Kind != "heading_1" {
+ t.Errorf("block0 kind = %q; want heading_1", doc.Blocks[0].Kind)
+ }
+ if doc.Blocks[2].Kind != "list_bullet" {
+ t.Errorf("block2 kind = %q; want list_bullet", doc.Blocks[2].Kind)
+ }
+ // The body paragraph carries a link span with Link set + children.
+ var sawLink bool
+ for _, s := range doc.Blocks[1].Spans {
+ if s.Link == "http://x" && len(s.Children) > 0 {
+ sawLink = true
+ }
+ }
+ if !sawLink {
+ t.Errorf("body block missing link span; got %+v", doc.Blocks[1].Spans)
+ }
+}
+
+// TestImport_EmptyYieldsOneEmptyParagraph pins the result for empty source.
+func TestImport_EmptyYieldsOneEmptyParagraph(t *testing.T) {
+	if blocks := Import("").Blocks; !(len(blocks) == 1 && blocks[0].Kind == "paragraph" && len(blocks[0].Spans) == 0) {
+		t.Errorf("empty import = %+v; want one empty paragraph block", blocks)
+	}
+}
diff --git a/pkg/docforge/model.go b/pkg/docforge/model.go
new file mode 100644
index 0000000..edbffd3
--- /dev/null
+++ b/pkg/docforge/model.go
@@ -0,0 +1,58 @@
+package docforge
+
+// The neutral document model — the format-independent representation an
+// importer produces and an exporter consumes (PRD §3.2). A Markdown
+// importer parses source into a Document; the .docx exporter renders a
+// Document into OOXML; a future PDF/HTML exporter renders the same
+// Document differently. The model carries editable content only —
+// placeholders ({{key}}) ride through as literal span text and are
+// substituted later by the format exporter's merge pass, exactly as in
+// the pre-model pipeline.
+//
+// Slice 8 (t-paliad-349) lands this model with two real consumers: the
+// Markdown importer (pkg/docforge/markdown) and the .docx renderer
+// (pkg/docforge/docx), which the shipped submission walker now routes
+// through — so there is one parser, not two.
+
+// BlockKind is the logical kind of a block. Its string values are the stylemap
+// keys an exporter looks up, so the docx exporter maps Kind → Word style.
+type BlockKind string
+
+// The block kinds; each value is its own stylemap key.
+const (
+ KindParagraph BlockKind = "paragraph"
+ KindHeading1 BlockKind = "heading_1"
+ KindHeading2 BlockKind = "heading_2"
+ KindHeading3 BlockKind = "heading_3"
+ KindListBullet BlockKind = "list_bullet"
+ KindListNumbered BlockKind = "list_numbered"
+ KindBlockquote BlockKind = "blockquote"
+)
+
+// Document is the whole editable content: an ordered sequence of blocks.
+type Document struct {
+ Blocks []Block
+}
+
+// Block is one paragraph-level unit of the given Kind. Spans holds its inline
+// content; a Block with no Spans is an intentional empty paragraph (spacing).
+type Block struct {
+ Kind BlockKind
+ Spans []InlineSpan
+}
+
+// InlineSpan is one run of inline content. A span takes one of two forms:
+//   - literal text with optional bold/italic (Link == "", Children nil), or
+//   - a hyperlink (Link != "") whose label is carried by the Children spans.
+//
+// A link is modelled as a span with Children, not as a per-span Link flag,
+// so that link boundaries survive: two adjacent links to the same URL remain
+// two distinct hyperlink spans, which lets the exporter emit them
+// byte-identically to the pre-model walker.
+type InlineSpan struct {
+ Text string // literal text of the run
+ Bold bool // run is bold
+ Italic bool // run is italic
+ Link string // non-empty → this span is a hyperlink to Link
+ Children []InlineSpan // hyperlink label content (only when Link != "")
+}