diff --git a/pkg/docforge/docx/compose.go b/pkg/docforge/docx/compose.go index a343d3a..4beb704 100644 --- a/pkg/docforge/docx/compose.go +++ b/pkg/docforge/docx/compose.go @@ -240,7 +240,7 @@ var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`) // the body — from the start of the opening anchor's element // through the end of the closing anchor's . type anchorPair struct { - key string + key string openStart int // start of for the opening anchor closeEnd int // index just past for the closing anchor } @@ -251,10 +251,10 @@ type anchorPair struct { // span is non-overlapping. func findAllAnchorPairs(body string) []anchorPair { type marker struct { - key string + key string paraStart int paraEnd int - isOpen bool + isOpen bool } var markers []marker diff --git a/pkg/docforge/docx/exporter.go b/pkg/docforge/docx/exporter.go new file mode 100644 index 0000000..67c3c3c --- /dev/null +++ b/pkg/docforge/docx/exporter.go @@ -0,0 +1,39 @@ +package docx + +import "mgit.msbls.de/m/paliad/pkg/docforge" + +// Exporter is the .docx implementation of docforge.Exporter — it renders a +// neutral Document to OOXML body markup (t-paliad-349 slice 8). The +// stylemap (block kind → Word paragraph style) and the optional hyperlink +// allocator are baked in at construction, so RenderBody matches the +// interface's format-neutral signature. +// +// This is the seam a future PDF/HTML exporter slots into: implement +// docforge.Exporter, no engine change. The submission composer can render +// section content through this exporter instead of calling +// RenderDocumentToOOXML directly once a second format exists. +type Exporter struct { + Stylemap map[string]string + Links HyperlinkAllocator +} + +// compile-time conformance. +var _ docforge.Exporter = Exporter{} + +// NewExporter builds a .docx exporter with the given stylemap + allocator. 
+func NewExporter(stylemap map[string]string, links HyperlinkAllocator) Exporter { + return Exporter{Stylemap: stylemap, Links: links} +} + +// Format returns the format id. +func (Exporter) Format() string { return "docx" } + +// MIMEType returns the .docx container MIME type. +func (Exporter) MIMEType() string { + return "application/vnd.openxmlformats-officedocument.wordprocessingml.document" +} + +// RenderBody renders the Document to OOXML paragraph markup. +func (e Exporter) RenderBody(doc docforge.Document) ([]byte, error) { + return []byte(RenderDocumentToOOXML(doc, e.Stylemap, e.Links)), nil +} diff --git a/pkg/docforge/docx/exporter_test.go b/pkg/docforge/docx/exporter_test.go new file mode 100644 index 0000000..ad5a48e --- /dev/null +++ b/pkg/docforge/docx/exporter_test.go @@ -0,0 +1,34 @@ +package docx + +import ( + "strings" + "testing" + + "mgit.msbls.de/m/paliad/pkg/docforge" + "mgit.msbls.de/m/paliad/pkg/docforge/markdown" +) + +func TestExporter_RenderBodyMatchesWalker(t *testing.T) { + exp := NewExporter(map[string]string{"paragraph": "Body"}, nil) + if exp.Format() != "docx" { + t.Errorf("Format = %q; want docx", exp.Format()) + } + if !strings.Contains(exp.MIMEType(), "wordprocessingml.document") { + t.Errorf("MIMEType = %q", exp.MIMEType()) + } + + md := "Hello **world**\n\n- item" + // The Exporter must produce exactly what the walker entry point does + // for the same input (both go markdown.Import → RenderDocumentToOOXML). + body, err := exp.RenderBody(markdown.Import(md)) + if err != nil { + t.Fatalf("RenderBody: %v", err) + } + want := RenderMarkdownToOOXMLWithStyles(md, map[string]string{"paragraph": "Body"}, nil) + if string(body) != want { + t.Errorf("RenderBody mismatch:\n got %q\nwant %q", body, want) + } +} + +// satisfies the interface (compile-time check mirrored at runtime). 
+var _ docforge.Exporter = Exporter{} diff --git a/pkg/docforge/docx/markdown.go b/pkg/docforge/docx/markdown.go index 812b01a..d2585ac 100644 --- a/pkg/docforge/docx/markdown.go +++ b/pkg/docforge/docx/markdown.go @@ -1,249 +1,78 @@ package docx -// Markdown → OOXML walker for Composer section content (t-paliad-313 -// Slice B, design doc §9.2). +// Markdown → OOXML rendering for Composer section content (t-paliad-313 +// Slice B/D; restructured in t-paliad-349 slice 8). // -// Scope per the head's Slice B brief: paragraphs + inline bold/italic -// only. Headings, lists, blockquote, links land in Slice D's rich-prose -// pass. This walker is intentionally minimal — every Markdown construct -// it doesn't recognise is rendered as a plain paragraph so the lawyer's -// prose round-trips losslessly even when they hit Markdown the walker -// doesn't yet understand. +// Parsing now lives in pkg/docforge/markdown, which produces the neutral +// docforge.Document. This file renders that Document into OOXML paragraph +// elements () ready to splice into a .docx body. There is one +// Markdown parser for docforge; this is the .docx exporter for its model. // -// The output uses the base's stylemap.paragraph entry for the -// on each paragraph so the styling matches the base's -// typography (HLpat-Body-B0 on the HLC base, Normal on the neutral -// base, etc.). -// -// Placeholders ({{path.dot.notation}}) are preserved verbatim — they -// pass through the walker untouched and get substituted by the v1 -// SubmissionRenderer's placeholder pass after the composer assembly. -// -// Grammar supported: -// -// - Blank line → paragraph break -// - `**bold**` → -// - `*italic*` or `_italic_` → -// - Otherwise → plain text run +// Output uses the base's stylemap entry for each block kind on the +// , so styling matches the base's typography (HLpat-Body-B0 on +// the HLC base, Normal on the neutral base, etc.). 
Placeholders ({{key}}) +// ride through as literal run text and are substituted by the placeholder +// pass after assembly. import ( - "fmt" + "strconv" "strings" + + "mgit.msbls.de/m/paliad/pkg/docforge" + "mgit.msbls.de/m/paliad/pkg/docforge/markdown" ) -// HyperlinkAllocator hands the walker a `rId` for each external URL -// it encounters in `[label](url)` inline links. The composer's -// post-pass uses these allocations to mutate -// `word/_rels/document.xml.rels` so the emitted `` elements resolve correctly. Pass nil to drop links to -// plain text (the label survives, the URL doesn't render). -// -// t-paliad-316 Slice D. +// HyperlinkAllocator hands the renderer a `rId` for each external URL it +// encounters in `[label](url)` inline links. The composer's post-pass uses +// these allocations to mutate `word/_rels/document.xml.rels` so the emitted +// `` elements resolve. Pass nil to drop links to +// plain text (the label survives, the URL doesn't render). t-paliad-316. type HyperlinkAllocator func(url string) string -// RenderMarkdownToOOXML renders the given Markdown source into OOXML -// paragraph elements (``), suitable for splicing into a -// .docx body. Each paragraph carries `` -// when paragraphStyle is non-empty. -// -// Slice B shipped paragraphs + bold/italic. Slice D extends to -// headings (h1/h2/h3), bullet/numbered lists, blockquote, and inline -// hyperlinks via the optional HyperlinkAllocator. -// -// stylemap supplies the paragraph-style names for each kind: -// stylemap["paragraph"] — default body -// stylemap["heading_1/2/3"] — heading levels -// stylemap["list_bullet"] — bullet list paragraph style -// stylemap["list_numbered"] — numbered list paragraph style -// stylemap["blockquote"] — blockquote -// Missing entries fall back to the "paragraph" style. -// -// Empty input renders one empty paragraph so the splice site is -// well-formed even when the lawyer hasn't typed anything in this -// section. 
+// RenderMarkdownToOOXML renders Markdown into OOXML paragraphs with a +// single paragraph style. Slice B back-compat wrapper. func RenderMarkdownToOOXML(md, paragraphStyle string) string { return RenderMarkdownToOOXMLWithStyles(md, map[string]string{"paragraph": paragraphStyle}, nil) } -// RenderMarkdownToOOXMLWithStyles is the full Slice-D-aware entry -// point. Slice B's RenderMarkdownToOOXML is a wrapper for back-compat. +// RenderMarkdownToOOXMLWithStyles parses Markdown into a docforge.Document +// and renders it to OOXML. stylemap maps each block kind (paragraph, +// heading_1/2/3, list_bullet, list_numbered, blockquote) to a Word +// paragraph style; missing entries fall back to the "paragraph" style. func RenderMarkdownToOOXMLWithStyles(md string, stylemap map[string]string, links HyperlinkAllocator) string { + return RenderDocumentToOOXML(markdown.Import(md), stylemap, links) +} + +// RenderDocumentToOOXML renders a neutral Document to OOXML paragraphs — +// the .docx side of the docforge importer→model→exporter pipeline. Any +// Document (Markdown today, a foreign-doc importer later) renders the same +// way. +func RenderDocumentToOOXML(doc docforge.Document, stylemap map[string]string, links HyperlinkAllocator) string { defaultStyle := stylemap["paragraph"] - if md == "" { - return emptyParagraph(defaultStyle) - } - blocks := splitMarkdownBlocks(md) - if len(blocks) == 0 { - return emptyParagraph(defaultStyle) - } // Numbered-list counter resets on every non-numbered block so - // "1. A\n2. B\n\n1. C" renders as 1./2./1. (the lawyer's input - // determined the ordinal, the walker just renders). - numberedCounter := 0 + // "1. A\n2. B\n\n1. C" renders 1./2./1. — the input determined the + // ordinal, the renderer just emits it. 
+ numbered := 0 var b strings.Builder - for _, blk := range blocks { - style := stylemap[blk.styleKey] + for _, blk := range doc.Blocks { + style := stylemap[string(blk.Kind)] if style == "" { style = defaultStyle } - if blk.styleKey == "list_numbered" { - numberedCounter++ + if blk.Kind == docforge.KindListNumbered { + numbered++ } else { - numberedCounter = 0 + numbered = 0 } - b.WriteString(renderBlockParagraph(blk, style, links, numberedCounter)) + b.WriteString(renderBlock(blk, style, links, numbered)) } return b.String() } -// mdBlock is one rendered paragraph: a kind (paragraph / heading_* -// / list_bullet / list_numbered / blockquote) and the inline content -// text. List markers, heading hashes, blockquote `> ` etc. are -// stripped from the text before storage. -type mdBlock struct { - styleKey string // "paragraph" | "heading_1" | "heading_2" | "heading_3" | "list_bullet" | "list_numbered" | "blockquote" - text string -} - -// splitMarkdownBlocks parses the source into a sequence of blocks, -// detecting heading / list / blockquote prefixes line-by-line. Blank -// lines split paragraph runs (same semantics as splitMarkdownParagraphs) -// but each line is also tagged with its block kind. -// -// Lines that look like block markers don't merge with their neighbours -// even across blank lines — every list / heading / blockquote line is -// its own block in the output. A run of unmarked lines collapses into -// one "paragraph" block (so soft line breaks inside a paragraph still -// concatenate). -// -// CRLF normalised to LF before parsing. 
-func splitMarkdownBlocks(md string) []mdBlock { - normalised := strings.ReplaceAll(md, "\r\n", "\n") - lines := strings.Split(normalised, "\n") - var blocks []mdBlock - var pendingPara []string - blankRun := 0 - - flushPara := func() { - if len(pendingPara) > 0 { - blocks = append(blocks, mdBlock{styleKey: "paragraph", text: strings.Join(pendingPara, "\n")}) - pendingPara = nil - } - } - - for _, raw := range lines { - line := raw - if strings.TrimSpace(line) == "" { - if len(pendingPara) > 0 { - flushPara() - blankRun = 1 - continue - } - blankRun++ - continue - } - // Detect heading / list / blockquote markers BEFORE we accumulate - // into the paragraph buffer. - kind, payload, ok := detectBlockMarker(line) - if ok { - flushPara() - // Emit spacing paragraphs equivalent to (blankRun - 1) extra. - for i := 1; i < blankRun; i++ { - blocks = append(blocks, mdBlock{styleKey: "paragraph", text: ""}) - } - blankRun = 0 - blocks = append(blocks, mdBlock{styleKey: kind, text: payload}) - continue - } - // Plain paragraph line. - if len(pendingPara) == 0 { - // Starting a new paragraph after a blank run — emit - // (blankRun-1) extra empty paragraphs for vertical spacing. - for i := 1; i < blankRun; i++ { - blocks = append(blocks, mdBlock{styleKey: "paragraph", text: ""}) - } - } - blankRun = 0 - pendingPara = append(pendingPara, line) - } - flushPara() - return blocks -} - -// detectBlockMarker classifies a single line. Returns (styleKey, -// payload-with-marker-stripped, true) for recognised markers; false -// for plain paragraph lines. -// -// Recognised markers (Slice D): -// # Heading → heading_1 -// ## Heading → heading_2 -// ### Heading → heading_3 -// - item / * item → list_bullet -// 1. item / 2. item ... → list_numbered (any positive integer) -// > quote → blockquote -// -// Leading whitespace inside the line is tolerated up to 3 spaces (per -// CommonMark) so the lawyer's contentEditable indentation doesn't -// hide the marker. 
-func detectBlockMarker(line string) (string, string, bool) { - trimmed := strings.TrimLeft(line, " ") - // Cap to 3 spaces of leading indent — beyond that, treat as a - // regular paragraph line (matches CommonMark). - if len(line)-len(trimmed) > 3 { - return "", "", false - } - if strings.HasPrefix(trimmed, "### ") { - return "heading_3", strings.TrimSpace(trimmed[4:]), true - } - if strings.HasPrefix(trimmed, "## ") { - return "heading_2", strings.TrimSpace(trimmed[3:]), true - } - if strings.HasPrefix(trimmed, "# ") { - return "heading_1", strings.TrimSpace(trimmed[2:]), true - } - if strings.HasPrefix(trimmed, "> ") { - return "blockquote", strings.TrimSpace(trimmed[2:]), true - } - if strings.HasPrefix(trimmed, "- ") || strings.HasPrefix(trimmed, "* ") { - return "list_bullet", strings.TrimSpace(trimmed[2:]), true - } - // Numbered: "N. " where N is one or more digits. - if i := indexOfNumberedMarker(trimmed); i > 0 { - return "list_numbered", strings.TrimSpace(trimmed[i:]), true - } - return "", "", false -} - -// indexOfNumberedMarker checks for "N. " or "N) " at the start of the -// trimmed line; returns the byte index just past the marker, or -1 if -// no marker present. -func indexOfNumberedMarker(s string) int { - i := 0 - for i < len(s) && s[i] >= '0' && s[i] <= '9' { - i++ - } - if i == 0 { - return -1 - } - if i >= len(s) { - return -1 - } - if s[i] != '.' && s[i] != ')' { - return -1 - } - if i+1 >= len(s) || s[i+1] != ' ' { - return -1 - } - return i + 2 -} - -// renderBlockParagraph emits one `` for a block. List blocks -// keep the same paragraph style as a default paragraph (the Slice D -// design's contract — list styles come from the base's stylemap and -// Word's numbering.xml is honoured by adding a leading bullet/number -// prefix in the rendered text). This keeps the composer free of -// numbering.xml mutations. 
-func renderBlockParagraph(blk mdBlock, paragraphStyle string, links HyperlinkAllocator, numberedOrdinal int) string { +// renderBlock emits one for a block. List blocks get a visible +// "• " / "N. " prefix run (the base stylemap handles indentation if it +// defines a list style; the prefix at least surfaces the structure). +func renderBlock(blk docforge.Block, paragraphStyle string, links HyperlinkAllocator, numberedOrdinal int) string { var b strings.Builder b.WriteString(``) if paragraphStyle != "" { @@ -251,110 +80,61 @@ func renderBlockParagraph(blk mdBlock, paragraphStyle string, links HyperlinkAll b.WriteString(xmlAttrEscape(paragraphStyle)) b.WriteString(`"/>`) } - if blk.text == "" { - b.WriteString(``) - b.WriteString(``) + // An empty block is an intentional empty paragraph: one empty run. + if len(blk.Spans) == 0 { + b.WriteString(``) return b.String() } - text := blk.text - // List blocks emit a visible "• " / "N. " prefix run. The - // stylemap entry handles paragraph indentation if the base - // defines a list paragraph style; otherwise the prefix at least - // surfaces the structure in plain Word. Lawyers who want Word's - // auto-numbering reapply a list style post-export. - switch blk.styleKey { - case "list_bullet": + switch blk.Kind { + case docforge.KindListBullet: b.WriteString(``) - case "list_numbered": + case docforge.KindListNumbered: ordinal := numberedOrdinal if ordinal <= 0 { ordinal = 1 } b.WriteString(``) - b.WriteString(fmt.Sprintf("%d. ", ordinal)) - b.WriteString(``) + b.WriteString(strconv.Itoa(ordinal)) + b.WriteString(`. `) } - for _, run := range parseInlineRuns(text, links) { - b.WriteString(run) + for _, span := range blk.Spans { + b.WriteString(renderInlineSpan(span, links)) } b.WriteString(``) return b.String() } -// parseInlineRuns extracts inline spans + hyperlink runs and serialises -// each to OOXML. Hyperlinks become `…runs…` -// where RID comes from the HyperlinkAllocator. 
-func parseInlineRuns(text string, links HyperlinkAllocator) []string { - // Phase 1: find all hyperlink spans `[label](url)` and split the - // text around them. - type segment struct { - text string - isLink bool - url string - } - var segs []segment - rest := text - for { - idx := strings.Index(rest, "[") - if idx < 0 { - if rest != "" { - segs = append(segs, segment{text: rest}) - } - break - } - // Find matching closing bracket, then a "(" right after. - closeBracket := strings.Index(rest[idx:], "](") - if closeBracket < 0 { - segs = append(segs, segment{text: rest}) - break - } - closeParen := strings.Index(rest[idx+closeBracket:], ")") - if closeParen < 0 { - segs = append(segs, segment{text: rest}) - break - } - // idx = start of "[" - // idx+closeBracket = position of "]" - // idx+closeBracket+1 = position of "(" - // idx+closeBracket+closeParen = position of ")" - label := rest[idx+1 : idx+closeBracket] - url := rest[idx+closeBracket+2 : idx+closeBracket+closeParen] - if idx > 0 { - segs = append(segs, segment{text: rest[:idx]}) - } - segs = append(segs, segment{text: label, isLink: true, url: url}) - rest = rest[idx+closeBracket+closeParen+1:] - } - - var runs []string - for _, seg := range segs { - if seg.isLink && links != nil { - rid := links(seg.url) - if rid != "" { +// renderInlineSpan emits one span. A hyperlink span (Link != "") becomes a +// wrapping its children when an allocator yields a +// rId; otherwise the label children render as plain runs (URL dropped). 
+func renderInlineSpan(span docforge.InlineSpan, links HyperlinkAllocator) string { + if span.Link != "" { + if links != nil { + if rid := links(span.Link); rid != "" { var hb strings.Builder hb.WriteString(``) - for _, span := range parseInlineSpans(seg.text) { - hb.WriteString(renderRunWithLinkStyle(span)) + for _, child := range span.Children { + hb.WriteString(renderRunWithLinkStyle(child)) } hb.WriteString(``) - runs = append(runs, hb.String()) - continue + return hb.String() } } - for _, span := range parseInlineSpans(seg.text) { - runs = append(runs, renderRun(span)) + // No allocator / no rId — render the label as plain runs. + var fb strings.Builder + for _, child := range span.Children { + fb.WriteString(renderRun(child)) } + return fb.String() } - return runs + return renderRun(span) } -// renderRunWithLinkStyle emits a hyperlink child run. Same B/I support -// as renderRun, but additionally tags the run with the "Hyperlink" -// character style (Word's built-in) so the link renders in the -// document's hyperlink colour + underline. -func renderRunWithLinkStyle(span inlineSpan) string { +// renderRunWithLinkStyle emits a hyperlink child run with Word's built-in +// "Hyperlink" character style (colour + underline), plus B/I. +func renderRunWithLinkStyle(span docforge.InlineSpan) string { var b strings.Builder b.WriteString(``) if span.Bold { @@ -369,85 +149,8 @@ func renderRunWithLinkStyle(span inlineSpan) string { return b.String() } -// inlineSpan is one piece of inline content: a text payload plus -// formatting flags. Bold and italic are independent — `***both***` -// produces one span with both flags set. -type inlineSpan struct { - Text string - Bold bool - Italic bool -} - -// parseInlineSpans tokenises Markdown inline formatting into runs of -// (text, bold, italic). 
The grammar is intentionally narrow: -// -// - `**…**` → bold -// - `__…__` → bold (Markdown alternate) -// - `*…*` → italic -// - `_…_` → italic (Markdown alternate) -// - Anything else flows through as plain text. -// -// Unbalanced delimiters fall through as literal characters — the -// walker never errors on malformed Markdown. Nested formatting (e.g. -// `**bold *bold-italic* bold**`) toggles flags as it walks. -func parseInlineSpans(text string) []inlineSpan { - var out []inlineSpan - var cur strings.Builder - bold := false - italic := false - flush := func() { - if cur.Len() == 0 { - return - } - out = append(out, inlineSpan{Text: cur.String(), Bold: bold, Italic: italic}) - cur.Reset() - } - i := 0 - n := len(text) - for i < n { - // Preserve {{...}} placeholders verbatim. Underscores and - // other Markdown-significant chars inside a placeholder key - // (e.g. {{project.case_number}}) must not be interpreted as - // bold/italic delimiters — otherwise the key gets stripped of - // its underscores and the v1 placeholder pass looks up the - // wrong key, surfacing [KEIN WERT: project.casenumber] in the - // preview. - if i+1 < n && text[i] == '{' && text[i+1] == '{' { - rel := strings.Index(text[i+2:], "}}") - if rel >= 0 { - end := i + 2 + rel + 2 - cur.WriteString(text[i:end]) - i = end - continue - } - // Unmatched {{ — fall through to plain character handling. - } - // Bold delimiters first (longer match wins over italic). - if i+1 < n && (text[i:i+2] == "**" || text[i:i+2] == "__") { - flush() - bold = !bold - i += 2 - continue - } - if text[i] == '*' || text[i] == '_' { - flush() - italic = !italic - i++ - continue - } - cur.WriteByte(text[i]) - i++ - } - flush() - if len(out) == 0 { - out = append(out, inlineSpan{Text: ""}) - } - return out -} - -// renderRun emits one `` element for an inline span. Empty text -// spans render as empty runs (Word accepts them; they're harmless). 
-func renderRun(span inlineSpan) string { +// renderRun emits one for a plain (text/bold/italic) span. +func renderRun(span docforge.InlineSpan) string { var b strings.Builder b.WriteString(``) if span.Bold || span.Italic { @@ -466,34 +169,16 @@ func renderRun(span inlineSpan) string { return b.String() } -// emptyParagraph returns one empty `` with the given style. Used -// when a section's content_md is empty so the splice site stays -// well-formed. -func emptyParagraph(paragraphStyle string) string { - var b strings.Builder - b.WriteString(``) - if paragraphStyle != "" { - b.WriteString(``) - } - b.WriteString(``) - return b.String() -} - -// xmlTextEscape escapes the five XML-significant characters for safe -// insertion into content. & first to avoid double-encoding. +// xmlTextEscape escapes the XML-significant characters for content. +// Quotes/apostrophes are legal in element text — not escaped. func xmlTextEscape(s string) string { s = strings.ReplaceAll(s, "&", "&") s = strings.ReplaceAll(s, "<", "<") s = strings.ReplaceAll(s, ">", ">") - // Quotes and apostrophes are legal inside element text content; - // no need to escape them here. return s } -// xmlAttrEscape escapes for safe insertion into an attribute value -// (e.g. ``). +// xmlAttrEscape escapes for an attribute value (e.g. ). func xmlAttrEscape(s string) string { s = strings.ReplaceAll(s, "&", "&") s = strings.ReplaceAll(s, "<", "<") diff --git a/pkg/docforge/docx/markdown_test.go b/pkg/docforge/docx/markdown_test.go index 3212c23..b7c6af2 100644 --- a/pkg/docforge/docx/markdown_test.go +++ b/pkg/docforge/docx/markdown_test.go @@ -112,46 +112,6 @@ func TestRenderMarkdownToOOXML_PlaceholderUnderscoresPreserved(t *testing.T) { } } -func TestParseInlineSpans_PlaceholderWithUnderscoresIsLiteral(t *testing.T) { - // Direct guard on the inline scanner. {{project.case_number}} must - // emit as a single non-italic span containing the full placeholder. 
- spans := parseInlineSpans("{{project.case_number}}") - if len(spans) != 1 { - t.Fatalf("expected 1 span; got %d (%+v)", len(spans), spans) - } - if spans[0].Italic || spans[0].Bold { - t.Errorf("placeholder must not be italic/bold; got %+v", spans[0]) - } - if spans[0].Text != "{{project.case_number}}" { - t.Errorf("placeholder text corrupted: got %q", spans[0].Text) - } -} - -func TestParseInlineSpans_ItalicAroundPlaceholder(t *testing.T) { - // Italic delimiters outside a placeholder still work; the placeholder - // itself stays literal even when it sits between italics. - spans := parseInlineSpans("_before_ {{x.y_z}} _after_") - var saw struct { - italicBefore bool - placeholder bool - italicAfter bool - } - for _, s := range spans { - if s.Italic && s.Text == "before" { - saw.italicBefore = true - } - if !s.Italic && !s.Bold && strings.Contains(s.Text, "{{x.y_z}}") { - saw.placeholder = true - } - if s.Italic && s.Text == "after" { - saw.italicAfter = true - } - } - if !saw.italicBefore || !saw.placeholder || !saw.italicAfter { - t.Errorf("expected italic/placeholder/italic structure; got %+v", spans) - } -} - // extractPlaceholders pulls every {{...}} occurrence out of a Markdown // source. Tiny helper, only used by the regression test above. 
func extractPlaceholders(s string) []string { @@ -196,39 +156,6 @@ func TestRenderMarkdownToOOXML_CRLFNormalisation(t *testing.T) { } } -func TestParseInlineSpans_Plain(t *testing.T) { - spans := parseInlineSpans("hello world") - if len(spans) != 1 || spans[0].Bold || spans[0].Italic || spans[0].Text != "hello world" { - t.Errorf("expected single plain span; got %+v", spans) - } -} - -func TestParseInlineSpans_UnderscoreItalic(t *testing.T) { - spans := parseInlineSpans("_emph_") - var italicHits int - for _, s := range spans { - if s.Italic && s.Text == "emph" { - italicHits++ - } - } - if italicHits != 1 { - t.Errorf("expected one italic 'emph' span; got %+v", spans) - } -} - -func TestParseInlineSpans_UnderscoreBold(t *testing.T) { - spans := parseInlineSpans("__strong__") - var boldHits int - for _, s := range spans { - if s.Bold && s.Text == "strong" { - boldHits++ - } - } - if boldHits != 1 { - t.Errorf("expected one bold 'strong' span; got %+v", spans) - } -} - // ───────────────────────────────────────────────────────────────────── // Slice D — rich-prose constructs // ───────────────────────────────────────────────────────────────────── @@ -349,35 +276,3 @@ func TestRenderMarkdownToOOXML_HyperlinkNilAllocatorFallsBackToPlain(t *testing. t.Errorf("hyperlink emitted without allocator: %q", out) } } - -func TestDetectBlockMarker(t *testing.T) { - cases := []struct { - in string - kind string - want string - ok bool - }{ - {"# A", "heading_1", "A", true}, - {"## B", "heading_2", "B", true}, - {"### C", "heading_3", "C", true}, - {" # indented", "heading_1", "indented", true}, // up to 3 spaces tolerated - {" # too-deep", "", "", false}, // 4 spaces → not a heading - {"- bullet", "list_bullet", "bullet", true}, - {"* star", "list_bullet", "star", true}, - {"1. one", "list_numbered", "one", true}, - {"42. 
forty-two", "list_numbered", "forty-two", true}, - {"1) paren", "list_numbered", "paren", true}, - {"1.no-space", "", "", false}, // ordinal needs trailing space - {"> quote", "blockquote", "quote", true}, - {"plain", "", "", false}, - {"#nospace", "", "", false}, // heading needs space after hash - } - for _, tc := range cases { - t.Run(tc.in, func(t *testing.T) { - kind, payload, ok := detectBlockMarker(tc.in) - if ok != tc.ok || kind != tc.kind || payload != tc.want { - t.Errorf("detectBlockMarker(%q) = (%q,%q,%v); want (%q,%q,%v)", tc.in, kind, payload, ok, tc.kind, tc.want, tc.ok) - } - }) - } -} diff --git a/pkg/docforge/exporter.go b/pkg/docforge/exporter.go new file mode 100644 index 0000000..fd09030 --- /dev/null +++ b/pkg/docforge/exporter.go @@ -0,0 +1,22 @@ +package docforge + +// Exporter renders a neutral Document into a target format's body markup. +// docforge owns the interface; each format adapter implements it (the +// .docx adapter in pkg/docforge/docx today; .pdf/.html/.md are future +// siblings — PRD §4 B4: interface now, docx-only impl). Format-specific +// configuration (a stylemap, a hyperlink allocator for .docx) is baked into +// the concrete exporter at construction, so the interface stays +// format-neutral. +// +// "Body markup" is the renderable content fragment, not a complete file — +// for .docx it is the OOXML run the composer splices into a carrier. +// Container concerns (MIME type, packaging) are described by Format / +// MIMEType and handled by the assembling layer. +type Exporter interface { + // Format is the short format id, e.g. "docx". + Format() string + // MIMEType is the container MIME type the assembled document carries. + MIMEType() string + // RenderBody renders the document to the format's body markup. 
+ RenderBody(doc Document) ([]byte, error) +} diff --git a/pkg/docforge/markdown/importer.go b/pkg/docforge/markdown/importer.go new file mode 100644 index 0000000..a97781d --- /dev/null +++ b/pkg/docforge/markdown/importer.go @@ -0,0 +1,230 @@ +// Package markdown imports Markdown source into the neutral +// docforge.Document model (PRD §3.2 / §4 P4 — Markdown is the primary +// input format). It is the single Markdown parser for docforge: the .docx +// renderer consumes the Document this produces, so block-splitting and +// inline tokenisation live here, not in the format adapter. +// +// Grammar (intentionally narrow — unrecognised syntax flows through as a +// plain paragraph, so lawyer prose never errors): +// +// blank line → paragraph break +// # / ## / ### Heading → heading_1 / 2 / 3 +// - item / * item → bullet list item +// N. item / N) item → numbered list item +// > quote → blockquote +// **x** / __x__ → bold +// *x* / _x_ → italic +// [label](url) → hyperlink +// {{key}} → preserved verbatim (substituted downstream) +package markdown + +import ( + "strings" + + "mgit.msbls.de/m/paliad/pkg/docforge" +) + +// Import parses Markdown into a Document. Empty (or all-blank) input yields +// a single empty paragraph so a splice site stays well-formed. +func Import(md string) docforge.Document { + blocks := splitBlocks(md) + if len(blocks) == 0 { + return docforge.Document{Blocks: []docforge.Block{{Kind: docforge.KindParagraph}}} + } + out := make([]docforge.Block, 0, len(blocks)) + for _, blk := range blocks { + b := docforge.Block{Kind: docforge.BlockKind(blk.kind)} + // An empty-text block is an intentional empty paragraph: leave + // Spans nil so the exporter emits a single empty run. + if blk.text != "" { + b.Spans = parseInline(blk.text) + } + out = append(out, b) + } + return docforge.Document{Blocks: out} +} + +// rawBlock is the intermediate (kind, stripped-text) form before inline +// parsing. kind values match docforge.BlockKind string values. 
+type rawBlock struct { + kind string + text string +} + +// splitBlocks parses the source into a sequence of (kind, text) blocks, +// detecting heading / list / blockquote prefixes line-by-line. A run of +// unmarked lines collapses into one paragraph block (soft line breaks +// inside a paragraph concatenate); each marked line is its own block. +// Blank-run spacing emits extra empty paragraph blocks. CRLF normalised. +func splitBlocks(md string) []rawBlock { + normalised := strings.ReplaceAll(md, "\r\n", "\n") + lines := strings.Split(normalised, "\n") + var blocks []rawBlock + var pendingPara []string + blankRun := 0 + + flushPara := func() { + if len(pendingPara) > 0 { + blocks = append(blocks, rawBlock{kind: "paragraph", text: strings.Join(pendingPara, "\n")}) + pendingPara = nil + } + } + + for _, line := range lines { + if strings.TrimSpace(line) == "" { + if len(pendingPara) > 0 { + flushPara() + blankRun = 1 + continue + } + blankRun++ + continue + } + if kind, payload, ok := detectBlockMarker(line); ok { + flushPara() + for i := 1; i < blankRun; i++ { + blocks = append(blocks, rawBlock{kind: "paragraph", text: ""}) + } + blankRun = 0 + blocks = append(blocks, rawBlock{kind: kind, text: payload}) + continue + } + if len(pendingPara) == 0 { + for i := 1; i < blankRun; i++ { + blocks = append(blocks, rawBlock{kind: "paragraph", text: ""}) + } + } + blankRun = 0 + pendingPara = append(pendingPara, line) + } + flushPara() + return blocks +} + +// detectBlockMarker classifies a single line. Tolerates up to 3 leading +// spaces (CommonMark) before treating the line as a plain paragraph. 
+func detectBlockMarker(line string) (kind, payload string, ok bool) { + trimmed := strings.TrimLeft(line, " ") + if len(line)-len(trimmed) > 3 { + return "", "", false + } + switch { + case strings.HasPrefix(trimmed, "### "): + return "heading_3", strings.TrimSpace(trimmed[4:]), true + case strings.HasPrefix(trimmed, "## "): + return "heading_2", strings.TrimSpace(trimmed[3:]), true + case strings.HasPrefix(trimmed, "# "): + return "heading_1", strings.TrimSpace(trimmed[2:]), true + case strings.HasPrefix(trimmed, "> "): + return "blockquote", strings.TrimSpace(trimmed[2:]), true + case strings.HasPrefix(trimmed, "- "), strings.HasPrefix(trimmed, "* "): + return "list_bullet", strings.TrimSpace(trimmed[2:]), true + } + if i := indexOfNumberedMarker(trimmed); i > 0 { + return "list_numbered", strings.TrimSpace(trimmed[i:]), true + } + return "", "", false +} + +// indexOfNumberedMarker returns the byte index just past an "N. " / "N) " +// marker at the start of s, or -1 when absent. +func indexOfNumberedMarker(s string) int { + i := 0 + for i < len(s) && s[i] >= '0' && s[i] <= '9' { + i++ + } + if i == 0 || i >= len(s) { + return -1 + } + if s[i] != '.' && s[i] != ')' { + return -1 + } + if i+1 >= len(s) || s[i+1] != ' ' { + return -1 + } + return i + 2 +} + +// parseInline splits text around [label](url) hyperlinks and tokenises the +// rest into bold/italic spans. Hyperlinks become a span with Link set and +// the label's spans as Children, preserving link boundaries. +func parseInline(text string) []docforge.InlineSpan { + var out []docforge.InlineSpan + rest := text + for { + idx := strings.Index(rest, "[") + if idx < 0 { + if rest != "" { + out = append(out, parseSpans(rest)...) + } + break + } + closeBracket := strings.Index(rest[idx:], "](") + if closeBracket < 0 { + out = append(out, parseSpans(rest)...) + break + } + closeParen := strings.Index(rest[idx+closeBracket:], ")") + if closeParen < 0 { + out = append(out, parseSpans(rest)...) 
+ break + } + label := rest[idx+1 : idx+closeBracket] + url := rest[idx+closeBracket+2 : idx+closeBracket+closeParen] + if idx > 0 { + out = append(out, parseSpans(rest[:idx])...) + } + out = append(out, docforge.InlineSpan{Link: url, Children: parseSpans(label)}) + rest = rest[idx+closeBracket+closeParen+1:] + } + return out +} + +// parseSpans tokenises Markdown inline bold/italic into spans, preserving +// {{...}} placeholders verbatim (the b78a984 fix — underscores in a +// placeholder key must not be read as italic delimiters). Empty input +// yields one empty span. +func parseSpans(text string) []docforge.InlineSpan { + var out []docforge.InlineSpan + var cur strings.Builder + bold := false + italic := false + flush := func() { + if cur.Len() == 0 { + return + } + out = append(out, docforge.InlineSpan{Text: cur.String(), Bold: bold, Italic: italic}) + cur.Reset() + } + i := 0 + n := len(text) + for i < n { + if i+1 < n && text[i] == '{' && text[i+1] == '{' { + if rel := strings.Index(text[i+2:], "}}"); rel >= 0 { + end := i + 2 + rel + 2 + cur.WriteString(text[i:end]) + i = end + continue + } + } + if i+1 < n && (text[i:i+2] == "**" || text[i:i+2] == "__") { + flush() + bold = !bold + i += 2 + continue + } + if text[i] == '*' || text[i] == '_' { + flush() + italic = !italic + i++ + continue + } + cur.WriteByte(text[i]) + i++ + } + flush() + if len(out) == 0 { + out = append(out, docforge.InlineSpan{Text: ""}) + } + return out +} diff --git a/pkg/docforge/markdown/importer_test.go b/pkg/docforge/markdown/importer_test.go new file mode 100644 index 0000000..372e168 --- /dev/null +++ b/pkg/docforge/markdown/importer_test.go @@ -0,0 +1,145 @@ +package markdown + +import ( + "strings" + "testing" +) + +// Inline-span + block-marker tests, relocated from the docx walker when +// parsing moved here (t-paliad-349 slice 8). parseSpans is the inline +// tokeniser; detectBlockMarker classifies a line. 
+ +func TestParseSpans_PlaceholderWithUnderscoresIsLiteral(t *testing.T) { + // {{project.case_number}} must emit as a single non-italic span + // containing the full placeholder (the b78a984 fix). + spans := parseSpans("{{project.case_number}}") + if len(spans) != 1 { + t.Fatalf("expected 1 span; got %d (%+v)", len(spans), spans) + } + if spans[0].Italic || spans[0].Bold { + t.Errorf("placeholder must not be italic/bold; got %+v", spans[0]) + } + if spans[0].Text != "{{project.case_number}}" { + t.Errorf("placeholder text corrupted: got %q", spans[0].Text) + } +} + +func TestParseSpans_ItalicAroundPlaceholder(t *testing.T) { + spans := parseSpans("_before_ {{x.y_z}} _after_") + var saw struct { + italicBefore bool + placeholder bool + italicAfter bool + } + for _, s := range spans { + if s.Italic && s.Text == "before" { + saw.italicBefore = true + } + if !s.Italic && !s.Bold && strings.Contains(s.Text, "{{x.y_z}}") { + saw.placeholder = true + } + if s.Italic && s.Text == "after" { + saw.italicAfter = true + } + } + if !saw.italicBefore || !saw.placeholder || !saw.italicAfter { + t.Errorf("expected italic/placeholder/italic structure; got %+v", spans) + } +} + +func TestParseSpans_Plain(t *testing.T) { + spans := parseSpans("hello world") + if len(spans) != 1 || spans[0].Bold || spans[0].Italic || spans[0].Text != "hello world" { + t.Errorf("expected single plain span; got %+v", spans) + } +} + +func TestParseSpans_UnderscoreItalic(t *testing.T) { + spans := parseSpans("_emph_") + var italicHits int + for _, s := range spans { + if s.Italic && s.Text == "emph" { + italicHits++ + } + } + if italicHits != 1 { + t.Errorf("expected one italic 'emph' span; got %+v", spans) + } +} + +func TestParseSpans_UnderscoreBold(t *testing.T) { + spans := parseSpans("__strong__") + var boldHits int + for _, s := range spans { + if s.Bold && s.Text == "strong" { + boldHits++ + } + } + if boldHits != 1 { + t.Errorf("expected one bold 'strong' span; got %+v", spans) + } +} + +func 
TestDetectBlockMarker(t *testing.T) { + cases := []struct { + in string + kind string + want string + ok bool + }{ + {"# A", "heading_1", "A", true}, + {"## B", "heading_2", "B", true}, + {"### C", "heading_3", "C", true}, + {" # indented", "heading_1", "indented", true}, // up to 3 spaces tolerated + {" # too-deep", "", "", false}, // 4 spaces → not a heading + {"- bullet", "list_bullet", "bullet", true}, + {"* star", "list_bullet", "star", true}, + {"1. one", "list_numbered", "one", true}, + {"42. forty-two", "list_numbered", "forty-two", true}, + {"1) paren", "list_numbered", "paren", true}, + {"1.no-space", "", "", false}, // ordinal needs trailing space + {"> quote", "blockquote", "quote", true}, + {"plain", "", "", false}, + {"#nospace", "", "", false}, // heading needs space after hash + } + for _, tc := range cases { + t.Run(tc.in, func(t *testing.T) { + kind, payload, ok := detectBlockMarker(tc.in) + if ok != tc.ok || kind != tc.kind || payload != tc.want { + t.Errorf("detectBlockMarker(%q) = (%q,%q,%v); want (%q,%q,%v)", tc.in, kind, payload, ok, tc.kind, tc.want, tc.ok) + } + }) + } +} + +// TestImport_Document spot-checks the neutral Document the importer +// produces — block kinds, the link-span shape, and placeholder pass-through. +func TestImport_Document(t *testing.T) { + doc := Import("# Title\n\nBody **bold** and [label](http://x).\n\n- item") + if len(doc.Blocks) != 3 { + t.Fatalf("blocks = %d; want 3 (%+v)", len(doc.Blocks), doc.Blocks) + } + if doc.Blocks[0].Kind != "heading_1" { + t.Errorf("block0 kind = %q; want heading_1", doc.Blocks[0].Kind) + } + if doc.Blocks[2].Kind != "list_bullet" { + t.Errorf("block2 kind = %q; want list_bullet", doc.Blocks[2].Kind) + } + // The body paragraph carries a link span with Link set + children. 
+ var sawLink bool + for _, s := range doc.Blocks[1].Spans { + if s.Link == "http://x" && len(s.Children) > 0 { + sawLink = true + } + } + if !sawLink { + t.Errorf("body block missing link span; got %+v", doc.Blocks[1].Spans) + } +} + +func TestImport_EmptyYieldsOneEmptyParagraph(t *testing.T) { + doc := Import("") + if len(doc.Blocks) != 1 || doc.Blocks[0].Kind != "paragraph" || len(doc.Blocks[0].Spans) != 0 { + t.Errorf("empty import = %+v; want one empty paragraph block", doc.Blocks) + } +} diff --git a/pkg/docforge/model.go b/pkg/docforge/model.go new file mode 100644 index 0000000..edbffd3 --- /dev/null +++ b/pkg/docforge/model.go @@ -0,0 +1,58 @@ +package docforge + +// The neutral document model — the format-independent representation an +// importer produces and an exporter consumes (PRD §3.2). A Markdown +// importer parses source into a Document; the .docx exporter renders a +// Document into OOXML; a future PDF/HTML exporter renders the same +// Document differently. The model carries editable content only — +// placeholders ({{key}}) ride through as literal span text and are +// substituted later by the format exporter's merge pass, exactly as in +// the pre-model pipeline. +// +// Slice 8 (t-paliad-349) lands this model with two real consumers: the +// Markdown importer (pkg/docforge/markdown) and the .docx renderer +// (pkg/docforge/docx), which the shipped submission walker now routes +// through — so there is one parser, not two. + +// BlockKind is the logical kind of a block. Its string values are the +// stylemap keys a format exporter looks up (paragraph, heading_1, …), so +// the docx exporter maps Kind → Word paragraph style directly. 
type BlockKind string

// The block kinds of the neutral model. Each value is the literal string
// used as the kind's key, e.g. "heading_1" for a first-level heading.
const (
	KindParagraph    = BlockKind("paragraph")
	KindHeading1     = BlockKind("heading_1")
	KindHeading2     = BlockKind("heading_2")
	KindHeading3     = BlockKind("heading_3")
	KindListBullet   = BlockKind("list_bullet")
	KindListNumbered = BlockKind("list_numbered")
	KindBlockquote   = BlockKind("blockquote")
)

// Document is the whole editable content: an ordered list of blocks.
type Document struct {
	Blocks []Block
}

// Block is a single paragraph-level unit of a Document.
type Block struct {
	// Kind says what sort of block this is (paragraph, heading, list item, …).
	Kind BlockKind
	// Spans holds the inline content. A block with no spans is an
	// intentional empty paragraph, used for vertical spacing.
	Spans []InlineSpan
}

// InlineSpan is one run of inline content and takes one of two shapes:
//
//   - literal text: Text plus optional Bold/Italic, with Link empty and
//     Children nil;
//   - hyperlink: Link is non-empty and the label is carried in Children.
//
// A hyperlink is a span that owns its label spans, rather than a flag on
// each text span, so that link boundaries survive: two adjacent links to
// the same URL remain two separate hyperlink spans.
type InlineSpan struct {
	Text     string
	Bold     bool
	Italic   bool
	Link     string       // non-empty → this span is a hyperlink to Link
	Children []InlineSpan // hyperlink label content (only when Link != "")
}