feat(docforge): slice 8 — neutral model + Markdown importer + Exporter iface (t-paliad-349)
The final slice: land the format-neutral document model with REAL consumers
and unify the Markdown parser — no duplication, byte-identical output.
Neutral model (pkg/docforge/model.go): Document / Block / InlineSpan.
BlockKind values are the stylemap keys. A hyperlink is a span with Link set
+ Children (the label's spans), preserving link boundaries so adjacent
same-URL links stay distinct — byte-exact with the pre-model walker.
Markdown importer (pkg/docforge/markdown): Import(md) → Document. The SINGLE
Markdown parser for docforge — block split, marker detection, inline
bold/italic/link tokenisation, {{placeholder}} pass-through (the b78a984
fix). Relocated out of the docx walker.
docx renderer (pkg/docforge/docx/markdown.go): now RENDERS a Document →
OOXML (RenderDocumentToOOXML); RenderMarkdownToOOXML[WithStyles] = render(
markdown.Import(md)). The shipped submission walker routes through the model,
so there is one parser, not two. The comprehensive byte-exact render tests
(RenderMarkdownToOOXML_*) all PASS unchanged = output identical.
Exporter interface (pkg/docforge/exporter.go, PRD §4 B4): Exporter{Format,
MIMEType, RenderBody(Document)} with the .docx impl (pkg/docforge/docx/
exporter.go). The seam a future PDF/HTML exporter slots into.
Tests: parser tests relocated to the markdown pkg (parseSpans/detectBlockMarker)
+ new importer Document tests + exporter conformance test.
Verification: go build/vet clean; gofmt clean; full NO-DB test suite GREEN
(authoritative — proves no regression); docforge byte-exact render oracle
PASS; composer live test renders through the rewired walker (PASS); bun build
+ bun test 274/274. The shared-DB live run fails ~85 tests across unrelated
services because of a harness pq-42P08 $1-type seeding quirk and a stale
deadline_rules test — these failures are systemic/environmental (the no-DB
run is clean) and are not caused by this change.
docforge train complete: 8 slices, the engine extracted + cleaned + a working
author→generate→export loop on uploaded templates, plus the neutral model +
importer + exporter seam for future formats/consumers.
m/paliad#157
This commit is contained in:
@@ -240,7 +240,7 @@ var anchorKeyRegex = regexp.MustCompile(`^[A-Za-z0-9_]+$`)
|
|||||||
// the body — from the start of the opening anchor's <w:p> element
|
// the body — from the start of the opening anchor's <w:p> element
|
||||||
// through the end of the closing anchor's </w:p>.
|
// through the end of the closing anchor's </w:p>.
|
||||||
type anchorPair struct {
|
type anchorPair struct {
|
||||||
key string
|
key string
|
||||||
openStart int // start of <w:p> for the opening anchor
|
openStart int // start of <w:p> for the opening anchor
|
||||||
closeEnd int // index just past </w:p> for the closing anchor
|
closeEnd int // index just past </w:p> for the closing anchor
|
||||||
}
|
}
|
||||||
@@ -251,10 +251,10 @@ type anchorPair struct {
|
|||||||
// span is non-overlapping.
|
// span is non-overlapping.
|
||||||
func findAllAnchorPairs(body string) []anchorPair {
|
func findAllAnchorPairs(body string) []anchorPair {
|
||||||
type marker struct {
|
type marker struct {
|
||||||
key string
|
key string
|
||||||
paraStart int
|
paraStart int
|
||||||
paraEnd int
|
paraEnd int
|
||||||
isOpen bool
|
isOpen bool
|
||||||
}
|
}
|
||||||
var markers []marker
|
var markers []marker
|
||||||
|
|
||||||
|
|||||||
39
pkg/docforge/docx/exporter.go
Normal file
39
pkg/docforge/docx/exporter.go
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
package docx
|
||||||
|
|
||||||
|
import "mgit.msbls.de/m/paliad/pkg/docforge"
|
||||||
|
|
||||||
|
// Exporter is the .docx implementation of docforge.Exporter — it renders a
|
||||||
|
// neutral Document to OOXML body markup (t-paliad-349 slice 8). The
|
||||||
|
// stylemap (block kind → Word paragraph style) and the optional hyperlink
|
||||||
|
// allocator are baked in at construction, so RenderBody matches the
|
||||||
|
// interface's format-neutral signature.
|
||||||
|
//
|
||||||
|
// This is the seam a future PDF/HTML exporter slots into: implement
|
||||||
|
// docforge.Exporter, no engine change. The submission composer can render
|
||||||
|
// section content through this exporter instead of calling
|
||||||
|
// RenderDocumentToOOXML directly once a second format exists.
|
||||||
|
type Exporter struct {
|
||||||
|
Stylemap map[string]string
|
||||||
|
Links HyperlinkAllocator
|
||||||
|
}
|
||||||
|
|
||||||
|
// compile-time conformance.
|
||||||
|
var _ docforge.Exporter = Exporter{}
|
||||||
|
|
||||||
|
// NewExporter builds a .docx exporter with the given stylemap + allocator.
|
||||||
|
func NewExporter(stylemap map[string]string, links HyperlinkAllocator) Exporter {
|
||||||
|
return Exporter{Stylemap: stylemap, Links: links}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Format returns the format id.
|
||||||
|
func (Exporter) Format() string { return "docx" }
|
||||||
|
|
||||||
|
// MIMEType returns the .docx container MIME type.
|
||||||
|
func (Exporter) MIMEType() string {
|
||||||
|
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||||
|
}
|
||||||
|
|
||||||
|
// RenderBody renders the Document to OOXML paragraph markup.
|
||||||
|
func (e Exporter) RenderBody(doc docforge.Document) ([]byte, error) {
|
||||||
|
return []byte(RenderDocumentToOOXML(doc, e.Stylemap, e.Links)), nil
|
||||||
|
}
|
||||||
34
pkg/docforge/docx/exporter_test.go
Normal file
34
pkg/docforge/docx/exporter_test.go
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
package docx
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"mgit.msbls.de/m/paliad/pkg/docforge"
|
||||||
|
"mgit.msbls.de/m/paliad/pkg/docforge/markdown"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestExporter_RenderBodyMatchesWalker(t *testing.T) {
|
||||||
|
exp := NewExporter(map[string]string{"paragraph": "Body"}, nil)
|
||||||
|
if exp.Format() != "docx" {
|
||||||
|
t.Errorf("Format = %q; want docx", exp.Format())
|
||||||
|
}
|
||||||
|
if !strings.Contains(exp.MIMEType(), "wordprocessingml.document") {
|
||||||
|
t.Errorf("MIMEType = %q", exp.MIMEType())
|
||||||
|
}
|
||||||
|
|
||||||
|
md := "Hello **world**\n\n- item"
|
||||||
|
// The Exporter must produce exactly what the walker entry point does
|
||||||
|
// for the same input (both go markdown.Import → RenderDocumentToOOXML).
|
||||||
|
body, err := exp.RenderBody(markdown.Import(md))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("RenderBody: %v", err)
|
||||||
|
}
|
||||||
|
want := RenderMarkdownToOOXMLWithStyles(md, map[string]string{"paragraph": "Body"}, nil)
|
||||||
|
if string(body) != want {
|
||||||
|
t.Errorf("RenderBody mismatch:\n got %q\nwant %q", body, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Exporter satisfies docforge.Exporter (compile-time assertion; nothing is checked at runtime).
|
||||||
|
var _ docforge.Exporter = Exporter{}
|
||||||
@@ -1,249 +1,78 @@
|
|||||||
package docx
|
package docx
|
||||||
|
|
||||||
// Markdown → OOXML walker for Composer section content (t-paliad-313
|
// Markdown → OOXML rendering for Composer section content (t-paliad-313
|
||||||
// Slice B, design doc §9.2).
|
// Slice B/D; restructured in t-paliad-349 slice 8).
|
||||||
//
|
//
|
||||||
// Scope per the head's Slice B brief: paragraphs + inline bold/italic
|
// Parsing now lives in pkg/docforge/markdown, which produces the neutral
|
||||||
// only. Headings, lists, blockquote, links land in Slice D's rich-prose
|
// docforge.Document. This file renders that Document into OOXML paragraph
|
||||||
// pass. This walker is intentionally minimal — every Markdown construct
|
// elements (<w:p>…</w:p>) ready to splice into a .docx body. There is one
|
||||||
// it doesn't recognise is rendered as a plain paragraph so the lawyer's
|
// Markdown parser for docforge; this is the .docx exporter for its model.
|
||||||
// prose round-trips losslessly even when they hit Markdown the walker
|
|
||||||
// doesn't yet understand.
|
|
||||||
//
|
//
|
||||||
// The output uses the base's stylemap.paragraph entry for the
|
// Output uses the base's stylemap entry for each block kind on the
|
||||||
// <w:pStyle> on each paragraph so the styling matches the base's
|
// <w:pStyle>, so styling matches the base's typography (HLpat-Body-B0 on
|
||||||
// typography (HLpat-Body-B0 on the HLC base, Normal on the neutral
|
// the HLC base, Normal on the neutral base, etc.). Placeholders ({{key}})
|
||||||
// base, etc.).
|
// ride through as literal run text and are substituted by the placeholder
|
||||||
//
|
// pass after assembly.
|
||||||
// Placeholders ({{path.dot.notation}}) are preserved verbatim — they
|
|
||||||
// pass through the walker untouched and get substituted by the v1
|
|
||||||
// SubmissionRenderer's placeholder pass after the composer assembly.
|
|
||||||
//
|
|
||||||
// Grammar supported:
|
|
||||||
//
|
|
||||||
// - Blank line → paragraph break
|
|
||||||
// - `**bold**` → <w:r><w:rPr><w:b/></w:rPr><w:t>…</w:t></w:r>
|
|
||||||
// - `*italic*` or `_italic_` → <w:r><w:rPr><w:i/></w:rPr>…</w:r>
|
|
||||||
// - Otherwise → plain text run
|
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
|
"mgit.msbls.de/m/paliad/pkg/docforge"
|
||||||
|
"mgit.msbls.de/m/paliad/pkg/docforge/markdown"
|
||||||
)
|
)
|
||||||
|
|
||||||
// HyperlinkAllocator hands the walker a `rId` for each external URL
|
// HyperlinkAllocator hands the renderer a `rId` for each external URL it
|
||||||
// it encounters in `[label](url)` inline links. The composer's
|
// encounters in `[label](url)` inline links. The composer's post-pass uses
|
||||||
// post-pass uses these allocations to mutate
|
// these allocations to mutate `word/_rels/document.xml.rels` so the emitted
|
||||||
// `word/_rels/document.xml.rels` so the emitted `<w:hyperlink
|
// `<w:hyperlink r:id="…">` elements resolve. Pass nil to drop links to
|
||||||
// r:id="…">` elements resolve correctly. Pass nil to drop links to
|
// plain text (the label survives, the URL doesn't render). t-paliad-316.
|
||||||
// plain text (the label survives, the URL doesn't render).
|
|
||||||
//
|
|
||||||
// t-paliad-316 Slice D.
|
|
||||||
type HyperlinkAllocator func(url string) string
|
type HyperlinkAllocator func(url string) string
|
||||||
|
|
||||||
// RenderMarkdownToOOXML renders the given Markdown source into OOXML
|
// RenderMarkdownToOOXML renders Markdown into OOXML paragraphs with a
|
||||||
// paragraph elements (`<w:p>…</w:p>`), suitable for splicing into a
|
// single paragraph style. Slice B back-compat wrapper.
|
||||||
// .docx body. Each paragraph carries `<w:pStyle w:val="<paragraphStyle>"/>`
|
|
||||||
// when paragraphStyle is non-empty.
|
|
||||||
//
|
|
||||||
// Slice B shipped paragraphs + bold/italic. Slice D extends to
|
|
||||||
// headings (h1/h2/h3), bullet/numbered lists, blockquote, and inline
|
|
||||||
// hyperlinks via the optional HyperlinkAllocator.
|
|
||||||
//
|
|
||||||
// stylemap supplies the paragraph-style names for each kind:
|
|
||||||
// stylemap["paragraph"] — default body
|
|
||||||
// stylemap["heading_1/2/3"] — heading levels
|
|
||||||
// stylemap["list_bullet"] — bullet list paragraph style
|
|
||||||
// stylemap["list_numbered"] — numbered list paragraph style
|
|
||||||
// stylemap["blockquote"] — blockquote
|
|
||||||
// Missing entries fall back to the "paragraph" style.
|
|
||||||
//
|
|
||||||
// Empty input renders one empty paragraph so the splice site is
|
|
||||||
// well-formed even when the lawyer hasn't typed anything in this
|
|
||||||
// section.
|
|
||||||
func RenderMarkdownToOOXML(md, paragraphStyle string) string {
|
func RenderMarkdownToOOXML(md, paragraphStyle string) string {
|
||||||
return RenderMarkdownToOOXMLWithStyles(md, map[string]string{"paragraph": paragraphStyle}, nil)
|
return RenderMarkdownToOOXMLWithStyles(md, map[string]string{"paragraph": paragraphStyle}, nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
// RenderMarkdownToOOXMLWithStyles is the full Slice-D-aware entry
|
// RenderMarkdownToOOXMLWithStyles parses Markdown into a docforge.Document
|
||||||
// point. Slice B's RenderMarkdownToOOXML is a wrapper for back-compat.
|
// and renders it to OOXML. stylemap maps each block kind (paragraph,
|
||||||
|
// heading_1/2/3, list_bullet, list_numbered, blockquote) to a Word
|
||||||
|
// paragraph style; missing entries fall back to the "paragraph" style.
|
||||||
func RenderMarkdownToOOXMLWithStyles(md string, stylemap map[string]string, links HyperlinkAllocator) string {
|
func RenderMarkdownToOOXMLWithStyles(md string, stylemap map[string]string, links HyperlinkAllocator) string {
|
||||||
|
return RenderDocumentToOOXML(markdown.Import(md), stylemap, links)
|
||||||
|
}
|
||||||
|
|
||||||
|
// RenderDocumentToOOXML renders a neutral Document to OOXML paragraphs —
|
||||||
|
// the .docx side of the docforge importer→model→exporter pipeline. Any
|
||||||
|
// Document (Markdown today, a foreign-doc importer later) renders the same
|
||||||
|
// way.
|
||||||
|
func RenderDocumentToOOXML(doc docforge.Document, stylemap map[string]string, links HyperlinkAllocator) string {
|
||||||
defaultStyle := stylemap["paragraph"]
|
defaultStyle := stylemap["paragraph"]
|
||||||
if md == "" {
|
|
||||||
return emptyParagraph(defaultStyle)
|
|
||||||
}
|
|
||||||
blocks := splitMarkdownBlocks(md)
|
|
||||||
if len(blocks) == 0 {
|
|
||||||
return emptyParagraph(defaultStyle)
|
|
||||||
}
|
|
||||||
// Numbered-list counter resets on every non-numbered block so
|
// Numbered-list counter resets on every non-numbered block so
|
||||||
// "1. A\n2. B\n\n1. C" renders as 1./2./1. (the lawyer's input
|
// "1. A\n2. B\n\n1. C" renders 1./2./1. — the input determined the
|
||||||
// determined the ordinal, the walker just renders).
|
// ordinal, the renderer just emits it.
|
||||||
numberedCounter := 0
|
numbered := 0
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
for _, blk := range blocks {
|
for _, blk := range doc.Blocks {
|
||||||
style := stylemap[blk.styleKey]
|
style := stylemap[string(blk.Kind)]
|
||||||
if style == "" {
|
if style == "" {
|
||||||
style = defaultStyle
|
style = defaultStyle
|
||||||
}
|
}
|
||||||
if blk.styleKey == "list_numbered" {
|
if blk.Kind == docforge.KindListNumbered {
|
||||||
numberedCounter++
|
numbered++
|
||||||
} else {
|
} else {
|
||||||
numberedCounter = 0
|
numbered = 0
|
||||||
}
|
}
|
||||||
b.WriteString(renderBlockParagraph(blk, style, links, numberedCounter))
|
b.WriteString(renderBlock(blk, style, links, numbered))
|
||||||
}
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// mdBlock is one rendered paragraph: a kind (paragraph / heading_*
|
// renderBlock emits one <w:p> for a block. List blocks get a visible
|
||||||
// / list_bullet / list_numbered / blockquote) and the inline content
|
// "• " / "N. " prefix run (the base stylemap handles indentation if it
|
||||||
// text. List markers, heading hashes, blockquote `> ` etc. are
|
// defines a list style; the prefix at least surfaces the structure).
|
||||||
// stripped from the text before storage.
|
func renderBlock(blk docforge.Block, paragraphStyle string, links HyperlinkAllocator, numberedOrdinal int) string {
|
||||||
type mdBlock struct {
|
|
||||||
styleKey string // "paragraph" | "heading_1" | "heading_2" | "heading_3" | "list_bullet" | "list_numbered" | "blockquote"
|
|
||||||
text string
|
|
||||||
}
|
|
||||||
|
|
||||||
// splitMarkdownBlocks parses the source into a sequence of blocks,
|
|
||||||
// detecting heading / list / blockquote prefixes line-by-line. Blank
|
|
||||||
// lines split paragraph runs (same semantics as splitMarkdownParagraphs)
|
|
||||||
// but each line is also tagged with its block kind.
|
|
||||||
//
|
|
||||||
// Lines that look like block markers don't merge with their neighbours
|
|
||||||
// even across blank lines — every list / heading / blockquote line is
|
|
||||||
// its own block in the output. A run of unmarked lines collapses into
|
|
||||||
// one "paragraph" block (so soft line breaks inside a paragraph still
|
|
||||||
// concatenate).
|
|
||||||
//
|
|
||||||
// CRLF normalised to LF before parsing.
|
|
||||||
func splitMarkdownBlocks(md string) []mdBlock {
|
|
||||||
normalised := strings.ReplaceAll(md, "\r\n", "\n")
|
|
||||||
lines := strings.Split(normalised, "\n")
|
|
||||||
var blocks []mdBlock
|
|
||||||
var pendingPara []string
|
|
||||||
blankRun := 0
|
|
||||||
|
|
||||||
flushPara := func() {
|
|
||||||
if len(pendingPara) > 0 {
|
|
||||||
blocks = append(blocks, mdBlock{styleKey: "paragraph", text: strings.Join(pendingPara, "\n")})
|
|
||||||
pendingPara = nil
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for _, raw := range lines {
|
|
||||||
line := raw
|
|
||||||
if strings.TrimSpace(line) == "" {
|
|
||||||
if len(pendingPara) > 0 {
|
|
||||||
flushPara()
|
|
||||||
blankRun = 1
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
blankRun++
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// Detect heading / list / blockquote markers BEFORE we accumulate
|
|
||||||
// into the paragraph buffer.
|
|
||||||
kind, payload, ok := detectBlockMarker(line)
|
|
||||||
if ok {
|
|
||||||
flushPara()
|
|
||||||
// Emit spacing paragraphs equivalent to (blankRun - 1) extra.
|
|
||||||
for i := 1; i < blankRun; i++ {
|
|
||||||
blocks = append(blocks, mdBlock{styleKey: "paragraph", text: ""})
|
|
||||||
}
|
|
||||||
blankRun = 0
|
|
||||||
blocks = append(blocks, mdBlock{styleKey: kind, text: payload})
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// Plain paragraph line.
|
|
||||||
if len(pendingPara) == 0 {
|
|
||||||
// Starting a new paragraph after a blank run — emit
|
|
||||||
// (blankRun-1) extra empty paragraphs for vertical spacing.
|
|
||||||
for i := 1; i < blankRun; i++ {
|
|
||||||
blocks = append(blocks, mdBlock{styleKey: "paragraph", text: ""})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
blankRun = 0
|
|
||||||
pendingPara = append(pendingPara, line)
|
|
||||||
}
|
|
||||||
flushPara()
|
|
||||||
return blocks
|
|
||||||
}
|
|
||||||
|
|
||||||
// detectBlockMarker classifies a single line. Returns (styleKey,
|
|
||||||
// payload-with-marker-stripped, true) for recognised markers; false
|
|
||||||
// for plain paragraph lines.
|
|
||||||
//
|
|
||||||
// Recognised markers (Slice D):
|
|
||||||
// # Heading → heading_1
|
|
||||||
// ## Heading → heading_2
|
|
||||||
// ### Heading → heading_3
|
|
||||||
// - item / * item → list_bullet
|
|
||||||
// 1. item / 2. item ... → list_numbered (any positive integer)
|
|
||||||
// > quote → blockquote
|
|
||||||
//
|
|
||||||
// Leading whitespace inside the line is tolerated up to 3 spaces (per
|
|
||||||
// CommonMark) so the lawyer's contentEditable indentation doesn't
|
|
||||||
// hide the marker.
|
|
||||||
func detectBlockMarker(line string) (string, string, bool) {
|
|
||||||
trimmed := strings.TrimLeft(line, " ")
|
|
||||||
// Cap to 3 spaces of leading indent — beyond that, treat as a
|
|
||||||
// regular paragraph line (matches CommonMark).
|
|
||||||
if len(line)-len(trimmed) > 3 {
|
|
||||||
return "", "", false
|
|
||||||
}
|
|
||||||
if strings.HasPrefix(trimmed, "### ") {
|
|
||||||
return "heading_3", strings.TrimSpace(trimmed[4:]), true
|
|
||||||
}
|
|
||||||
if strings.HasPrefix(trimmed, "## ") {
|
|
||||||
return "heading_2", strings.TrimSpace(trimmed[3:]), true
|
|
||||||
}
|
|
||||||
if strings.HasPrefix(trimmed, "# ") {
|
|
||||||
return "heading_1", strings.TrimSpace(trimmed[2:]), true
|
|
||||||
}
|
|
||||||
if strings.HasPrefix(trimmed, "> ") {
|
|
||||||
return "blockquote", strings.TrimSpace(trimmed[2:]), true
|
|
||||||
}
|
|
||||||
if strings.HasPrefix(trimmed, "- ") || strings.HasPrefix(trimmed, "* ") {
|
|
||||||
return "list_bullet", strings.TrimSpace(trimmed[2:]), true
|
|
||||||
}
|
|
||||||
// Numbered: "N. " where N is one or more digits.
|
|
||||||
if i := indexOfNumberedMarker(trimmed); i > 0 {
|
|
||||||
return "list_numbered", strings.TrimSpace(trimmed[i:]), true
|
|
||||||
}
|
|
||||||
return "", "", false
|
|
||||||
}
|
|
||||||
|
|
||||||
// indexOfNumberedMarker checks for "N. " or "N) " at the start of the
|
|
||||||
// trimmed line; returns the byte index just past the marker, or -1 if
|
|
||||||
// no marker present.
|
|
||||||
func indexOfNumberedMarker(s string) int {
|
|
||||||
i := 0
|
|
||||||
for i < len(s) && s[i] >= '0' && s[i] <= '9' {
|
|
||||||
i++
|
|
||||||
}
|
|
||||||
if i == 0 {
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
if i >= len(s) {
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
if s[i] != '.' && s[i] != ')' {
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
if i+1 >= len(s) || s[i+1] != ' ' {
|
|
||||||
return -1
|
|
||||||
}
|
|
||||||
return i + 2
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderBlockParagraph emits one `<w:p>` for a block. List blocks
|
|
||||||
// keep the same paragraph style as a default paragraph (the Slice D
|
|
||||||
// design's contract — list styles come from the base's stylemap and
|
|
||||||
// Word's numbering.xml is honoured by adding a leading bullet/number
|
|
||||||
// prefix in the rendered text). This keeps the composer free of
|
|
||||||
// numbering.xml mutations.
|
|
||||||
func renderBlockParagraph(blk mdBlock, paragraphStyle string, links HyperlinkAllocator, numberedOrdinal int) string {
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<w:p>`)
|
b.WriteString(`<w:p>`)
|
||||||
if paragraphStyle != "" {
|
if paragraphStyle != "" {
|
||||||
@@ -251,110 +80,61 @@ func renderBlockParagraph(blk mdBlock, paragraphStyle string, links HyperlinkAll
|
|||||||
b.WriteString(xmlAttrEscape(paragraphStyle))
|
b.WriteString(xmlAttrEscape(paragraphStyle))
|
||||||
b.WriteString(`"/></w:pPr>`)
|
b.WriteString(`"/></w:pPr>`)
|
||||||
}
|
}
|
||||||
if blk.text == "" {
|
// An empty block is an intentional empty paragraph: one empty run.
|
||||||
b.WriteString(`<w:r><w:t xml:space="preserve"></w:t></w:r>`)
|
if len(blk.Spans) == 0 {
|
||||||
b.WriteString(`</w:p>`)
|
b.WriteString(`<w:r><w:t xml:space="preserve"></w:t></w:r></w:p>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
text := blk.text
|
switch blk.Kind {
|
||||||
// List blocks emit a visible "• " / "N. " prefix run. The
|
case docforge.KindListBullet:
|
||||||
// stylemap entry handles paragraph indentation if the base
|
|
||||||
// defines a list paragraph style; otherwise the prefix at least
|
|
||||||
// surfaces the structure in plain Word. Lawyers who want Word's
|
|
||||||
// auto-numbering reapply a list style post-export.
|
|
||||||
switch blk.styleKey {
|
|
||||||
case "list_bullet":
|
|
||||||
b.WriteString(`<w:r><w:t xml:space="preserve">• </w:t></w:r>`)
|
b.WriteString(`<w:r><w:t xml:space="preserve">• </w:t></w:r>`)
|
||||||
case "list_numbered":
|
case docforge.KindListNumbered:
|
||||||
ordinal := numberedOrdinal
|
ordinal := numberedOrdinal
|
||||||
if ordinal <= 0 {
|
if ordinal <= 0 {
|
||||||
ordinal = 1
|
ordinal = 1
|
||||||
}
|
}
|
||||||
b.WriteString(`<w:r><w:t xml:space="preserve">`)
|
b.WriteString(`<w:r><w:t xml:space="preserve">`)
|
||||||
b.WriteString(fmt.Sprintf("%d. ", ordinal))
|
b.WriteString(strconv.Itoa(ordinal))
|
||||||
b.WriteString(`</w:t></w:r>`)
|
b.WriteString(`. </w:t></w:r>`)
|
||||||
}
|
}
|
||||||
for _, run := range parseInlineRuns(text, links) {
|
for _, span := range blk.Spans {
|
||||||
b.WriteString(run)
|
b.WriteString(renderInlineSpan(span, links))
|
||||||
}
|
}
|
||||||
b.WriteString(`</w:p>`)
|
b.WriteString(`</w:p>`)
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseInlineRuns extracts inline spans + hyperlink runs and serialises
|
// renderInlineSpan emits one span. A hyperlink span (Link != "") becomes a
|
||||||
// each to OOXML. Hyperlinks become `<w:hyperlink r:id="RID">…runs…</w:hyperlink>`
|
// <w:hyperlink r:id="…"> wrapping its children when an allocator yields a
|
||||||
// where RID comes from the HyperlinkAllocator.
|
// rId; otherwise the label children render as plain runs (URL dropped).
|
||||||
func parseInlineRuns(text string, links HyperlinkAllocator) []string {
|
func renderInlineSpan(span docforge.InlineSpan, links HyperlinkAllocator) string {
|
||||||
// Phase 1: find all hyperlink spans `[label](url)` and split the
|
if span.Link != "" {
|
||||||
// text around them.
|
if links != nil {
|
||||||
type segment struct {
|
if rid := links(span.Link); rid != "" {
|
||||||
text string
|
|
||||||
isLink bool
|
|
||||||
url string
|
|
||||||
}
|
|
||||||
var segs []segment
|
|
||||||
rest := text
|
|
||||||
for {
|
|
||||||
idx := strings.Index(rest, "[")
|
|
||||||
if idx < 0 {
|
|
||||||
if rest != "" {
|
|
||||||
segs = append(segs, segment{text: rest})
|
|
||||||
}
|
|
||||||
break
|
|
||||||
}
|
|
||||||
// Find matching closing bracket, then a "(" right after.
|
|
||||||
closeBracket := strings.Index(rest[idx:], "](")
|
|
||||||
if closeBracket < 0 {
|
|
||||||
segs = append(segs, segment{text: rest})
|
|
||||||
break
|
|
||||||
}
|
|
||||||
closeParen := strings.Index(rest[idx+closeBracket:], ")")
|
|
||||||
if closeParen < 0 {
|
|
||||||
segs = append(segs, segment{text: rest})
|
|
||||||
break
|
|
||||||
}
|
|
||||||
// idx = start of "["
|
|
||||||
// idx+closeBracket = position of "]"
|
|
||||||
// idx+closeBracket+1 = position of "("
|
|
||||||
// idx+closeBracket+closeParen = position of ")"
|
|
||||||
label := rest[idx+1 : idx+closeBracket]
|
|
||||||
url := rest[idx+closeBracket+2 : idx+closeBracket+closeParen]
|
|
||||||
if idx > 0 {
|
|
||||||
segs = append(segs, segment{text: rest[:idx]})
|
|
||||||
}
|
|
||||||
segs = append(segs, segment{text: label, isLink: true, url: url})
|
|
||||||
rest = rest[idx+closeBracket+closeParen+1:]
|
|
||||||
}
|
|
||||||
|
|
||||||
var runs []string
|
|
||||||
for _, seg := range segs {
|
|
||||||
if seg.isLink && links != nil {
|
|
||||||
rid := links(seg.url)
|
|
||||||
if rid != "" {
|
|
||||||
var hb strings.Builder
|
var hb strings.Builder
|
||||||
hb.WriteString(`<w:hyperlink r:id="`)
|
hb.WriteString(`<w:hyperlink r:id="`)
|
||||||
hb.WriteString(xmlAttrEscape(rid))
|
hb.WriteString(xmlAttrEscape(rid))
|
||||||
hb.WriteString(`">`)
|
hb.WriteString(`">`)
|
||||||
for _, span := range parseInlineSpans(seg.text) {
|
for _, child := range span.Children {
|
||||||
hb.WriteString(renderRunWithLinkStyle(span))
|
hb.WriteString(renderRunWithLinkStyle(child))
|
||||||
}
|
}
|
||||||
hb.WriteString(`</w:hyperlink>`)
|
hb.WriteString(`</w:hyperlink>`)
|
||||||
runs = append(runs, hb.String())
|
return hb.String()
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for _, span := range parseInlineSpans(seg.text) {
|
// No allocator / no rId — render the label as plain runs.
|
||||||
runs = append(runs, renderRun(span))
|
var fb strings.Builder
|
||||||
|
for _, child := range span.Children {
|
||||||
|
fb.WriteString(renderRun(child))
|
||||||
}
|
}
|
||||||
|
return fb.String()
|
||||||
}
|
}
|
||||||
return runs
|
return renderRun(span)
|
||||||
}
|
}
|
||||||
|
|
||||||
// renderRunWithLinkStyle emits a hyperlink child run. Same B/I support
|
// renderRunWithLinkStyle emits a hyperlink child run with Word's built-in
|
||||||
// as renderRun, but additionally tags the run with the "Hyperlink"
|
// "Hyperlink" character style (colour + underline), plus B/I.
|
||||||
// character style (Word's built-in) so the link renders in the
|
func renderRunWithLinkStyle(span docforge.InlineSpan) string {
|
||||||
// document's hyperlink colour + underline.
|
|
||||||
func renderRunWithLinkStyle(span inlineSpan) string {
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<w:r><w:rPr><w:rStyle w:val="Hyperlink"/>`)
|
b.WriteString(`<w:r><w:rPr><w:rStyle w:val="Hyperlink"/>`)
|
||||||
if span.Bold {
|
if span.Bold {
|
||||||
@@ -369,85 +149,8 @@ func renderRunWithLinkStyle(span inlineSpan) string {
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// inlineSpan is one piece of inline content: a text payload plus
|
// renderRun emits one <w:r> for a plain (text/bold/italic) span.
|
||||||
// formatting flags. Bold and italic are independent — `***both***`
|
func renderRun(span docforge.InlineSpan) string {
|
||||||
// produces one span with both flags set.
|
|
||||||
type inlineSpan struct {
|
|
||||||
Text string
|
|
||||||
Bold bool
|
|
||||||
Italic bool
|
|
||||||
}
|
|
||||||
|
|
||||||
// parseInlineSpans tokenises Markdown inline formatting into runs of
|
|
||||||
// (text, bold, italic). The grammar is intentionally narrow:
|
|
||||||
//
|
|
||||||
// - `**…**` → bold
|
|
||||||
// - `__…__` → bold (Markdown alternate)
|
|
||||||
// - `*…*` → italic
|
|
||||||
// - `_…_` → italic (Markdown alternate)
|
|
||||||
// - Anything else flows through as plain text.
|
|
||||||
//
|
|
||||||
// Unbalanced delimiters fall through as literal characters — the
|
|
||||||
// walker never errors on malformed Markdown. Nested formatting (e.g.
|
|
||||||
// `**bold *bold-italic* bold**`) toggles flags as it walks.
|
|
||||||
func parseInlineSpans(text string) []inlineSpan {
|
|
||||||
var out []inlineSpan
|
|
||||||
var cur strings.Builder
|
|
||||||
bold := false
|
|
||||||
italic := false
|
|
||||||
flush := func() {
|
|
||||||
if cur.Len() == 0 {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
out = append(out, inlineSpan{Text: cur.String(), Bold: bold, Italic: italic})
|
|
||||||
cur.Reset()
|
|
||||||
}
|
|
||||||
i := 0
|
|
||||||
n := len(text)
|
|
||||||
for i < n {
|
|
||||||
// Preserve {{...}} placeholders verbatim. Underscores and
|
|
||||||
// other Markdown-significant chars inside a placeholder key
|
|
||||||
// (e.g. {{project.case_number}}) must not be interpreted as
|
|
||||||
// bold/italic delimiters — otherwise the key gets stripped of
|
|
||||||
// its underscores and the v1 placeholder pass looks up the
|
|
||||||
// wrong key, surfacing [KEIN WERT: project.casenumber] in the
|
|
||||||
// preview.
|
|
||||||
if i+1 < n && text[i] == '{' && text[i+1] == '{' {
|
|
||||||
rel := strings.Index(text[i+2:], "}}")
|
|
||||||
if rel >= 0 {
|
|
||||||
end := i + 2 + rel + 2
|
|
||||||
cur.WriteString(text[i:end])
|
|
||||||
i = end
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// Unmatched {{ — fall through to plain character handling.
|
|
||||||
}
|
|
||||||
// Bold delimiters first (longer match wins over italic).
|
|
||||||
if i+1 < n && (text[i:i+2] == "**" || text[i:i+2] == "__") {
|
|
||||||
flush()
|
|
||||||
bold = !bold
|
|
||||||
i += 2
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
if text[i] == '*' || text[i] == '_' {
|
|
||||||
flush()
|
|
||||||
italic = !italic
|
|
||||||
i++
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
cur.WriteByte(text[i])
|
|
||||||
i++
|
|
||||||
}
|
|
||||||
flush()
|
|
||||||
if len(out) == 0 {
|
|
||||||
out = append(out, inlineSpan{Text: ""})
|
|
||||||
}
|
|
||||||
return out
|
|
||||||
}
|
|
||||||
|
|
||||||
// renderRun emits one `<w:r>` element for an inline span. Empty text
|
|
||||||
// spans render as empty runs (Word accepts them; they're harmless).
|
|
||||||
func renderRun(span inlineSpan) string {
|
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
b.WriteString(`<w:r>`)
|
b.WriteString(`<w:r>`)
|
||||||
if span.Bold || span.Italic {
|
if span.Bold || span.Italic {
|
||||||
@@ -466,34 +169,16 @@ func renderRun(span inlineSpan) string {
|
|||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// emptyParagraph returns one empty `<w:p>` with the given style. Used
|
// xmlTextEscape escapes the XML-significant characters for <w:t> content.
|
||||||
// when a section's content_md is empty so the splice site stays
|
// Quotes/apostrophes are legal in element text — not escaped.
|
||||||
// well-formed.
|
|
||||||
func emptyParagraph(paragraphStyle string) string {
|
|
||||||
var b strings.Builder
|
|
||||||
b.WriteString(`<w:p>`)
|
|
||||||
if paragraphStyle != "" {
|
|
||||||
b.WriteString(`<w:pPr><w:pStyle w:val="`)
|
|
||||||
b.WriteString(xmlAttrEscape(paragraphStyle))
|
|
||||||
b.WriteString(`"/></w:pPr>`)
|
|
||||||
}
|
|
||||||
b.WriteString(`<w:r><w:t xml:space="preserve"></w:t></w:r></w:p>`)
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
// xmlTextEscape escapes the XML-significant characters for safe insertion
// into <w:t> content: & becomes &amp;, < becomes &lt; and > becomes &gt;.
//
// The ampersand is replaced first so that the entities written by the
// later passes are not themselves re-escaped. Quotes and apostrophes are
// legal inside element text content, so they are left untouched.
func xmlTextEscape(s string) string {
	s = strings.ReplaceAll(s, "&", "&amp;")
	s = strings.ReplaceAll(s, "<", "&lt;")
	s = strings.ReplaceAll(s, ">", "&gt;")
	return s
}
|
||||||
|
|
||||||
// xmlAttrEscape escapes for safe insertion into an attribute value
|
// xmlAttrEscape escapes for an attribute value (e.g. <w:pStyle w:val="…"/>).
|
||||||
// (e.g. `<w:pStyle w:val="…"/>`).
|
|
||||||
func xmlAttrEscape(s string) string {
|
func xmlAttrEscape(s string) string {
|
||||||
s = strings.ReplaceAll(s, "&", "&")
|
s = strings.ReplaceAll(s, "&", "&")
|
||||||
s = strings.ReplaceAll(s, "<", "<")
|
s = strings.ReplaceAll(s, "<", "<")
|
||||||
|
|||||||
@@ -112,46 +112,6 @@ func TestRenderMarkdownToOOXML_PlaceholderUnderscoresPreserved(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseInlineSpans_PlaceholderWithUnderscoresIsLiteral(t *testing.T) {
|
|
||||||
// Direct guard on the inline scanner. {{project.case_number}} must
|
|
||||||
// emit as a single non-italic span containing the full placeholder.
|
|
||||||
spans := parseInlineSpans("{{project.case_number}}")
|
|
||||||
if len(spans) != 1 {
|
|
||||||
t.Fatalf("expected 1 span; got %d (%+v)", len(spans), spans)
|
|
||||||
}
|
|
||||||
if spans[0].Italic || spans[0].Bold {
|
|
||||||
t.Errorf("placeholder must not be italic/bold; got %+v", spans[0])
|
|
||||||
}
|
|
||||||
if spans[0].Text != "{{project.case_number}}" {
|
|
||||||
t.Errorf("placeholder text corrupted: got %q", spans[0].Text)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestParseInlineSpans_ItalicAroundPlaceholder(t *testing.T) {
|
|
||||||
// Italic delimiters outside a placeholder still work; the placeholder
|
|
||||||
// itself stays literal even when it sits between italics.
|
|
||||||
spans := parseInlineSpans("_before_ {{x.y_z}} _after_")
|
|
||||||
var saw struct {
|
|
||||||
italicBefore bool
|
|
||||||
placeholder bool
|
|
||||||
italicAfter bool
|
|
||||||
}
|
|
||||||
for _, s := range spans {
|
|
||||||
if s.Italic && s.Text == "before" {
|
|
||||||
saw.italicBefore = true
|
|
||||||
}
|
|
||||||
if !s.Italic && !s.Bold && strings.Contains(s.Text, "{{x.y_z}}") {
|
|
||||||
saw.placeholder = true
|
|
||||||
}
|
|
||||||
if s.Italic && s.Text == "after" {
|
|
||||||
saw.italicAfter = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if !saw.italicBefore || !saw.placeholder || !saw.italicAfter {
|
|
||||||
t.Errorf("expected italic/placeholder/italic structure; got %+v", spans)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// extractPlaceholders pulls every {{...}} occurrence out of a Markdown
|
// extractPlaceholders pulls every {{...}} occurrence out of a Markdown
|
||||||
// source. Tiny helper, only used by the regression test above.
|
// source. Tiny helper, only used by the regression test above.
|
||||||
func extractPlaceholders(s string) []string {
|
func extractPlaceholders(s string) []string {
|
||||||
@@ -196,39 +156,6 @@ func TestRenderMarkdownToOOXML_CRLFNormalisation(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestParseInlineSpans_Plain(t *testing.T) {
|
|
||||||
spans := parseInlineSpans("hello world")
|
|
||||||
if len(spans) != 1 || spans[0].Bold || spans[0].Italic || spans[0].Text != "hello world" {
|
|
||||||
t.Errorf("expected single plain span; got %+v", spans)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestParseInlineSpans_UnderscoreItalic(t *testing.T) {
|
|
||||||
spans := parseInlineSpans("_emph_")
|
|
||||||
var italicHits int
|
|
||||||
for _, s := range spans {
|
|
||||||
if s.Italic && s.Text == "emph" {
|
|
||||||
italicHits++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if italicHits != 1 {
|
|
||||||
t.Errorf("expected one italic 'emph' span; got %+v", spans)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestParseInlineSpans_UnderscoreBold(t *testing.T) {
|
|
||||||
spans := parseInlineSpans("__strong__")
|
|
||||||
var boldHits int
|
|
||||||
for _, s := range spans {
|
|
||||||
if s.Bold && s.Text == "strong" {
|
|
||||||
boldHits++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if boldHits != 1 {
|
|
||||||
t.Errorf("expected one bold 'strong' span; got %+v", spans)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────
|
||||||
// Slice D — rich-prose constructs
|
// Slice D — rich-prose constructs
|
||||||
// ─────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────
|
||||||
@@ -349,35 +276,3 @@ func TestRenderMarkdownToOOXML_HyperlinkNilAllocatorFallsBackToPlain(t *testing.
|
|||||||
t.Errorf("hyperlink emitted without allocator: %q", out)
|
t.Errorf("hyperlink emitted without allocator: %q", out)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestDetectBlockMarker(t *testing.T) {
|
|
||||||
cases := []struct {
|
|
||||||
in string
|
|
||||||
kind string
|
|
||||||
want string
|
|
||||||
ok bool
|
|
||||||
}{
|
|
||||||
{"# A", "heading_1", "A", true},
|
|
||||||
{"## B", "heading_2", "B", true},
|
|
||||||
{"### C", "heading_3", "C", true},
|
|
||||||
{" # indented", "heading_1", "indented", true}, // up to 3 spaces tolerated
|
|
||||||
{" # too-deep", "", "", false}, // 4 spaces → not a heading
|
|
||||||
{"- bullet", "list_bullet", "bullet", true},
|
|
||||||
{"* star", "list_bullet", "star", true},
|
|
||||||
{"1. one", "list_numbered", "one", true},
|
|
||||||
{"42. forty-two", "list_numbered", "forty-two", true},
|
|
||||||
{"1) paren", "list_numbered", "paren", true},
|
|
||||||
{"1.no-space", "", "", false}, // ordinal needs trailing space
|
|
||||||
{"> quote", "blockquote", "quote", true},
|
|
||||||
{"plain", "", "", false},
|
|
||||||
{"#nospace", "", "", false}, // heading needs space after hash
|
|
||||||
}
|
|
||||||
for _, tc := range cases {
|
|
||||||
t.Run(tc.in, func(t *testing.T) {
|
|
||||||
kind, payload, ok := detectBlockMarker(tc.in)
|
|
||||||
if ok != tc.ok || kind != tc.kind || payload != tc.want {
|
|
||||||
t.Errorf("detectBlockMarker(%q) = (%q,%q,%v); want (%q,%q,%v)", tc.in, kind, payload, ok, tc.kind, tc.want, tc.ok)
|
|
||||||
}
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|||||||
22
pkg/docforge/exporter.go
Normal file
22
pkg/docforge/exporter.go
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
package docforge
|
||||||
|
|
||||||
|
// Exporter renders a neutral Document into a target format's body markup.
|
||||||
|
// docforge owns the interface; each format adapter implements it (the
|
||||||
|
// .docx adapter in pkg/docforge/docx today; .pdf/.html/.md are future
|
||||||
|
// siblings — PRD §4 B4: interface now, docx-only impl). Format-specific
|
||||||
|
// configuration (a stylemap, a hyperlink allocator for .docx) is baked into
|
||||||
|
// the concrete exporter at construction, so the interface stays
|
||||||
|
// format-neutral.
|
||||||
|
//
|
||||||
|
// "Body markup" is the renderable content fragment, not a complete file —
|
||||||
|
// for .docx it is the OOXML <w:p> run the composer splices into a carrier.
|
||||||
|
// Container concerns (MIME type, packaging) are described by Format /
|
||||||
|
// MIMEType and handled by the assembling layer.
|
||||||
|
type Exporter interface {
|
||||||
|
// Format is the short format id, e.g. "docx".
|
||||||
|
Format() string
|
||||||
|
// MIMEType is the container MIME type the assembled document carries.
|
||||||
|
MIMEType() string
|
||||||
|
// RenderBody renders the document to the format's body markup.
|
||||||
|
RenderBody(doc Document) ([]byte, error)
|
||||||
|
}
|
||||||
230
pkg/docforge/markdown/importer.go
Normal file
230
pkg/docforge/markdown/importer.go
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
// Package markdown imports Markdown source into the neutral
// docforge.Document model (PRD §3.2 / §4 P4 — Markdown is the primary
// input format). It is the single Markdown parser for docforge: the .docx
// renderer consumes the Document this produces, so block-splitting and
// inline tokenisation live here, not in the format adapter.
//
// Grammar (intentionally narrow — unrecognised syntax flows through as a
// plain paragraph, so lawyer prose never errors):
//
//	blank line            → paragraph break
//	# / ## / ### Heading  → heading_1 / 2 / 3
//	- item / * item       → bullet list item
//	N. item / N) item     → numbered list item
//	> quote               → blockquote
//	**x** / __x__         → bold
//	*x* / _x_             → italic
//	[label](url)          → hyperlink
//	{{key}}               → preserved verbatim (substituted downstream)
|
||||||
|
package markdown
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"mgit.msbls.de/m/paliad/pkg/docforge"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Import parses Markdown into a Document. Empty (or all-blank) input yields
|
||||||
|
// a single empty paragraph so a splice site stays well-formed.
|
||||||
|
func Import(md string) docforge.Document {
|
||||||
|
blocks := splitBlocks(md)
|
||||||
|
if len(blocks) == 0 {
|
||||||
|
return docforge.Document{Blocks: []docforge.Block{{Kind: docforge.KindParagraph}}}
|
||||||
|
}
|
||||||
|
out := make([]docforge.Block, 0, len(blocks))
|
||||||
|
for _, blk := range blocks {
|
||||||
|
b := docforge.Block{Kind: docforge.BlockKind(blk.kind)}
|
||||||
|
// An empty-text block is an intentional empty paragraph: leave
|
||||||
|
// Spans nil so the exporter emits a single empty run.
|
||||||
|
if blk.text != "" {
|
||||||
|
b.Spans = parseInline(blk.text)
|
||||||
|
}
|
||||||
|
out = append(out, b)
|
||||||
|
}
|
||||||
|
return docforge.Document{Blocks: out}
|
||||||
|
}
|
||||||
|
|
||||||
|
// rawBlock is the intermediate (kind, stripped-text) form before inline
// parsing. kind values match docforge.BlockKind string values.
type rawBlock struct {
	kind string // block kind, e.g. "paragraph" or "heading_1"
	text string // block text with any block marker already removed
}
|
||||||
|
|
||||||
|
// splitBlocks parses the source into a sequence of (kind, text) blocks,
|
||||||
|
// detecting heading / list / blockquote prefixes line-by-line. A run of
|
||||||
|
// unmarked lines collapses into one paragraph block (soft line breaks
|
||||||
|
// inside a paragraph concatenate); each marked line is its own block.
|
||||||
|
// Blank-run spacing emits extra empty paragraph blocks. CRLF normalised.
|
||||||
|
func splitBlocks(md string) []rawBlock {
|
||||||
|
normalised := strings.ReplaceAll(md, "\r\n", "\n")
|
||||||
|
lines := strings.Split(normalised, "\n")
|
||||||
|
var blocks []rawBlock
|
||||||
|
var pendingPara []string
|
||||||
|
blankRun := 0
|
||||||
|
|
||||||
|
flushPara := func() {
|
||||||
|
if len(pendingPara) > 0 {
|
||||||
|
blocks = append(blocks, rawBlock{kind: "paragraph", text: strings.Join(pendingPara, "\n")})
|
||||||
|
pendingPara = nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, line := range lines {
|
||||||
|
if strings.TrimSpace(line) == "" {
|
||||||
|
if len(pendingPara) > 0 {
|
||||||
|
flushPara()
|
||||||
|
blankRun = 1
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
blankRun++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if kind, payload, ok := detectBlockMarker(line); ok {
|
||||||
|
flushPara()
|
||||||
|
for i := 1; i < blankRun; i++ {
|
||||||
|
blocks = append(blocks, rawBlock{kind: "paragraph", text: ""})
|
||||||
|
}
|
||||||
|
blankRun = 0
|
||||||
|
blocks = append(blocks, rawBlock{kind: kind, text: payload})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if len(pendingPara) == 0 {
|
||||||
|
for i := 1; i < blankRun; i++ {
|
||||||
|
blocks = append(blocks, rawBlock{kind: "paragraph", text: ""})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
blankRun = 0
|
||||||
|
pendingPara = append(pendingPara, line)
|
||||||
|
}
|
||||||
|
flushPara()
|
||||||
|
return blocks
|
||||||
|
}
|
||||||
|
|
||||||
|
// detectBlockMarker classifies a single line. Tolerates up to 3 leading
|
||||||
|
// spaces (CommonMark) before treating the line as a plain paragraph.
|
||||||
|
func detectBlockMarker(line string) (kind, payload string, ok bool) {
|
||||||
|
trimmed := strings.TrimLeft(line, " ")
|
||||||
|
if len(line)-len(trimmed) > 3 {
|
||||||
|
return "", "", false
|
||||||
|
}
|
||||||
|
switch {
|
||||||
|
case strings.HasPrefix(trimmed, "### "):
|
||||||
|
return "heading_3", strings.TrimSpace(trimmed[4:]), true
|
||||||
|
case strings.HasPrefix(trimmed, "## "):
|
||||||
|
return "heading_2", strings.TrimSpace(trimmed[3:]), true
|
||||||
|
case strings.HasPrefix(trimmed, "# "):
|
||||||
|
return "heading_1", strings.TrimSpace(trimmed[2:]), true
|
||||||
|
case strings.HasPrefix(trimmed, "> "):
|
||||||
|
return "blockquote", strings.TrimSpace(trimmed[2:]), true
|
||||||
|
case strings.HasPrefix(trimmed, "- "), strings.HasPrefix(trimmed, "* "):
|
||||||
|
return "list_bullet", strings.TrimSpace(trimmed[2:]), true
|
||||||
|
}
|
||||||
|
if i := indexOfNumberedMarker(trimmed); i > 0 {
|
||||||
|
return "list_numbered", strings.TrimSpace(trimmed[i:]), true
|
||||||
|
}
|
||||||
|
return "", "", false
|
||||||
|
}
|
||||||
|
|
||||||
|
// indexOfNumberedMarker returns the byte index just past an "N. " / "N) "
// marker at the start of s, or -1 when absent.
//
// A marker is one or more ASCII digits, then '.' or ')', then a single
// space. The returned index points at the first byte after that space.
func indexOfNumberedMarker(s string) int {
	i := 0
	for i < len(s) && s[i] >= '0' && s[i] <= '9' {
		i++
	}
	// No leading digits, or nothing after the digits.
	if i == 0 || i >= len(s) {
		return -1
	}
	if s[i] != '.' && s[i] != ')' {
		return -1
	}
	// The delimiter must be followed by a space.
	if i+1 >= len(s) || s[i+1] != ' ' {
		return -1
	}
	return i + 2
}
|
||||||
|
|
||||||
|
// parseInline splits text around [label](url) hyperlinks and tokenises the
|
||||||
|
// rest into bold/italic spans. Hyperlinks become a span with Link set and
|
||||||
|
// the label's spans as Children, preserving link boundaries.
|
||||||
|
func parseInline(text string) []docforge.InlineSpan {
|
||||||
|
var out []docforge.InlineSpan
|
||||||
|
rest := text
|
||||||
|
for {
|
||||||
|
idx := strings.Index(rest, "[")
|
||||||
|
if idx < 0 {
|
||||||
|
if rest != "" {
|
||||||
|
out = append(out, parseSpans(rest)...)
|
||||||
|
}
|
||||||
|
break
|
||||||
|
}
|
||||||
|
closeBracket := strings.Index(rest[idx:], "](")
|
||||||
|
if closeBracket < 0 {
|
||||||
|
out = append(out, parseSpans(rest)...)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
closeParen := strings.Index(rest[idx+closeBracket:], ")")
|
||||||
|
if closeParen < 0 {
|
||||||
|
out = append(out, parseSpans(rest)...)
|
||||||
|
break
|
||||||
|
}
|
||||||
|
label := rest[idx+1 : idx+closeBracket]
|
||||||
|
url := rest[idx+closeBracket+2 : idx+closeBracket+closeParen]
|
||||||
|
if idx > 0 {
|
||||||
|
out = append(out, parseSpans(rest[:idx])...)
|
||||||
|
}
|
||||||
|
out = append(out, docforge.InlineSpan{Link: url, Children: parseSpans(label)})
|
||||||
|
rest = rest[idx+closeBracket+closeParen+1:]
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
// parseSpans tokenises Markdown inline bold/italic into spans, preserving
|
||||||
|
// {{...}} placeholders verbatim (the b78a984 fix — underscores in a
|
||||||
|
// placeholder key must not be read as italic delimiters). Empty input
|
||||||
|
// yields one empty span.
|
||||||
|
func parseSpans(text string) []docforge.InlineSpan {
|
||||||
|
var out []docforge.InlineSpan
|
||||||
|
var cur strings.Builder
|
||||||
|
bold := false
|
||||||
|
italic := false
|
||||||
|
flush := func() {
|
||||||
|
if cur.Len() == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
out = append(out, docforge.InlineSpan{Text: cur.String(), Bold: bold, Italic: italic})
|
||||||
|
cur.Reset()
|
||||||
|
}
|
||||||
|
i := 0
|
||||||
|
n := len(text)
|
||||||
|
for i < n {
|
||||||
|
if i+1 < n && text[i] == '{' && text[i+1] == '{' {
|
||||||
|
if rel := strings.Index(text[i+2:], "}}"); rel >= 0 {
|
||||||
|
end := i + 2 + rel + 2
|
||||||
|
cur.WriteString(text[i:end])
|
||||||
|
i = end
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if i+1 < n && (text[i:i+2] == "**" || text[i:i+2] == "__") {
|
||||||
|
flush()
|
||||||
|
bold = !bold
|
||||||
|
i += 2
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if text[i] == '*' || text[i] == '_' {
|
||||||
|
flush()
|
||||||
|
italic = !italic
|
||||||
|
i++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
cur.WriteByte(text[i])
|
||||||
|
i++
|
||||||
|
}
|
||||||
|
flush()
|
||||||
|
if len(out) == 0 {
|
||||||
|
out = append(out, docforge.InlineSpan{Text: ""})
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
145
pkg/docforge/markdown/importer_test.go
Normal file
145
pkg/docforge/markdown/importer_test.go
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
package markdown
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Inline-span + block-marker tests, relocated from the docx walker when
|
||||||
|
// parsing moved here (t-paliad-349 slice 8). parseSpans is the inline
|
||||||
|
// tokeniser; detectBlockMarker classifies a line.
|
||||||
|
|
||||||
|
func TestParseSpans_PlaceholderWithUnderscoresIsLiteral(t *testing.T) {
|
||||||
|
// {{project.case_number}} must emit as a single non-italic span
|
||||||
|
// containing the full placeholder (the b78a984 fix).
|
||||||
|
spans := parseSpans("{{project.case_number}}")
|
||||||
|
if len(spans) != 1 {
|
||||||
|
t.Fatalf("expected 1 span; got %d (%+v)", len(spans), spans)
|
||||||
|
}
|
||||||
|
if spans[0].Italic || spans[0].Bold {
|
||||||
|
t.Errorf("placeholder must not be italic/bold; got %+v", spans[0])
|
||||||
|
}
|
||||||
|
if spans[0].Text != "{{project.case_number}}" {
|
||||||
|
t.Errorf("placeholder text corrupted: got %q", spans[0].Text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSpans_ItalicAroundPlaceholder(t *testing.T) {
|
||||||
|
spans := parseSpans("_before_ {{x.y_z}} _after_")
|
||||||
|
var saw struct {
|
||||||
|
italicBefore bool
|
||||||
|
placeholder bool
|
||||||
|
italicAfter bool
|
||||||
|
}
|
||||||
|
for _, s := range spans {
|
||||||
|
if s.Italic && s.Text == "before" {
|
||||||
|
saw.italicBefore = true
|
||||||
|
}
|
||||||
|
if !s.Italic && !s.Bold && strings.Contains(s.Text, "{{x.y_z}}") {
|
||||||
|
saw.placeholder = true
|
||||||
|
}
|
||||||
|
if s.Italic && s.Text == "after" {
|
||||||
|
saw.italicAfter = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !saw.italicBefore || !saw.placeholder || !saw.italicAfter {
|
||||||
|
t.Errorf("expected italic/placeholder/italic structure; got %+v", spans)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSpans_Plain(t *testing.T) {
|
||||||
|
spans := parseSpans("hello world")
|
||||||
|
if len(spans) != 1 || spans[0].Bold || spans[0].Italic || spans[0].Text != "hello world" {
|
||||||
|
t.Errorf("expected single plain span; got %+v", spans)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSpans_UnderscoreItalic(t *testing.T) {
|
||||||
|
spans := parseSpans("_emph_")
|
||||||
|
var italicHits int
|
||||||
|
for _, s := range spans {
|
||||||
|
if s.Italic && s.Text == "emph" {
|
||||||
|
italicHits++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if italicHits != 1 {
|
||||||
|
t.Errorf("expected one italic 'emph' span; got %+v", spans)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseSpans_UnderscoreBold(t *testing.T) {
|
||||||
|
spans := parseSpans("__strong__")
|
||||||
|
var boldHits int
|
||||||
|
for _, s := range spans {
|
||||||
|
if s.Bold && s.Text == "strong" {
|
||||||
|
boldHits++
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if boldHits != 1 {
|
||||||
|
t.Errorf("expected one bold 'strong' span; got %+v", spans)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestDetectBlockMarker(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
in string
|
||||||
|
kind string
|
||||||
|
want string
|
||||||
|
ok bool
|
||||||
|
}{
|
||||||
|
{"# A", "heading_1", "A", true},
|
||||||
|
{"## B", "heading_2", "B", true},
|
||||||
|
{"### C", "heading_3", "C", true},
|
||||||
|
{" # indented", "heading_1", "indented", true}, // up to 3 spaces tolerated
|
||||||
|
{" # too-deep", "", "", false}, // 4 spaces → not a heading
|
||||||
|
{"- bullet", "list_bullet", "bullet", true},
|
||||||
|
{"* star", "list_bullet", "star", true},
|
||||||
|
{"1. one", "list_numbered", "one", true},
|
||||||
|
{"42. forty-two", "list_numbered", "forty-two", true},
|
||||||
|
{"1) paren", "list_numbered", "paren", true},
|
||||||
|
{"1.no-space", "", "", false}, // ordinal needs trailing space
|
||||||
|
{"> quote", "blockquote", "quote", true},
|
||||||
|
{"plain", "", "", false},
|
||||||
|
{"#nospace", "", "", false}, // heading needs space after hash
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.in, func(t *testing.T) {
|
||||||
|
kind, payload, ok := detectBlockMarker(tc.in)
|
||||||
|
if ok != tc.ok || kind != tc.kind || payload != tc.want {
|
||||||
|
t.Errorf("detectBlockMarker(%q) = (%q,%q,%v); want (%q,%q,%v)", tc.in, kind, payload, ok, tc.kind, tc.want, tc.ok)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestImport_Document spot-checks the neutral Document the importer
|
||||||
|
// produces — block kinds, the link-span shape, and placeholder pass-through.
|
||||||
|
func TestImport_Document(t *testing.T) {
|
||||||
|
doc := Import("# Title\n\nBody **bold** and [label](http://x).\n\n- item")
|
||||||
|
if len(doc.Blocks) != 3 {
|
||||||
|
t.Fatalf("blocks = %d; want 3 (%+v)", len(doc.Blocks), doc.Blocks)
|
||||||
|
}
|
||||||
|
if doc.Blocks[0].Kind != "heading_1" {
|
||||||
|
t.Errorf("block0 kind = %q; want heading_1", doc.Blocks[0].Kind)
|
||||||
|
}
|
||||||
|
if doc.Blocks[2].Kind != "list_bullet" {
|
||||||
|
t.Errorf("block2 kind = %q; want list_bullet", doc.Blocks[2].Kind)
|
||||||
|
}
|
||||||
|
// The body paragraph carries a link span with Link set + children.
|
||||||
|
var sawLink bool
|
||||||
|
for _, s := range doc.Blocks[1].Spans {
|
||||||
|
if s.Link == "http://x" && len(s.Children) > 0 {
|
||||||
|
sawLink = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !sawLink {
|
||||||
|
t.Errorf("body block missing link span; got %+v", doc.Blocks[1].Spans)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImport_EmptyYieldsOneEmptyParagraph(t *testing.T) {
|
||||||
|
doc := Import("")
|
||||||
|
if len(doc.Blocks) != 1 || doc.Blocks[0].Kind != "paragraph" || len(doc.Blocks[0].Spans) != 0 {
|
||||||
|
t.Errorf("empty import = %+v; want one empty paragraph block", doc.Blocks)
|
||||||
|
}
|
||||||
|
}
|
||||||
58
pkg/docforge/model.go
Normal file
58
pkg/docforge/model.go
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
package docforge
|
||||||
|
|
||||||
|
// The neutral document model — the format-independent representation an
|
||||||
|
// importer produces and an exporter consumes (PRD §3.2). A Markdown
|
||||||
|
// importer parses source into a Document; the .docx exporter renders a
|
||||||
|
// Document into OOXML; a future PDF/HTML exporter renders the same
|
||||||
|
// Document differently. The model carries editable content only —
|
||||||
|
// placeholders ({{key}}) ride through as literal span text and are
|
||||||
|
// substituted later by the format exporter's merge pass, exactly as in
|
||||||
|
// the pre-model pipeline.
|
||||||
|
//
|
||||||
|
// Slice 8 (t-paliad-349) lands this model with two real consumers: the
|
||||||
|
// Markdown importer (pkg/docforge/markdown) and the .docx renderer
|
||||||
|
// (pkg/docforge/docx), which the shipped submission walker now routes
|
||||||
|
// through — so there is one parser, not two.
|
||||||
|
|
||||||
|
// BlockKind is the logical kind of a block. Its string values are the
// stylemap keys a format exporter looks up (paragraph, heading_1, …), so
// the docx exporter maps Kind → Word paragraph style directly.
type BlockKind string

// The block kinds the model defines.
const (
	KindParagraph    BlockKind = "paragraph"
	KindHeading1     BlockKind = "heading_1"
	KindHeading2     BlockKind = "heading_2"
	KindHeading3     BlockKind = "heading_3"
	KindListBullet   BlockKind = "list_bullet"
	KindListNumbered BlockKind = "list_numbered"
	KindBlockquote   BlockKind = "blockquote"
)
|
||||||
|
|
||||||
|
// Document is a sequence of blocks — the whole editable content.
|
||||||
|
type Document struct {
|
||||||
|
Blocks []Block
|
||||||
|
}
|
||||||
|
|
||||||
|
// Block is one paragraph-level unit. Spans is its inline content; an empty
|
||||||
|
// Spans slice is an intentional empty paragraph (vertical spacing).
|
||||||
|
type Block struct {
|
||||||
|
Kind BlockKind
|
||||||
|
Spans []InlineSpan
|
||||||
|
}
|
||||||
|
|
||||||
|
// InlineSpan is one run of inline content. A span is either:
//   - literal text with optional bold/italic (Link == "", Children nil), or
//   - a hyperlink (Link != "") whose label is the Children spans.
//
// Modelling a link as a span with Children (rather than a per-span Link
// flag) preserves link boundaries: two adjacent links to the same URL stay
// two distinct hyperlink spans, so the exporter emits them byte-identically
// to the pre-model walker.
type InlineSpan struct {
	Text     string       // literal text of the run
	Bold     bool         // run is bold
	Italic   bool         // run is italic
	Link     string       // non-empty → this span is a hyperlink to Link
	Children []InlineSpan // hyperlink label content (only when Link != "")
}
|
||||||
Reference in New Issue
Block a user