Files
paliad/pkg/docforge/markdown/importer.go
mAi 8763ab013c feat(docforge): slice 8 — neutral model + Markdown importer + Exporter iface (t-paliad-349)
The final slice: land the format-neutral document model with REAL consumers
and unify the Markdown parser — no duplication, byte-identical output.

Neutral model (pkg/docforge/model.go): Document / Block / InlineSpan.
BlockKind values are the stylemap keys. A hyperlink is a span with Link set
+ Children (the label's spans), preserving link boundaries so adjacent
same-URL links stay distinct — byte-exact with the pre-model walker.

Markdown importer (pkg/docforge/markdown): Import(md) → Document. The SINGLE
Markdown parser for docforge — block split, marker detection, inline
bold/italic/link tokenisation, {{placeholder}} pass-through (the b78a984
fix). Relocated out of the docx walker.

docx renderer (pkg/docforge/docx/markdown.go): now RENDERS a Document →
OOXML (RenderDocumentToOOXML); RenderMarkdownToOOXML[WithStyles] = render(
markdown.Import(md)). The shipped submission walker routes through the model,
so there is one parser, not two. The comprehensive byte-exact render tests
(RenderMarkdownToOOXML_*) all PASS unchanged = output identical.

Exporter interface (pkg/docforge/exporter.go, PRD §4 B4): Exporter{Format,
MIMEType, RenderBody(Document)} with the .docx impl (pkg/docforge/docx/
exporter.go). The seam a future PDF/HTML exporter slots into.

Tests: parser tests relocated to the markdown pkg (parseSpans/detectBlockMarker)
+ new importer Document tests + exporter conformance test.

Verification: go build/vet clean; gofmt clean; full NO-DB test suite GREEN
(authoritative — proves no regression); docforge byte-exact render oracle
PASS; composer live test renders through the rewired walker (PASS); bun build
+ bun test 274/274. The shared-DB live run fails ~85 tests across unrelated
services from a harness pq-42P08 $1-type seeding quirk + a stale
deadline_rules test — systemic/environmental (the no-DB run is clean), not
this change.

docforge train complete: 8 slices, the engine extracted + cleaned + a working
author→generate→export loop on uploaded templates, plus the neutral model +
importer + exporter seam for future formats/consumers.

m/paliad#157
2026-05-29 18:10:16 +02:00

231 lines
6.7 KiB
Go

// Package markdown imports Markdown source into the neutral
// docforge.Document model (PRD §3.2 / §4 P4 — Markdown is the primary
// input format). It is the single Markdown parser for docforge: the .docx
// renderer consumes the Document this produces, so block-splitting and
// inline tokenisation live here, not in the format adapter.
//
// Grammar (intentionally narrow — unrecognised syntax flows through as a
// plain paragraph, so lawyer prose never errors):
//
//	blank line              → paragraph break
//	# / ## / ### Heading    → heading_1 / 2 / 3
//	- item / * item         → bullet list item
//	N. item / N) item       → numbered list item
//	> quote                 → blockquote
//	**x** / __x__           → bold
//	*x* / _x_               → italic
//	[label](url)            → hyperlink
//	{{key}}                 → preserved verbatim (substituted downstream)
package markdown
import (
"strings"
"mgit.msbls.de/m/paliad/pkg/docforge"
)
// Import converts Markdown source into a docforge.Document.
//
// The source is first split into raw blocks; each block's text is then
// tokenised into inline spans. When the split yields no blocks at all (empty
// or all-blank input) the result is a Document holding one empty paragraph,
// so a splice site stays well-formed.
func Import(md string) docforge.Document {
	raw := splitBlocks(md)
	if len(raw) == 0 {
		return docforge.Document{Blocks: []docforge.Block{{Kind: docforge.KindParagraph}}}
	}

	doc := docforge.Document{Blocks: make([]docforge.Block, len(raw))}
	for i, rb := range raw {
		doc.Blocks[i].Kind = docforge.BlockKind(rb.kind)
		if rb.text == "" {
			// Intentional empty paragraph: Spans stays nil so the exporter
			// emits a single empty run.
			continue
		}
		doc.Blocks[i].Spans = parseInline(rb.text)
	}
	return doc
}
// rawBlock is what block splitting produces before any inline parsing has
// happened: a block kind plus the line text with its marker stripped.
type rawBlock struct {
	kind string // same string values as docforge.BlockKind, e.g. "paragraph"
	text string // marker-stripped text; "" denotes an empty paragraph
}
// splitBlocks turns Markdown source into an ordered list of raw blocks.
//
// Lines are examined one at a time after CRLF has been normalised to LF:
//   - a line carrying a heading / list / blockquote marker becomes a block
//     of its own;
//   - consecutive unmarked lines are gathered into one paragraph block,
//     joined with "\n";
//   - a whitespace-only line ends the paragraph being gathered. Within a
//     run of blank lines, every blank beyond the first yields one empty
//     paragraph block, emitted just before the next content block.
//
// Blank lines at the very end of the input produce nothing.
func splitBlocks(md string) []rawBlock {
	var (
		result []rawBlock
		para   []string // unmarked lines of the paragraph being gathered
		blanks int      // length of the current blank-line run
	)

	emitPara := func() {
		if len(para) == 0 {
			return
		}
		result = append(result, rawBlock{kind: "paragraph", text: strings.Join(para, "\n")})
		para = nil
	}
	// emitSpacers converts the surplus of the current blank run (everything
	// past the first blank line) into empty paragraph blocks.
	emitSpacers := func() {
		for extra := blanks - 1; extra > 0; extra-- {
			result = append(result, rawBlock{kind: "paragraph", text: ""})
		}
	}

	for _, ln := range strings.Split(strings.ReplaceAll(md, "\r\n", "\n"), "\n") {
		if strings.TrimSpace(ln) == "" {
			if len(para) > 0 {
				// First blank after paragraph text: close the paragraph and
				// start a fresh run.
				emitPara()
				blanks = 1
			} else {
				blanks++
			}
			continue
		}

		kind, payload, marked := detectBlockMarker(ln)
		if marked {
			emitPara()
			emitSpacers()
			blanks = 0
			result = append(result, rawBlock{kind: kind, text: payload})
			continue
		}

		// Unmarked line: spacers are only due when it opens a new paragraph.
		if len(para) == 0 {
			emitSpacers()
		}
		blanks = 0
		para = append(para, ln)
	}

	emitPara()
	return result
}
// detectBlockMarker reports whether line opens a marked block and, if so,
// which kind it is and what text follows the marker.
//
// Up to three leading spaces are tolerated; with four or more the line is
// not treated as marked. Only space characters count towards that indent.
// The returned payload has surrounding whitespace trimmed. ok is false, and
// kind and payload are empty, when no marker is recognised.
func detectBlockMarker(line string) (kind, payload string, ok bool) {
	body := strings.TrimLeft(line, " ")
	if indent := len(line) - len(body); indent > 3 {
		return "", "", false
	}

	// Fixed-prefix markers. Each prefix ends in a space, so "#"/"-"/">"
	// directly followed by text is not a marker.
	markers := [...]struct{ prefix, kind string }{
		{"### ", "heading_3"},
		{"## ", "heading_2"},
		{"# ", "heading_1"},
		{"> ", "blockquote"},
		{"- ", "list_bullet"},
		{"* ", "list_bullet"},
	}
	for _, m := range markers {
		if strings.HasPrefix(body, m.prefix) {
			return m.kind, strings.TrimSpace(body[len(m.prefix):]), true
		}
	}

	// "N. " / "N) " numbered-list marker of variable width.
	if end := indexOfNumberedMarker(body); end > 0 {
		return "list_numbered", strings.TrimSpace(body[end:]), true
	}
	return "", "", false
}
// indexOfNumberedMarker looks for a numbered-list marker at the very start
// of s: one or more ASCII digits, then '.' or ')', then a single space.
// It returns the byte offset of the first character after that space, or -1
// when s does not begin with such a marker.
func indexOfNumberedMarker(s string) int {
	// Length of the leading digit run.
	digits := len(s) - len(strings.TrimLeft(s, "0123456789"))

	// Need at least one digit plus room for the delimiter and the space.
	if digits == 0 || len(s) < digits+2 {
		return -1
	}
	switch s[digits] {
	case '.', ')':
	default:
		return -1
	}
	if s[digits+1] != ' ' {
		return -1
	}
	return digits + 2
}
// parseInline splits text around [label](url) hyperlinks and tokenises the
// remaining text into bold/italic spans via parseSpans. Each hyperlink
// becomes one span with Link set to the url and the label's spans as
// Children, so link boundaries are preserved.
//
// The opening bracket of a link is the "[" that balances the "]" of the
// "](" separator, not simply the first "[" in the text. Bracketed prose that
// precedes a link (e.g. "see [1] and [docs](url)") therefore stays literal
// text instead of being swallowed into the link label. A "](" with no
// balancing "[" is left as literal text.
//
// Empty text yields a nil slice.
func parseInline(text string) []docforge.InlineSpan {
	var out []docforge.InlineSpan
	rest := text
	for rest != "" {
		open, mid, end, ok := findInlineLink(rest)
		if !ok {
			// No further link: everything left is plain inline text.
			out = append(out, parseSpans(rest)...)
			break
		}
		if open > 0 {
			out = append(out, parseSpans(rest[:open])...)
		}
		out = append(out, docforge.InlineSpan{
			Link:     rest[mid+2 : end],
			Children: parseSpans(rest[open+1 : mid]),
		})
		rest = rest[end+1:]
	}
	return out
}

// findInlineLink locates the first [label](url) link in s. On success it
// returns the byte offsets of the opening "[", of the "](" separator and of
// the closing ")"; ok is false when s contains no link.
func findInlineLink(s string) (open, mid, end int, ok bool) {
	for from := 0; from < len(s); {
		rel := strings.Index(s[from:], "](")
		if rel < 0 {
			return 0, 0, 0, false
		}
		mid = from + rel

		// The url runs up to the first ")" after the separator. If there is
		// none here, no later separator can have one either.
		relEnd := strings.IndexByte(s[mid+2:], ')')
		if relEnd < 0 {
			return 0, 0, 0, false
		}

		// Walk left from the separator to the "[" that balances its "]",
		// stepping over balanced [..] pairs inside the label.
		depth := 0
		for i := mid - 1; i >= 0; i-- {
			switch s[i] {
			case ']':
				depth++
			case '[':
				if depth == 0 {
					return i, mid, mid + 2 + relEnd, true
				}
				depth--
			}
		}

		// This "](" has no opening bracket; keep looking further right.
		from = mid + 2
	}
	return 0, 0, 0, false
}
// parseSpans tokenises inline bold/italic Markdown into a flat list of
// spans.
//
// "**" and "__" toggle bold; a single "*" or "_" toggles italic. The
// delimiters themselves are consumed and never appear in span text. A
// complete {{...}} placeholder is copied through verbatim, so underscores in
// a placeholder key are not read as italic delimiters (the b78a984 fix); an
// unterminated "{{" is treated as ordinary text. Spans with no text are not
// emitted, except that input producing no spans at all (including the empty
// string) yields exactly one empty span.
func parseSpans(text string) []docforge.InlineSpan {
	var (
		spans  []docforge.InlineSpan
		buf    strings.Builder
		strong bool
		emph   bool
	)

	// emit closes the span being accumulated, stamping it with the styles
	// active at this moment.
	emit := func() {
		if buf.Len() > 0 {
			spans = append(spans, docforge.InlineSpan{Text: buf.String(), Bold: strong, Italic: emph})
			buf.Reset()
		}
	}

	for pos := 0; pos < len(text); {
		tail := text[pos:]

		// Placeholder pass-through: only when the closing "}}" exists.
		if strings.HasPrefix(tail, "{{") {
			if closeAt := strings.Index(tail[2:], "}}"); closeAt >= 0 {
				width := 2 + closeAt + 2
				buf.WriteString(tail[:width])
				pos += width
				continue
			}
		}

		switch {
		case strings.HasPrefix(tail, "**"), strings.HasPrefix(tail, "__"):
			emit()
			strong = !strong
			pos += 2
		case tail[0] == '*', tail[0] == '_':
			emit()
			emph = !emph
			pos++
		default:
			buf.WriteByte(tail[0])
			pos++
		}
	}

	emit()
	if len(spans) == 0 {
		spans = append(spans, docforge.InlineSpan{Text: ""})
	}
	return spans
}