// paliad/pkg/docforge/markdown/importer.go

// Package markdown imports Markdown source into the neutral
// docforge.Document model (PRD §3.2 / §4 P4 — Markdown is the primary
// input format). It is the single Markdown parser for docforge: the .docx
// renderer consumes the Document this produces, so block-splitting and
// inline tokenisation live here, not in the format adapter.
//
// Grammar (intentionally narrow — unrecognised syntax flows through as a
// plain paragraph, so lawyer prose never errors):
//
//	blank line             → paragraph break
//	# / ## / ### Heading   → heading_1 / 2 / 3
//	- item  / * item       → bullet list item
//	N. item / N) item      → numbered list item
//	> quote                → blockquote
//	**x** / __x__          → bold
//	*x* / _x_              → italic
//	[label](url)           → hyperlink
//	{{key}}                → preserved verbatim (substituted downstream)
package markdown

import (
	"strings"

	"mgit.msbls.de/m/paliad/pkg/docforge"
)

// Import converts Markdown source into a docforge.Document.
//
// The source is first cut into raw blocks by splitBlocks; each raw block
// then becomes one docforge.Block whose Kind is the raw kind string and
// whose Spans come from parseInline. When splitBlocks yields nothing
// (empty or all-blank input) the result is a Document holding exactly one
// empty paragraph block, so the caller always gets at least one block.
func Import(md string) docforge.Document {
	raw := splitBlocks(md)
	if len(raw) == 0 {
		placeholder := docforge.Block{Kind: docforge.KindParagraph}
		return docforge.Document{Blocks: []docforge.Block{placeholder}}
	}

	converted := make([]docforge.Block, len(raw))
	for i := range raw {
		converted[i].Kind = docforge.BlockKind(raw[i].kind)
		// A block without text is a deliberate empty paragraph; its Spans
		// stay nil (the exporter is expected to render that as a single
		// empty run — behaviour defined outside this file).
		if raw[i].text == "" {
			continue
		}
		converted[i].Spans = parseInline(raw[i].text)
	}
	return docforge.Document{Blocks: converted}
}

// rawBlock is what splitBlocks hands to Import: a block whose structural
// marker has already been stripped but whose inline markup is still
// unparsed.
//
// kind holds the block kind as a plain string; Import converts it directly
// to docforge.BlockKind, so the values used here must be valid BlockKind
// strings. text is the block content without its marker prefix.
type rawBlock struct {
	kind, text string
}

// splitBlocks cuts Markdown source into raw (kind, text) blocks, working
// one line at a time after normalising CRLF to LF.
//
//   - A line recognised by detectBlockMarker (heading, list item,
//     blockquote) becomes a block of its own.
//   - Consecutive unmarked, non-blank lines are gathered into a single
//     "paragraph" block; the lines are joined with "\n".
//   - A blank line ends the paragraph being gathered. A run of k blank
//     lines before further content yields k-1 empty paragraph blocks, so
//     one blank line is a plain separator and each extra one adds vertical
//     space. Blank lines at the very end of the input yield nothing.
//
// The result is nil when the input has no non-blank line.
func splitBlocks(md string) []rawBlock {
	var (
		result  []rawBlock
		paraBuf []string // lines of the paragraph currently being gathered
		blanks  int      // blank lines seen since the last emitted content
	)

	// closePara emits the gathered paragraph lines, if any, as one block.
	closePara := func() {
		if len(paraBuf) == 0 {
			return
		}
		result = append(result, rawBlock{kind: "paragraph", text: strings.Join(paraBuf, "\n")})
		paraBuf = nil
	}

	// emitGap converts the pending blank run into empty paragraphs (one
	// fewer than the number of blank lines) and clears the counter. While
	// a paragraph is being gathered the counter is always zero, so calling
	// this mid-paragraph does nothing.
	emitGap := func() {
		for ; blanks > 1; blanks-- {
			result = append(result, rawBlock{kind: "paragraph", text: ""})
		}
		blanks = 0
	}

	for _, ln := range strings.Split(strings.ReplaceAll(md, "\r\n", "\n"), "\n") {
		if strings.TrimSpace(ln) == "" {
			closePara()
			blanks++
			continue
		}

		kind, payload, marked := detectBlockMarker(ln)
		if marked {
			closePara()
			emitGap()
			result = append(result, rawBlock{kind: kind, text: payload})
			continue
		}

		emitGap()
		paraBuf = append(paraBuf, ln)
	}

	closePara()
	return result
}

// detectBlockMarker decides whether a single line opens a marked block.
//
// Up to three leading spaces are skipped before looking for a marker; a
// line indented by four or more spaces is never a marker. Recognised
// markers, each of which must be followed by a space:
//
//	###  ##  #   → heading_3 / heading_2 / heading_1
//	>            → blockquote
//	-  *         → list_bullet
//	N.  N)       → list_numbered (N is one or more ASCII digits)
//
// On a match it returns the block kind, the rest of the line with
// surrounding whitespace trimmed, and ok=true. Otherwise it returns empty
// strings and ok=false, and the caller treats the line as paragraph text.
func detectBlockMarker(line string) (kind, payload string, ok bool) {
	// Four leading spaces means the indent exceeds the tolerated three.
	if strings.HasPrefix(line, "    ") {
		return "", "", false
	}
	body := strings.TrimLeft(line, " ")

	prefixes := [...]struct{ marker, kind string }{
		{"### ", "heading_3"},
		{"## ", "heading_2"},
		{"# ", "heading_1"},
		{"> ", "blockquote"},
		{"- ", "list_bullet"},
		{"* ", "list_bullet"},
	}
	for _, p := range prefixes {
		if strings.HasPrefix(body, p.marker) {
			return p.kind, strings.TrimSpace(body[len(p.marker):]), true
		}
	}

	if past := indexOfNumberedMarker(body); past > 0 {
		return "list_numbered", strings.TrimSpace(body[past:]), true
	}
	return "", "", false
}

// indexOfNumberedMarker looks for an ordered-list marker at the very start
// of s: one or more ASCII digits, then '.' or ')', then a single space.
// It returns the byte offset of the first character after that space, or
// -1 if s does not start with such a marker.
func indexOfNumberedMarker(s string) int {
	end := 0
	for end < len(s) {
		if c := s[end]; c < '0' || c > '9' {
			break
		}
		end++
	}

	// At least one digit is required, followed by two more bytes: the
	// delimiter and the space.
	if end == 0 || len(s)-end < 2 {
		return -1
	}
	switch s[end] {
	case '.', ')':
		// valid delimiter
	default:
		return -1
	}
	if s[end+1] != ' ' {
		return -1
	}
	return end + 2
}

// parseInline splits text around [label](url) hyperlinks and hands every
// other stretch to parseSpans for bold/italic tokenising. Each hyperlink
// becomes one span with Link set to the url and the label's own spans as
// Children, so link boundaries survive.
//
// A link is recognised at each "](" that is followed somewhere by ")". Its
// opening bracket is the "[" that balances the "]" of that "](", found by
// scanning backwards, so bracketed prose ahead of a link ("[1] see
// [label](url)") stays plain text and nested brackets inside a label
// ("[a [b] c](url)") stay in the label. A "](" with no balancing "[" is
// left as plain text. The url runs up to the first ")" after "](".
//
// Every stretch of text (before, inside and after a link) is tokenised by
// an independent parseSpans call, so emphasis opened in one stretch does
// not carry over into the next. Empty input returns nil.
func parseInline(text string) []docforge.InlineSpan {
	var out []docforge.InlineSpan
	rest := text // unconsumed tail of the input
	from := 0    // offset in rest where the search for the next "](" resumes
	for {
		rel := strings.Index(rest[from:], "](")
		if rel < 0 {
			break
		}
		sep := from + rel
		urlStart := sep + 2
		urlLen := strings.Index(rest[urlStart:], ")")
		if urlLen < 0 {
			// No ")" remains, so no later "](" can close a link either.
			break
		}
		open := matchingOpenBracket(rest, sep)
		if open < 0 {
			// Stray "](": keep it as text and look for the next candidate.
			from = urlStart
			continue
		}
		if open > 0 {
			out = append(out, parseSpans(rest[:open])...)
		}
		urlEnd := urlStart + urlLen
		out = append(out, docforge.InlineSpan{
			Link:     rest[urlStart:urlEnd],
			Children: parseSpans(rest[open+1 : sep]),
		})
		rest = rest[urlEnd+1:]
		from = 0
	}
	if rest != "" {
		out = append(out, parseSpans(rest)...)
	}
	return out
}

// matchingOpenBracket returns the index of the "[" that balances the "]"
// at s[closeIdx], scanning backwards and counting nested bracket pairs, or
// -1 when there is no such "[".
func matchingOpenBracket(s string, closeIdx int) int {
	depth := 0
	for i := closeIdx; i >= 0; i-- {
		switch s[i] {
		case ']':
			depth++
		case '[':
			depth--
			if depth == 0 {
				return i
			}
		}
	}
	return -1
}

// parseSpans turns a stretch of inline Markdown into styled spans.
//
// The text is scanned left to right with two flags, bold and italic:
//
//   - "{{" with a later "}}" is copied through verbatim, delimiters
//     included, so underscores or asterisks inside a placeholder key are
//     never read as emphasis. A "{{" with no closing "}}" is ordinary text.
//   - "**" or "__" ends the current span and flips the bold flag.
//   - a single "*" or "_" ends the current span and flips the italic flag.
//   - any other byte is appended to the current span.
//
// Delimiters are pure toggles: they are consumed even when never closed,
// and a span is emitted only if it holds text. If nothing is emitted at
// all (empty input, or delimiters only) the result is a single span with
// empty text.
func parseSpans(text string) []docforge.InlineSpan {
	var (
		spans    []docforge.InlineSpan
		buf      strings.Builder
		isBold   bool
		isItalic bool
	)

	// emit closes the span being built, tagging it with the current flags.
	emit := func() {
		if buf.Len() > 0 {
			spans = append(spans, docforge.InlineSpan{Text: buf.String(), Bold: isBold, Italic: isItalic})
			buf.Reset()
		}
	}

	for pos := 0; pos < len(text); {
		tail := text[pos:]

		// Placeholders take priority over emphasis delimiters.
		if strings.HasPrefix(tail, "{{") {
			if closing := strings.Index(tail[2:], "}}"); closing >= 0 {
				width := 2 + closing + 2
				buf.WriteString(tail[:width])
				pos += width
				continue
			}
		}

		switch {
		case strings.HasPrefix(tail, "**"), strings.HasPrefix(tail, "__"):
			emit()
			isBold = !isBold
			pos += 2
		case tail[0] == '*', tail[0] == '_':
			emit()
			isItalic = !isItalic
			pos++
		default:
			buf.WriteByte(tail[0])
			pos++
		}
	}

	emit()
	if len(spans) == 0 {
		spans = append(spans, docforge.InlineSpan{Text: ""})
	}
	return spans
}