The final slice: land the format-neutral document model with REAL consumers
and unify the Markdown parser — no duplication, byte-identical output.
Neutral model (pkg/docforge/model.go): Document / Block / InlineSpan.
BlockKind values are the stylemap keys. A hyperlink is a span with Link set
+ Children (the label's spans), preserving link boundaries so adjacent
same-URL links stay distinct — byte-exact with the pre-model walker.
Markdown importer (pkg/docforge/markdown): Import(md) → Document. The SINGLE
Markdown parser for docforge — block split, marker detection, inline
bold/italic/link tokenisation, {{placeholder}} pass-through (the b78a984
fix). Relocated out of the docx walker.
docx renderer (pkg/docforge/docx/markdown.go): now RENDERS a Document →
OOXML (RenderDocumentToOOXML); RenderMarkdownToOOXML[WithStyles] = render(
markdown.Import(md)). The shipped submission walker routes through the model,
so there is one parser, not two. The comprehensive byte-exact render tests
(RenderMarkdownToOOXML_*) all PASS unchanged = output identical.
Exporter interface (pkg/docforge/exporter.go, PRD §4 B4): Exporter{Format,
MIMEType, RenderBody(Document)} with the .docx impl (pkg/docforge/docx/
exporter.go). The seam a future PDF/HTML exporter slots into.
Tests: parser tests relocated to the markdown pkg (parseSpans/detectBlockMarker)
+ new importer Document tests + exporter conformance test.
Verification: go build/vet clean; gofmt clean; full NO-DB test suite GREEN
(authoritative — proves no regression); docforge byte-exact render oracle
PASS; composer live test renders through the rewired walker (PASS); bun build
+ bun test 274/274. The shared-DB live run fails ~85 tests across unrelated
services. The causes are a harness seeding quirk (pq error 42P08 concerning
the type of parameter $1) and a stale deadline_rules test. These failures are
systemic/environmental (the no-DB run is clean) and are not caused by this
change.
docforge train complete: 8 slices, the engine extracted + cleaned + a working
author→generate→export loop on uploaded templates, plus the neutral model +
importer + exporter seam for future formats/consumers.
m/paliad#157
279 lines
9.8 KiB
Go
279 lines
9.8 KiB
Go
package docx
|
|
|
|
// Unit tests for the Composer's Markdown → OOXML walker (t-paliad-313
|
|
// Slice B). Pure function; no DB dependency.
|
|
|
|
import (
|
|
"strings"
|
|
"testing"
|
|
)
|
|
|
|
func TestRenderMarkdownToOOXML_EmptyInput(t *testing.T) {
|
|
out := RenderMarkdownToOOXML("", "Normal")
|
|
if !strings.Contains(out, `<w:p>`) {
|
|
t.Errorf("empty input must still emit one <w:p>; got %q", out)
|
|
}
|
|
if !strings.Contains(out, `<w:pStyle w:val="Normal"/>`) {
|
|
t.Errorf("empty input must carry the paragraph style; got %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_SingleParagraph(t *testing.T) {
|
|
out := RenderMarkdownToOOXML("Hello world", "HLpat-Body-B0")
|
|
if !strings.Contains(out, `<w:pStyle w:val="HLpat-Body-B0"/>`) {
|
|
t.Errorf("paragraph missing stylemap entry: %q", out)
|
|
}
|
|
if !strings.Contains(out, "Hello world") {
|
|
t.Errorf("paragraph text missing: %q", out)
|
|
}
|
|
// Exactly one <w:p>.
|
|
if got := strings.Count(out, "<w:p>"); got != 1 {
|
|
t.Errorf("expected 1 <w:p>; got %d", got)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_TwoParagraphs(t *testing.T) {
|
|
out := RenderMarkdownToOOXML("first\n\nsecond", "Normal")
|
|
if got := strings.Count(out, "<w:p>"); got != 2 {
|
|
t.Errorf("expected 2 <w:p>; got %d, out=%q", got, out)
|
|
}
|
|
if !strings.Contains(out, "first") || !strings.Contains(out, "second") {
|
|
t.Errorf("paragraph text missing: %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_BoldInline(t *testing.T) {
|
|
out := RenderMarkdownToOOXML("hello **bold** world", "")
|
|
if !strings.Contains(out, `<w:rPr><w:b/></w:rPr>`) {
|
|
t.Errorf("bold rPr missing: %q", out)
|
|
}
|
|
if !strings.Contains(out, ">bold<") {
|
|
t.Errorf("bold text payload missing: %q", out)
|
|
}
|
|
// The surrounding "hello " and " world" pieces are separate runs;
|
|
// the bold rPr should appear exactly once in this output.
|
|
if got := strings.Count(out, "<w:b/>"); got != 1 {
|
|
t.Errorf("expected exactly one <w:b/> tag; got %d in %q", got, out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_ItalicInline(t *testing.T) {
|
|
out := RenderMarkdownToOOXML("see *italic* here", "")
|
|
if !strings.Contains(out, `<w:rPr><w:i/></w:rPr>`) {
|
|
t.Errorf("italic rPr missing: %q", out)
|
|
}
|
|
if !strings.Contains(out, ">italic<") {
|
|
t.Errorf("italic text payload missing: %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_BoldItalicCombo(t *testing.T) {
|
|
// Nested: ***both*** → entering both flags. The walker toggles each
|
|
// delimiter independently, so the resulting run carries both <w:b/>
|
|
// and <w:i/>.
|
|
out := RenderMarkdownToOOXML("***both***", "")
|
|
if !strings.Contains(out, `<w:b/>`) || !strings.Contains(out, `<w:i/>`) {
|
|
t.Errorf("expected both <w:b/> and <w:i/>; got %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_PlaceholdersPassThrough(t *testing.T) {
|
|
// Placeholders are sacred — the walker must preserve them verbatim
|
|
// so the v1 placeholder pass can substitute them later.
|
|
out := RenderMarkdownToOOXML("Sehr geehrter {{parties.claimant.0.name}}", "Normal")
|
|
if !strings.Contains(out, "{{parties.claimant.0.name}}") {
|
|
t.Errorf("placeholder corrupted: %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_PlaceholderUnderscoresPreserved(t *testing.T) {
|
|
// Regression: a placeholder key containing underscores (project.case_number,
|
|
// user.display_name, project.patent_number_upc) used to get its underscores
|
|
// consumed by the italic/bold inline scanner — the OOXML stored
|
|
// {{project.casenumber}} and the preview surfaced
|
|
// [KEIN WERT: project.casenumber] instead of the real value.
|
|
cases := []string{
|
|
"{{project.case_number}}",
|
|
"{{user.display_name}}",
|
|
"{{project.patent_number_upc}}",
|
|
"prefix {{project.case_number}} suffix",
|
|
"two: {{a.b_c}} and {{d.e_f}}",
|
|
"mixed: _italic_ then {{project.case_number}} then __bold__",
|
|
}
|
|
for _, in := range cases {
|
|
out := RenderMarkdownToOOXML(in, "Normal")
|
|
// Every placeholder substring in the input must appear verbatim
|
|
// in the output (XML escaping is irrelevant for {} and _).
|
|
for _, ph := range extractPlaceholders(in) {
|
|
if !strings.Contains(out, ph) {
|
|
t.Errorf("input %q: placeholder %q lost; got %q", in, ph, out)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// extractPlaceholders returns every "{{...}}" token found in s, in order of
// appearance and including the surrounding braces. Scanning stops at the
// first "{{" that has no closing "}}" after it. The result is nil when no
// complete token exists. Test helper for the placeholder regression test.
func extractPlaceholders(s string) []string {
	const (
		opener = "{{"
		closer = "}}"
	)

	var tokens []string
	remaining := s
	for {
		begin := strings.Index(remaining, opener)
		if begin == -1 {
			break
		}
		// Look for the closer only after the opener just found.
		inner := strings.Index(remaining[begin+len(opener):], closer)
		if inner == -1 {
			break
		}
		stop := begin + len(opener) + inner + len(closer)
		tokens = append(tokens, remaining[begin:stop])
		remaining = remaining[stop:]
	}
	return tokens
}
|
|
|
|
func TestRenderMarkdownToOOXML_XMLEscape(t *testing.T) {
|
|
out := RenderMarkdownToOOXML("a & b < c > d", "")
|
|
if strings.Contains(out, " & ") {
|
|
t.Errorf("unescaped & survived: %q", out)
|
|
}
|
|
if !strings.Contains(out, "&") || !strings.Contains(out, "<") || !strings.Contains(out, ">") {
|
|
t.Errorf("expected escaped entities; got %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_BlankLinesPreserveSpacing(t *testing.T) {
|
|
// Two blank lines between paragraphs → one empty paragraph in
|
|
// between, preserving the lawyer's intentional whitespace.
|
|
out := RenderMarkdownToOOXML("first\n\n\nsecond", "Normal")
|
|
if got := strings.Count(out, "<w:p>"); got != 3 {
|
|
t.Errorf("expected 3 <w:p> (first + blank + second); got %d in %q", got, out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_CRLFNormalisation(t *testing.T) {
|
|
out := RenderMarkdownToOOXML("first\r\n\r\nsecond", "")
|
|
if got := strings.Count(out, "<w:p>"); got != 2 {
|
|
t.Errorf("CRLF input should produce 2 paragraphs; got %d in %q", got, out)
|
|
}
|
|
}
|
|
|
|
// ─────────────────────────────────────────────────────────────────────
|
|
// Slice D — rich-prose constructs
|
|
// ─────────────────────────────────────────────────────────────────────
|
|
|
|
// slicedStylemap returns a fresh block-kind → paragraph-style map used as the
// stylemap fixture by the rich-prose tests. Each call builds a new map, so
// callers may modify the result freely.
func slicedStylemap() map[string]string {
	entries := [...][2]string{
		{"paragraph", "Body"},
		{"heading_1", "H1"},
		{"heading_2", "H2"},
		{"heading_3", "H3"},
		{"list_bullet", "ListBullet"},
		{"list_numbered", "ListNumber"},
		{"blockquote", "Quote"},
	}

	styles := make(map[string]string, len(entries))
	for _, kv := range entries {
		styles[kv[0]] = kv[1]
	}
	return styles
}
|
|
|
|
func TestRenderMarkdownToOOXML_Heading1(t *testing.T) {
|
|
out := RenderMarkdownToOOXMLWithStyles("# A heading", slicedStylemap(), nil)
|
|
if !strings.Contains(out, `<w:pStyle w:val="H1"/>`) {
|
|
t.Errorf("heading_1 missing H1 style: %q", out)
|
|
}
|
|
if !strings.Contains(out, "A heading") {
|
|
t.Errorf("heading text missing: %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_Heading2And3(t *testing.T) {
|
|
out := RenderMarkdownToOOXMLWithStyles("## H2 line\n### H3 line", slicedStylemap(), nil)
|
|
if !strings.Contains(out, `<w:pStyle w:val="H2"/>`) || !strings.Contains(out, "H2 line") {
|
|
t.Errorf("h2 not rendered: %q", out)
|
|
}
|
|
if !strings.Contains(out, `<w:pStyle w:val="H3"/>`) || !strings.Contains(out, "H3 line") {
|
|
t.Errorf("h3 not rendered: %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_BulletList(t *testing.T) {
|
|
out := RenderMarkdownToOOXMLWithStyles("- first\n- second\n* third", slicedStylemap(), nil)
|
|
if !strings.Contains(out, `<w:pStyle w:val="ListBullet"/>`) {
|
|
t.Errorf("bullet stylemap not applied: %q", out)
|
|
}
|
|
if strings.Count(out, "• ") != 3 {
|
|
t.Errorf("expected 3 bullet prefixes; got %d in %q", strings.Count(out, "• "), out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_NumberedList(t *testing.T) {
|
|
out := RenderMarkdownToOOXMLWithStyles("1. first\n2. second\n3. third", slicedStylemap(), nil)
|
|
if !strings.Contains(out, `<w:pStyle w:val="ListNumber"/>`) {
|
|
t.Errorf("numbered stylemap not applied: %q", out)
|
|
}
|
|
for _, want := range []string{"1. ", "2. ", "3. "} {
|
|
if !strings.Contains(out, want) {
|
|
t.Errorf("missing ordinal prefix %q in %q", want, out)
|
|
}
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_NumberedListResetsOnNonList(t *testing.T) {
|
|
// "1. A\n2. B\nplain\n1. C" → 1. A, 2. B, plain para, 1. C
|
|
out := RenderMarkdownToOOXMLWithStyles("1. A\n2. B\nplain\n1. C", slicedStylemap(), nil)
|
|
// The plain "plain" line breaks the list, so the next numbered
|
|
// item restarts at 1.
|
|
idxA := strings.Index(out, "1. ")
|
|
if idxA < 0 {
|
|
t.Fatalf("first 1. missing: %q", out)
|
|
}
|
|
idxB := strings.Index(out, "2. ")
|
|
if idxB < 0 || idxB <= idxA {
|
|
t.Fatalf("2. not after 1.: idxA=%d idxB=%d", idxA, idxB)
|
|
}
|
|
rest := out[idxB+1:]
|
|
idxC := strings.Index(rest, "1. ")
|
|
if idxC < 0 {
|
|
t.Errorf("numbered counter didn't reset on non-list block: %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_Blockquote(t *testing.T) {
|
|
out := RenderMarkdownToOOXMLWithStyles("> the quoted text", slicedStylemap(), nil)
|
|
if !strings.Contains(out, `<w:pStyle w:val="Quote"/>`) {
|
|
t.Errorf("blockquote stylemap not applied: %q", out)
|
|
}
|
|
if !strings.Contains(out, "the quoted text") {
|
|
t.Errorf("blockquote text missing: %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_Hyperlink(t *testing.T) {
|
|
allocated := map[string]string{}
|
|
alloc := func(url string) string {
|
|
rid := "rIdComposer" + url
|
|
allocated[url] = rid
|
|
return rid
|
|
}
|
|
out := RenderMarkdownToOOXMLWithStyles("See [Bundesgerichtshof](https://bgh.bund.de) for details.", slicedStylemap(), alloc)
|
|
if _, ok := allocated["https://bgh.bund.de"]; !ok {
|
|
t.Errorf("allocator never called for URL: %q", out)
|
|
}
|
|
if !strings.Contains(out, `<w:hyperlink r:id="rIdComposerhttps://bgh.bund.de">`) {
|
|
t.Errorf("hyperlink tag missing or wrong rid: %q", out)
|
|
}
|
|
if !strings.Contains(out, "Bundesgerichtshof") {
|
|
t.Errorf("link label missing: %q", out)
|
|
}
|
|
if !strings.Contains(out, `<w:rStyle w:val="Hyperlink"/>`) {
|
|
t.Errorf("hyperlink character style missing: %q", out)
|
|
}
|
|
}
|
|
|
|
func TestRenderMarkdownToOOXML_HyperlinkNilAllocatorFallsBackToPlain(t *testing.T) {
|
|
out := RenderMarkdownToOOXMLWithStyles("See [BGH](https://bgh.bund.de) here.", slicedStylemap(), nil)
|
|
// Without an allocator, the label still renders as plain text.
|
|
if !strings.Contains(out, "BGH") {
|
|
t.Errorf("label dropped: %q", out)
|
|
}
|
|
if strings.Contains(out, "<w:hyperlink") {
|
|
t.Errorf("hyperlink emitted without allocator: %q", out)
|
|
}
|
|
}
|