From 1a73ef9668e5b193601353f424382073bcb890e9 Mon Sep 17 00:00:00 2001 From: Abdelrahman Emad Date: Tue, 13 Jan 2026 12:25:57 +0200 Subject: [PATCH 1/3] fix(table): handle HTML table captions correctly Signed-off-by: Abdelrahman Emad --- packages/markdown-html/src/rules.js | 130 ++++++++++++++++++++++++++-- 1 file changed, 122 insertions(+), 8 deletions(-) diff --git a/packages/markdown-html/src/rules.js b/packages/markdown-html/src/rules.js index 2d72f9bd..06bcbb2b 100644 --- a/packages/markdown-html/src/rules.js +++ b/packages/markdown-html/src/rules.js @@ -498,42 +498,155 @@ const HTML_BLOCK_RULE = { } }; +const CAPTION_RULE = { + deserialize(el, next, ignoreSpace) { + if (el.tagName && el.tagName.toLowerCase() === 'caption') { + return { + type: 'caption', + nodes: next(el.childNodes, ignoreSpace) + }; + } + } +}; + +/** + * Clean table cell nodes by removing Softbreaks and normalizing whitespace. + * @param {Array} nodes - the list of nodes + * @returns {Array} - the cleaned list of nodes + */ +function cleanTableNodes(nodes) { + const NS = CommonMarkModel.NAMESPACE; + const TEXT = `${NS}.Text`; + const SOFT = `${NS}.Softbreak`; + + if (!nodes) return []; + nodes = Array.isArray(nodes) ? nodes : [nodes]; + + const merged = nodes.reduce((acc, node) => { + if (!node) return acc; + + let newNode = { ...node }; + if (newNode.nodes) newNode = { ...newNode, nodes: cleanTableNodes(newNode.nodes) }; + + if (newNode.$class === SOFT) { + newNode = { $class: TEXT, text: ' ' }; + } + + const last = acc[acc.length - 1]; + if (last && last.$class === TEXT && newNode.$class === TEXT) + { + last.text += newNode.text; + } + else + { + acc.push(newNode); + } + + return acc; + }, []); + + // Normalize whitespace inside Text nodes + merged.forEach(n => { + if (n.$class === TEXT) n.text = n.text.replace(/\s+/g, ' '); + }); + + + if (merged.length > 0 && merged[0].$class === TEXT) { + merged[0].text = merged[0].text.replace(/^\s+/, ''); + } + if (merged.length > 0 && merged[merged.length - 1].$class === TEXT) { + merged[merged.length - 1].text = merged[merged.length - 1].text.replace(/\s+$/, ''); + } + + return merged.filter(n => n.$class !== TEXT || n.text.length > 0); +} + + const TABLE_RULE = { deserialize(el, next, ignoreSpace) { if (el.tagName && el.tagName.toLowerCase() === 'table') { - return { + const children = next(el.childNodes, ignoreSpace); + const captionNode = children.find(c => c.type === 'caption'); + let tableNodes = children.filter(node => + node.$class === `${CommonMarkModel.NAMESPACE}.TableHead` || + node.$class === `${CommonMarkModel.NAMESPACE}.TableBody` + ); + + let head = tableNodes.find(n => n.$class === `${CommonMarkModel.NAMESPACE}.TableHead`); + const body = tableNodes.find(n => n.$class === `${CommonMarkModel.NAMESPACE}.TableBody`); + + if (!head && body && body.nodes && body.nodes.length > 0) { + const firstRow = body.nodes[0]; + const hasHeaderCells = firstRow.nodes && firstRow.nodes.some(n => n.$class === `${CommonMarkModel.NAMESPACE}.HeaderCell`); + + if (hasHeaderCells) { + head = { + $class: `${CommonMarkModel.NAMESPACE}.TableHead`, + nodes: [firstRow] + }; + const newBody = { + $class: `${CommonMarkModel.NAMESPACE}.TableBody`, + nodes: body.nodes.slice(1) + }; + tableNodes = [head, newBody]; + } + } + + const table = { $class: `${CommonMarkModel.NAMESPACE}.Table`, - nodes: next(el.childNodes), + nodes: tableNodes, }; + + if (captionNode) { + const captionParagraph = { + $class: `${CommonMarkModel.NAMESPACE}.Paragraph`, + nodes: [ + { + $class: `${CommonMarkModel.NAMESPACE}.Strong`, + nodes: cleanTableNodes(captionNode.nodes) + }, + { + $class: `${CommonMarkModel.NAMESPACE}.Text`, + text: '\n\n' + } + ] + }; + return [captionParagraph, table]; + } + + return table; } if (el.tagName && el.tagName.toLowerCase() === 'thead') { + const nodes = next(el.childNodes); return { $class: `${CommonMarkModel.NAMESPACE}.TableHead`, - nodes: next(el.childNodes), + nodes: nodes.filter(n => n.$class === `${CommonMarkModel.NAMESPACE}.TableRow`), }; } if (el.tagName && el.tagName.toLowerCase() === 'tbody') { + const nodes = next(el.childNodes); return { $class: `${CommonMarkModel.NAMESPACE}.TableBody`, - nodes: next(el.childNodes), + nodes: nodes.filter(n => n.$class === `${CommonMarkModel.NAMESPACE}.TableRow`), }; } if (el.tagName && el.tagName.toLowerCase() === 'tr') { + const nodes = next(el.childNodes); return { $class: `${CommonMarkModel.NAMESPACE}.TableRow`, - nodes: next(el.childNodes), + nodes: nodes.filter(n => n.$class === `${CommonMarkModel.NAMESPACE}.HeaderCell` || n.$class === `${CommonMarkModel.NAMESPACE}.TableCell`), }; } if (el.tagName && el.tagName.toLowerCase() === 'th') { return { $class: `${CommonMarkModel.NAMESPACE}.HeaderCell`, - nodes: next(el.childNodes), + nodes: cleanTableNodes(next(el.childNodes)), }; } if (el.tagName && el.tagName.toLowerCase() === 'td') { return { $class: `${CommonMarkModel.NAMESPACE}.TableCell`, - nodes: next(el.childNodes), + nodes: cleanTableNodes(next(el.childNodes)), }; } }, @@ -560,7 +673,8 @@ const rules = [ HTML_INLINE_RULE, HTML_BLOCK_RULE, IMAGE_RULE, - TABLE_RULE + TABLE_RULE, + CAPTION_RULE ]; From 37d34945c53cbdeb857d90fc3b6bfa24cec37b10 Mon Sep 17 00:00:00 2001 From: Abdelrahman Emad Date: Tue, 2 Jun 2026 03:03:25 +0300 Subject: [PATCH 2/3] fix(html): add table caption regression tests Signed-off-by: Abdelrahman Emad --- .../markdown-html/src/HtmlTransformer.test.js | 100 ++++++++++++++++++ packages/markdown-html/src/rules.js | 76 ++++++------- 2 files changed, 140 insertions(+), 36 deletions(-) diff --git a/packages/markdown-html/src/HtmlTransformer.test.js b/packages/markdown-html/src/HtmlTransformer.test.js index e9898a35..775d6936 100644 --- a/packages/markdown-html/src/HtmlTransformer.test.js +++ b/packages/markdown-html/src/HtmlTransformer.test.js @@ -18,6 +18,7 @@ const fs = require('fs'); const CiceroMarkTransformer = require('@accordproject/markdown-cicero').CiceroMarkTransformer; +const { CommonMarkModel } = require('@accordproject/markdown-common'); const HtmlTransformer = require('./HtmlTransformer'); let htmlTransformer = null; @@ -78,6 +79,105 @@ describe('markdown <-> html', () => { }); }); +describe('html table deserialization', () => { + const commonmarkNamespace = CommonMarkModel.NAMESPACE; + + it('deserializes a table caption before the table', () => { + const ciceroMarkDom = htmlTransformer.toCiceroMark(` + + + + + + + + +
Employee Details
NameRole
AdaEngineer
+ `); + + expect(ciceroMarkDom.nodes).toHaveLength(2); + expect(ciceroMarkDom.nodes[0]).toEqual({ + $class: `${commonmarkNamespace}.Paragraph`, + nodes: [ + { + $class: `${commonmarkNamespace}.Strong`, + nodes: [ + { + $class: `${commonmarkNamespace}.Text`, + text: 'Employee Details' + } + ] + }, + { + $class: `${commonmarkNamespace}.Text`, + text: '\n\n' + } + ] + }); + expect(ciceroMarkDom.nodes[1].$class).toBe(`${commonmarkNamespace}.Table`); + }); + + it('normalizes whitespace in table cells', () => { + const ciceroMarkDom = htmlTransformer.toCiceroMark(` + + + + + + +
+ First + Second +
+ `); + + const cellNodes = ciceroMarkDom.nodes[0].nodes[0].nodes[0].nodes[0].nodes; + expect(cellNodes).toEqual([ + { + $class: `${commonmarkNamespace}.Text`, + text: 'First Second' + } + ]); + }); + + it('promotes tbody rows with header cells to table head when thead is missing', () => { + const ciceroMarkDom = htmlTransformer.toCiceroMark(` + + + + + +
NameRole
AdaEngineer
+ `); + + const table = ciceroMarkDom.nodes[0]; + expect(table.nodes).toHaveLength(2); + expect(table.nodes[0].$class).toBe(`${commonmarkNamespace}.TableHead`); + expect(table.nodes[0].nodes[0].nodes.map(cell => cell.$class)).toEqual([ + `${commonmarkNamespace}.HeaderCell`, + `${commonmarkNamespace}.HeaderCell` + ]); + expect(table.nodes[1].$class).toBe(`${commonmarkNamespace}.TableBody`); + expect(table.nodes[1].nodes).toHaveLength(1); + }); + + it('keeps tables without captions as a single table node', () => { + const ciceroMarkDom = htmlTransformer.toCiceroMark(` + + + + + + + +
NameRole
AdaEngineer
+ `); + + expect(ciceroMarkDom.nodes).toHaveLength(1); + expect(ciceroMarkDom.nodes[0].$class).toBe(`${commonmarkNamespace}.Table`); + }); +}); + /** * Get the name and contents of all ciceromark test files * @returns {*} an array of name/contents tuples diff --git a/packages/markdown-html/src/rules.js b/packages/markdown-html/src/rules.js index 06bcbb2b..349d9195 100644 --- a/packages/markdown-html/src/rules.js +++ b/packages/markdown-html/src/rules.js @@ -515,50 +515,54 @@ const CAPTION_RULE = { * @returns {Array} - the cleaned list of nodes */ function cleanTableNodes(nodes) { - const NS = CommonMarkModel.NAMESPACE; - const TEXT = `${NS}.Text`; - const SOFT = `${NS}.Softbreak`; + const NS = CommonMarkModel.NAMESPACE; + const TEXT = `${NS}.Text`; + const SOFT = `${NS}.Softbreak`; - if (!nodes) return []; - nodes = Array.isArray(nodes) ? nodes : [nodes]; - - const merged = nodes.reduce((acc, node) => { - if (!node) return acc; + if (!nodes) { + return []; + } + nodes = Array.isArray(nodes) ? nodes : [nodes]; - let newNode = { ...node }; - if (newNode.nodes) newNode = { ...newNode, nodes: cleanTableNodes(newNode.nodes) }; + const merged = nodes.reduce((acc, node) => { + if (!node) { + return acc; + } - if (newNode.$class === SOFT) { - newNode = { $class: TEXT, text: ' ' }; - } + let newNode = { ...node }; + if (newNode.nodes) { + newNode = { ...newNode, nodes: cleanTableNodes(newNode.nodes) }; + } - const last = acc[acc.length - 1]; - if (last && last.$class === TEXT && newNode.$class === TEXT) - { - last.text += newNode.text; - } - else - { - acc.push(newNode); - } + if (newNode.$class === SOFT) { + newNode = { $class: TEXT, text: ' ' }; + } - return acc; - }, []); + const last = acc[acc.length - 1]; + if (last && last.$class === TEXT && newNode.$class === TEXT) { + last.text += newNode.text; + } else { + acc.push(newNode); + } - // Normalize whitespace inside Text nodes - merged.forEach(n => { - if (n.$class === TEXT) n.text = n.text.replace(/\s+/g, ' '); - }); + return acc; + }, []); + // Normalize whitespace inside Text nodes + merged.forEach(n => { + if (n.$class === TEXT) { + n.text = n.text.replace(/\s+/g, ' '); + } + }); - if (merged.length > 0 && merged[0].$class === TEXT) { - merged[0].text = merged[0].text.replace(/^\s+/, ''); - } - if (merged.length > 0 && merged[merged.length - 1].$class === TEXT) { - merged[merged.length - 1].text = merged[merged.length - 1].text.replace(/\s+$/, ''); - } + if (merged.length > 0 && merged[0].$class === TEXT) { + merged[0].text = merged[0].text.replace(/^\s+/, ''); + } + if (merged.length > 0 && merged[merged.length - 1].$class === TEXT) { + merged[merged.length - 1].text = merged[merged.length - 1].text.replace(/\s+$/, ''); + } - return merged.filter(n => n.$class !== TEXT || n.text.length > 0); + return merged.filter(n => n.$class !== TEXT || n.text.length > 0); } @@ -678,4 +682,4 @@ const rules = [ ]; -module.exports = rules; \ No newline at end of file +module.exports = rules; From 1130eeebc3f16f2ffd608c51bf7fa54d2f35d366 Mon Sep 17 00:00:00 2001 From: Matt Roberts Date: Tue, 2 Jun 2026 09:13:17 +0100 Subject: [PATCH 3/3] fix(html): keep table caption CommonMark valid Address Copilot review feedback on PR #640: - Flatten content to inline-only nodes before wrapping in Strong, so flow content (e.g. nested

, lists) can no longer become invalid block children of a Strong node. - Handle inside the table branch instead of a standalone rule, so a caption appearing outside a table no longer leaks a node without a $class into the output. - Emit the bolded caption as its own Paragraph block and drop the inline '\n\n' Text hack, leaving block-level spacing to the Markdown serializer. Co-Authored-By: Claude Opus 4.7 Signed-off-by: Matt Roberts --- .../markdown-html/src/HtmlTransformer.test.js | 42 +++++++++- packages/markdown-html/src/rules.js | 83 +++++++++++++------ 2 files changed, 94 insertions(+), 31 deletions(-) diff --git a/packages/markdown-html/src/HtmlTransformer.test.js b/packages/markdown-html/src/HtmlTransformer.test.js index 775d6936..70298585 100644 --- a/packages/markdown-html/src/HtmlTransformer.test.js +++ b/packages/markdown-html/src/HtmlTransformer.test.js @@ -107,16 +107,50 @@ describe('html table deserialization', () => { text: 'Employee Details' } ] - }, - { - $class: `${commonmarkNamespace}.Text`, - text: '\n\n' } ] }); expect(ciceroMarkDom.nodes[1].$class).toBe(`${commonmarkNamespace}.Table`); }); + it('flattens block content in a caption to inline-only Strong children', () => { + const ciceroMarkDom = htmlTransformer.toCiceroMark(` + + + + + + + + +

See also

Name
Ada
+ `); + + expect(ciceroMarkDom.nodes).toHaveLength(2); + const strong = ciceroMarkDom.nodes[0].nodes[0]; + expect(strong.$class).toBe(`${commonmarkNamespace}.Strong`); + // every child of Strong must be inline (no Paragraph leaked in) + strong.nodes.forEach(child => { + expect(child.$class).not.toBe(`${commonmarkNamespace}.Paragraph`); + }); + expect(strong.nodes).toEqual([ + { $class: `${commonmarkNamespace}.Text`, text: 'See ' }, + { + $class: `${commonmarkNamespace}.Emph`, + nodes: [{ $class: `${commonmarkNamespace}.Text`, text: 'also' }] + } + ]); + expect(ciceroMarkDom.nodes[1].$class).toBe(`${commonmarkNamespace}.Table`); + }); + + it('does not leak a caption node when it appears outside a table', () => { + const ciceroMarkDom = htmlTransformer.toCiceroMark('Orphan caption'); + + // the old caption rule produced a node with no $class; ensure none leak + ciceroMarkDom.nodes.forEach(n => expect(typeof n.$class).toBe('string')); + expect(ciceroMarkDom.nodes.some(n => n.type === 'caption')).toBe(false); + }); + it('normalizes whitespace in table cells', () => { const ciceroMarkDom = htmlTransformer.toCiceroMark(` diff --git a/packages/markdown-html/src/rules.js b/packages/markdown-html/src/rules.js index 349d9195..08547977 100644 --- a/packages/markdown-html/src/rules.js +++ b/packages/markdown-html/src/rules.js @@ -498,16 +498,39 @@ const HTML_BLOCK_RULE = { } }; -const CAPTION_RULE = { - deserialize(el, next, ignoreSpace) { - if (el.tagName && el.tagName.toLowerCase() === 'caption') { - return { - type: 'caption', - nodes: next(el.childNodes, ignoreSpace) - }; - } +// CommonMark inline node classes. A
may contain flow content +// (paragraphs, lists, etc.), but a Strong node may only contain inline +// children, so caption content is flattened to these classes before it is +// wrapped in Strong. +const INLINE_CLASSES = [ + 'Text', 'Emph', 'Strong', 'Code', 'Link', 'Image', + 'Softbreak', 'Linebreak', 'HtmlInline' +].map(name => `${CommonMarkModel.NAMESPACE}.${name}`); + +/** + * Flatten caption content to inline-only nodes so it can be safely wrapped in + * a Strong node. Block-level wrappers (e.g. Paragraph) are unwrapped to their + * inline children; nodes with no inline content are dropped. + * @param {Array} nodes - the list of nodes + * @returns {Array} - the inline-only list of nodes + */ +function toInlineNodes(nodes) { + if (!nodes) { + return []; } -}; + const list = Array.isArray(nodes) ? nodes : [nodes]; + return list.reduce((acc, node) => { + if (!node) { + return acc; + } + if (INLINE_CLASSES.includes(node.$class)) { + acc.push(node); + } else if (node.nodes) { + acc.push(...toInlineNodes(node.nodes)); + } + return acc; + }, []); +} /** * Clean table cell nodes by removing Softbreaks and normalizing whitespace. @@ -570,7 +593,6 @@ const TABLE_RULE = { deserialize(el, next, ignoreSpace) { if (el.tagName && el.tagName.toLowerCase() === 'table') { const children = next(el.childNodes, ignoreSpace); - const captionNode = children.find(c => c.type === 'caption'); let tableNodes = children.filter(node => node.$class === `${CommonMarkModel.NAMESPACE}.TableHead` || node.$class === `${CommonMarkModel.NAMESPACE}.TableBody` @@ -601,21 +623,29 @@ const TABLE_RULE = { nodes: tableNodes, }; - if (captionNode) { - const captionParagraph = { - $class: `${CommonMarkModel.NAMESPACE}.Paragraph`, - nodes: [ - { - $class: `${CommonMarkModel.NAMESPACE}.Strong`, - nodes: cleanTableNodes(captionNode.nodes) - }, - { - $class: `${CommonMarkModel.NAMESPACE}.Text`, - text: '\n\n' - } - ] - }; - return [captionParagraph, table]; + // A is handled here (rather than via its own rule) so the + // caption node never leaks into the output when it appears outside + // of a table. Its content is flattened to inline-only nodes so the + // Strong wrapper stays valid CommonMark, and the bolded caption is + // emitted as its own Paragraph block before the table - block-level + // spacing is left to the Markdown serializer. + const captionElement = Array.from(el.childNodes).find( + child => child.tagName && child.tagName.toLowerCase() === 'caption' + ); + if (captionElement) { + const captionNodes = cleanTableNodes(toInlineNodes(next(captionElement.childNodes, ignoreSpace))); + if (captionNodes.length > 0) { + const captionParagraph = { + $class: `${CommonMarkModel.NAMESPACE}.Paragraph`, + nodes: [ + { + $class: `${CommonMarkModel.NAMESPACE}.Strong`, + nodes: captionNodes + } + ] + }; + return [captionParagraph, table]; + } } return table; @@ -677,8 +707,7 @@ const rules = [ HTML_INLINE_RULE, HTML_BLOCK_RULE, IMAGE_RULE, - TABLE_RULE, - CAPTION_RULE + TABLE_RULE ];