feat: fix MarkdownManager to handle mixed Markdown and HTML parsing (#7154)

bdbch · Copilot · web-flow · commit 194af3b6cca3 · 2025-10-31T17:09:31.000+01:00
* feat: enhance MarkdownManager to handle mixed Markdown and HTML parsing

* fix: correct parsing of mixed inline HTML within Markdown content

* Update demos/vite.config.ts

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update packages/markdown/src/MarkdownManager.ts

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update packages/markdown/src/MarkdownManager.ts

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* Update tests/cypress/integration/markdown/mixed-html.spec.ts

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>

* fix: escape regex special characters in HTML tag matching

Sanitize tagName before using it in RegExp constructor to prevent
regex compilation errors or unintended matching behavior when tag
names contain special characters.

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
diff --git a/.changeset/inline-html-fix.md b/.changeset/inline-html-fix.md
@@ -0,0 +1,5 @@
+---
+"@tiptap/markdown": patch
+---
+
+Fix parsing of mixed inline HTML within Markdown content so that inline HTML fragments are parsed correctly.
diff --git a/demos/vite.config.ts b/demos/vite.config.ts
@@ -317,6 +317,7 @@ export default defineConfig({
         return () => {
           viteDevServer.middlewares.use(async (req, res, next) => {
             if (req?.originalUrl?.startsWith('/preview')) {
+              // @ts-expect-error - req.url is not typed but exists at runtime in Vite middleware
               req.url = '/preview/index.html'
             }
 
diff --git a/packages/markdown/src/MarkdownManager.ts b/packages/markdown/src/MarkdownManager.ts
@@ -375,21 +375,95 @@ export class MarkdownManager {
     }
   }
 
+  /**
+   * Escape special regex characters in a string.
+   *
+   * Each of the characters `. * + ? ^ $ { } ( ) | [ ] \` found in the input is
+   * prefixed with a backslash, so the result can be interpolated into a
+   * `RegExp` source and match the original text literally.
+   *
+   * @param str The raw string to escape.
+   * @returns A copy of `str` with every listed character backslash-escaped.
+   */
+  private escapeRegex(str: string): string {
+    return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
+  }
+
   /**
    * Parse inline tokens (bold, italic, links, etc.) into text nodes with marks.
    * This is the complex part that handles mark nesting and boundaries.
    */
   private parseInlineTokens(tokens: MarkdownToken[]): JSONContent[] {
     const result: JSONContent[] = []
 
-    // Process tokens sequentially
-    tokens.forEach(token => {
+    // Process tokens sequentially using an index so we can lookahead and
+    // merge split inline HTML fragments like: text / <em> / inner / </em> / text
+    for (let i = 0; i < tokens.length; i += 1) {
+      const token = tokens[i]
+
       if (token.type === 'text') {
         // Create text node
         result.push({
           type: 'text',
           text: token.text || '',
         })
+      } else if (token.type === 'html') {
+        // Handle possible split inline HTML by attempting to detect an
+        // opening tag and searching forward for a matching closing tag.
+        const raw = (token.raw ?? token.text ?? '').toString()
+
+        // Quick checks for opening vs. closing tag
+        const isClosing = /^<\/[\s]*[\w-]+/i.test(raw)
+        const openMatch = raw.match(/^<[\s]*([\w-]+)(\s|>|\/|$)/i)
+
+        if (!isClosing && openMatch && !/\/>$/.test(raw)) {
+          // Try to find the corresponding closing html token for this tag
+          const tagName = openMatch[1]
+          const escapedTagName = this.escapeRegex(tagName)
+          const closingRegex = new RegExp(`^<\\/\\s*${escapedTagName}\\b`, 'i')
+          let foundIndex = -1
+
+          // Collect intermediate raw parts to reconstruct full HTML fragment
+          const parts: string[] = [raw]
+          for (let j = i + 1; j < tokens.length; j += 1) {
+            const t = tokens[j]
+            const tRaw = (t.raw ?? t.text ?? '').toString()
+            parts.push(tRaw)
+            if (t.type === 'html' && closingRegex.test(tRaw)) {
+              foundIndex = j
+              break
+            }
+          }
+
+          if (foundIndex !== -1) {
+            // Merge opening + inner + closing into one html fragment and parse
+            const mergedRaw = parts.join('')
+            const mergedToken = {
+              type: 'html',
+              raw: mergedRaw,
+              text: mergedRaw,
+              block: false,
+            } as unknown as MarkdownToken
+
+            const parsed = this.parseHTMLToken(mergedToken)
+            if (parsed) {
+              const normalized = this.normalizeParseResult(parsed as any)
+              if (Array.isArray(normalized)) {
+                result.push(...normalized)
+              } else if (normalized) {
+                result.push(normalized)
+              }
+            }
+
+            // Advance i to the closing token
+            i = foundIndex
+            continue
+          }
+        }
+
+        // Fallback: single html token parse
+        const parsedSingle = this.parseHTMLToken(token)
+        if (parsedSingle) {
+          const normalized = this.normalizeParseResult(parsedSingle as any)
+          if (Array.isArray(normalized)) {
+            result.push(...normalized)
+          } else if (normalized) {
+            result.push(normalized)
+          }
+        }
       } else if (token.type) {
         // Handle inline marks (bold, italic, etc.)
         const markHandler = this.getHandlerForToken(token.type)
@@ -415,7 +489,7 @@ export class MarkdownManager {
           result.push(...this.parseInlineTokens(token.tokens))
         }
       }
-    })
+    }
 
     return result
   }
diff --git a/tests/cypress/integration/markdown/mixed-html.spec.ts b/tests/cypress/integration/markdown/mixed-html.spec.ts
@@ -0,0 +1,72 @@
+import { Document } from '@tiptap/extension-document'
+import { Heading } from '@tiptap/extension-heading'
+import { Italic } from '@tiptap/extension-italic'
+import { Paragraph } from '@tiptap/extension-paragraph'
+import { Text } from '@tiptap/extension-text'
+import { MarkdownManager } from '@tiptap/markdown'
+
+describe('MarkdownManager Mixed Markdown + HTML', () => {
+  let manager: MarkdownManager
+  const basicExtensions = [Document, Paragraph, Text, Heading, Italic]
+
+  beforeEach(() => {
+    manager = new MarkdownManager({ extensions: basicExtensions })
+  })
+
+  it('parses heading with inline HTML <em> as italic', () => {
+    const md = '## hello <em>world</em>'
+    const doc = manager.parse(md)
+
+    expect(doc.type).to.equal('doc')
+    expect(doc.content).to.be.an('array')
+    const heading = doc.content[0]
+    expect(heading.type).to.equal('heading')
+    // Flatten one level (text nodes as-is, other nodes via their content) and find the text node containing 'world'
+    const textNodes = heading.content.flatMap((n: any) => (n.type === 'text' ? [n] : n.content || []))
+    const worldNode = textNodes.find((n: any) => n.text && n.text.includes('world'))
+    // Use a function-call assertion to avoid the "no-unused-expressions" lint error
+    expect(worldNode).to.not.equal(undefined)
+    // The 'world' text node must carry a marks array that includes an italic mark
+    expect(worldNode!.marks).to.be.an('array')
+    const hasItalic = worldNode!.marks.some((m: any) => m.type === 'italic')
+    expect(hasItalic).to.equal(true)
+  })
+
+  it('parses standalone inline HTML <em>world</em> as italic', () => {
+    const md = '<em>world</em>'
+    const doc = manager.parse(md)
+
+    expect(doc.type).to.equal('doc')
+    // The bare inline HTML is expected to end up inside a paragraph wrapper
+    const paragraph = doc.content[0]
+    expect(paragraph.type).to.equal('paragraph')
+    const textNode = paragraph.content[0]
+    expect(textNode.text).to.equal('world')
+    expect(textNode.marks).to.be.an('array')
+    const hasItalic = (textNode.marks || []).some((m: any) => m.type === 'italic')
+    expect(hasItalic).to.equal(true)
+  })
+
+  it('parses markdown italic next to HTML italic correctly', () => {
+    const md = '*a* <em>b</em> *c*'
+    const doc = manager.parse(md)
+
+    expect(doc.type).to.equal('doc')
+    const para = doc.content[0]
+    expect(para.type).to.equal('paragraph')
+
+    // Reduce each inline node to its text and marks (marks default to an empty array)
+    const runs = para.content.map((n: any) => ({ text: n.text, marks: n.marks || [] }))
+    // Expect runs for a, b and c; texts are trimmed so surrounding whitespace in a run does not matter
+    const texts = runs.map(r => (r.text || '').trim())
+    expect(texts).to.include('a')
+    expect(texts).to.include('b')
+    expect(texts).to.include('c')
+    ;['a', 'b', 'c'].forEach(letter => {
+      const node = runs.find(r => (r.text || '').trim() === letter)
+      expect(node).to.not.equal(undefined)
+      const hasItalic = ((node as any).marks || []).some((m: any) => m.type === 'italic')
+      expect(hasItalic).to.equal(true)
+    })
+  })
+})

-Original file line number
+Diff line change
@@ @@ -0,0 +1,5 @@ @@
 +---
 +"@tiptap/markdown": patch
 +---
 +
 +Fix parsing of mixed inline HTML within Markdown content so that inline HTML fragments are parsed correctly.
Original file line number	Diff line number	Diff line change
`@@ -317,6 +317,7 @@ export default defineConfig({`
`317`	`317`	`return () => {`
`318`	`318`	`viteDevServer.middlewares.use(async (req, res, next) => {`
`319`	`319`	`if (req?.originalUrl?.startsWith('/preview')) {`
	`320`	`+ // @ts-expect-error - req.url is not typed but exists at runtime in Vite middleware`
`320`	`321`	`req.url = '/preview/index.html'`
`321`	`322`	`}`
`322`	`323`