diff --git a/server/utils/readme.ts b/server/utils/readme.ts index e9ae011dc..297dc5bcd 100644 --- a/server/utils/readme.ts +++ b/server/utils/readme.ts @@ -266,6 +266,30 @@ const reservedPathsNpmJs = [ const npmJsHosts = new Set(['www.npmjs.com', 'npmjs.com', 'www.npmjs.org', 'npmjs.org']) +const USER_CONTENT_PREFIX = 'user-content-' + +function withUserContentPrefix(value: string): string { + return value.startsWith(USER_CONTENT_PREFIX) ? value : `${USER_CONTENT_PREFIX}${value}` +} + +function toUserContentId(value: string): string { + return `${USER_CONTENT_PREFIX}${value}` +} + +function toUserContentHash(value: string): string { + return `#${withUserContentPrefix(value)}` +} + +function normalizePreservedAnchorAttrs(attrs: string): string { + const cleanedAttrs = attrs + .replace(/\s+href\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, '') + .replace(/\s+rel\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, '') + .replace(/\s+target\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, '') + .trim() + + return cleanedAttrs ? ` ${cleanedAttrs}` : '' +} + const isNpmJsUrlThatCanBeRedirected = (url: URL) => { if (!npmJsHosts.has(url.host)) { return false @@ -291,8 +315,11 @@ function resolveUrl(url: string, packageName: string, repoInfo?: RepositoryInfo) if (!url) return url if (url.startsWith('#')) { // Prefix anchor links to match heading IDs (avoids collision with page IDs) - return `#user-content-${url.slice(1)}` + // Idempotent: don't double-prefix if already prefixed + return toUserContentHash(url.slice(1)) } + // Absolute paths (e.g. /package/foo from a previous npmjs redirect) are already resolved + if (url.startsWith('/')) return url if (hasProtocol(url, { acceptRelative: true })) { try { const parsed = new URL(url, 'https://example.com') @@ -381,8 +408,8 @@ function resolveImageUrl(url: string, packageName: string, repoInfo?: Repository // Helper to prefix id attributes with 'user-content-' function prefixId(tagName: string, attribs: sanitizeHtml.Attributes) { - if (attribs.id && !attribs.id.startsWith('user-content-')) { - attribs.id = `user-content-${attribs.id}` + if (attribs.id) { + attribs.id = withUserContentPrefix(attribs.id) } return { tagName, attribs } } @@ -421,35 +448,53 @@ export async function renderReadmeHtml( // So README starts at h3, and we ensure no levels are skipped // Visual styling preserved via data-level attribute (original depth) let lastSemanticLevel = 2 // Start after h2 (the "Readme" section heading) - renderer.heading = function ({ tokens, depth }: Tokens.Heading) { - // Calculate the target semantic level based on document structure - // Start at h3 (since page h1 + section h2 already exist) - // But ensure we never skip levels - can only go down by 1 or stay same/go up + + // Shared heading processing for both markdown and HTML headings + function processHeading(depth: number, plainText: string, preservedAttrs = '') { const semanticLevel = calculateSemanticDepth(depth, lastSemanticLevel) lastSemanticLevel = semanticLevel - const text = this.parser.parseInline(tokens) - // Generate GitHub-style slug for anchor links - let slug = slugify(text) - if (!slug) slug = 'heading' // Fallback for empty headings + let slug = slugify(plainText) + if (!slug) slug = 'heading' - // Handle duplicate slugs (GitHub-style: foo, foo-1, foo-2) const count = usedSlugs.get(slug) ?? 0 usedSlugs.set(slug, count + 1) const uniqueSlug = count === 0 ? slug : `${slug}-${count}` + const id = toUserContentId(uniqueSlug) - // Prefix with 'user-content-' to avoid collisions with page IDs - // (e.g., #install, #dependencies, #versions are used by the package page) - const id = `user-content-${uniqueSlug}` - - // Collect TOC item with plain text (HTML stripped, entities decoded) - const plainText = decodeHtmlEntities(stripHtmlTags(text).trim()) if (plainText) { toc.push({ text: plainText, id, depth }) } - /** The link href uses the unique slug WITHOUT the 'user-content-' prefix, because that will later be added for all links. */ - return `${plainText}\n` + return `${plainText}\n` + } + + renderer.heading = function ({ tokens, depth }: Tokens.Heading) { + const text = this.parser.parseInline(tokens) + const plainText = decodeHtmlEntities(stripHtmlTags(text).trim()) + return processHeading(depth, plainText) + } + + // Intercept HTML headings so they get id, TOC entry, and correct semantic level. + // Also intercept raw HTML tags so playground links are collected in the same pass. + const htmlHeadingRe = /]*)?>([\s\S]*?)<\/h\1>/gi + const htmlAnchorRe = /]*?)href=(["'])([^"']*)\2([^>]*)>([\s\S]*?)<\/a>/gi + renderer.html = function ({ text }: Tokens.HTML) { + let result = text.replace(htmlHeadingRe, (_, level, attrs = '', inner) => { + const depth = parseInt(level) + const plainText = decodeHtmlEntities(stripHtmlTags(inner).trim()) + const align = /\balign=(["'])(.*?)\1/i.exec(attrs)?.[2] + const preservedAttrs = align ? ` align="${align}"` : '' + return processHeading(depth, plainText, preservedAttrs).trimEnd() + }) + // Process raw HTML tags for playground link collection and URL resolution + result = result.replace(htmlAnchorRe, (_full, beforeHref, _quote, href, afterHref, inner) => { + const label = decodeHtmlEntities(stripHtmlTags(inner).trim()) + const { resolvedHref, extraAttrs } = processLink(href, label) + const preservedAttrs = normalizePreservedAnchorAttrs(`${beforeHref ?? ''}${afterHref ?? ''}`) + return `${inner}` + }) + return result } // Syntax highlighting for code blocks (uses shared highlighter) @@ -473,7 +518,35 @@ ${html} return `` } + // Helper: resolve a link href, collect playground links, and build attributes. + // Used by both the markdown renderer.link and the HTML interceptor so that + // all link processing happens in a single pass during marked rendering. + function processLink(href: string, label: string): { resolvedHref: string; extraAttrs: string } { + const resolvedHref = resolveUrl(href, packageName, repoInfo) + + // Collect playground links + const provider = matchPlaygroundProvider(resolvedHref) + if (provider && !seenUrls.has(resolvedHref)) { + seenUrls.add(resolvedHref) + collectedLinks.push({ + url: resolvedHref, + provider: provider.id, + providerName: provider.name, + label: decodeHtmlEntities(label || provider.name), + }) + } + + // Security attributes for external links + let extraAttrs = '' + if (resolvedHref && hasProtocol(resolvedHref, { acceptRelative: true })) { + extraAttrs = ' rel="nofollow noreferrer noopener" target="_blank"' + } + + return { resolvedHref, extraAttrs } + } + // Resolve link URLs, add security attributes, and collect playground links + // — all in a single pass during marked rendering (no deferred processing) renderer.link = function ({ href, title, tokens }: Tokens.Link) { const text = this.parser.parseInline(tokens) const titleAttr = title ? ` title="${title}"` : '' @@ -484,10 +557,9 @@ ${html} plainText = tokens[0].text } - const intermediateTitleAttr = - plainText || title ? ` data-title-intermediate="${plainText || title}"` : '' + const { resolvedHref, extraAttrs } = processLink(href, plainText || title || '') - return `${text}` + return `${text}` } // GitHub-style callouts: > [!NOTE], > [!TIP], etc. @@ -515,26 +587,32 @@ ${html} allowedSchemes: ['http', 'https', 'mailto'], // Transform img src URLs (GitHub blob → raw, relative → GitHub raw) transformTags: { + // Headings are already processed to correct semantic levels by processHeading() + // during the marked rendering pass. The sanitizer just needs to preserve them. + // For any stray headings that didn't go through processHeading (shouldn't happen), + // we still apply a safe fallback shift. h1: (_, attribs) => { + if (attribs['data-level']) return { tagName: 'h1', attribs } return { tagName: 'h3', attribs: { ...attribs, 'data-level': '1' } } }, h2: (_, attribs) => { + if (attribs['data-level']) return { tagName: 'h2', attribs } return { tagName: 'h4', attribs: { ...attribs, 'data-level': '2' } } }, h3: (_, attribs) => { - if (attribs['data-level']) return { tagName: 'h3', attribs: attribs } + if (attribs['data-level']) return { tagName: 'h3', attribs } return { tagName: 'h5', attribs: { ...attribs, 'data-level': '3' } } }, h4: (_, attribs) => { - if (attribs['data-level']) return { tagName: 'h4', attribs: attribs } + if (attribs['data-level']) return { tagName: 'h4', attribs } return { tagName: 'h6', attribs: { ...attribs, 'data-level': '4' } } }, h5: (_, attribs) => { - if (attribs['data-level']) return { tagName: 'h5', attribs: attribs } + if (attribs['data-level']) return { tagName: 'h5', attribs } return { tagName: 'h6', attribs: { ...attribs, 'data-level': '5' } } }, h6: (_, attribs) => { - if (attribs['data-level']) return { tagName: 'h6', attribs: attribs } + if (attribs['data-level']) return { tagName: 'h6', attribs } return { tagName: 'h6', attribs: { ...attribs, 'data-level': '6' } } }, img: (tagName, attribs) => { @@ -562,6 +640,11 @@ ${html} } return { tagName, attribs } }, + // Markdown links are fully processed in renderer.link (single-pass). + // However, inline HTML tags inside paragraphs are NOT seen by + // renderer.html (marked parses them as paragraph tokens, not html tokens). + // So we still need to collect playground links here for those cases. + // The seenUrls set ensures no duplicates across both paths. a: (tagName, attribs) => { if (!attribs.href) { return { tagName, attribs } @@ -569,24 +652,22 @@ ${html} const resolvedHref = resolveUrl(attribs.href, packageName, repoInfo) + // Collect playground links from inline HTML tags that weren't + // caught by renderer.link or renderer.html const provider = matchPlaygroundProvider(resolvedHref) if (provider && !seenUrls.has(resolvedHref)) { seenUrls.add(resolvedHref) - collectedLinks.push({ url: resolvedHref, provider: provider.id, providerName: provider.name, - /** - * We need to set some data attribute before hand because `transformTags` doesn't - * provide the text of the element. This will automatically be removed, because there - * is an allow list for link attributes. - * */ - label: decodeHtmlEntities(attribs['data-title-intermediate'] || provider.name), + // sanitize-html transformTags doesn't provide element text content, + // so we fall back to the provider name for the label + label: provider.name, }) } - // Add security attributes for external links + // Add security attributes for external links (idempotent) if (resolvedHref && hasProtocol(resolvedHref, { acceptRelative: true })) { attribs.rel = 'nofollow noreferrer noopener' attribs.target = '_blank' diff --git a/test/unit/server/utils/readme.spec.ts b/test/unit/server/utils/readme.spec.ts index 479bfae4a..099459754 100644 --- a/test/unit/server/utils/readme.spec.ts +++ b/test/unit/server/utils/readme.spec.ts @@ -561,4 +561,247 @@ describe('HTML output', () => {

Some bold text and a link.

`) }) + + it('adds id to raw HTML headings', async () => { + const result = await renderReadmeHtml('

Title

', 'test-pkg') + expect(result.html).toContain('id="user-content-title"') + expect(result.toc).toHaveLength(1) + expect(result.toc[0]).toMatchObject({ text: 'Title', depth: 1 }) + }) + + it('adds id to HTML heading in multi-element token', async () => { + const md = '

My Package

\n

A description

' + const result = await renderReadmeHtml(md, 'test-pkg') + expect(result.toc).toHaveLength(1) + expect(result.html).toContain('id="user-content-my-package"') + expect(result.toc[0]).toMatchObject({ text: 'My Package', depth: 1 }) + }) + + it('handles duplicate raw HTML heading slugs', async () => { + const md = '

API

\n\n

API

' + const result = await renderReadmeHtml(md, 'test-pkg') + expect(result.html).toContain('id="user-content-api"') + expect(result.html).toContain('id="user-content-api-1"') + }) + + it('preserves supported attributes on raw HTML headings', async () => { + const md = '

My Package

' + const result = await renderReadmeHtml(md, 'test-pkg') + + expect(result.html).toContain('id="user-content-my-package"') + expect(result.html).toContain('align="center"') + }) + + it('preserves supported attributes on rewritten raw HTML anchors (renderer.html path)', async () => { + const md = [ + '
', + ' Open in StackBlitz', + '
', + ].join('\n') + const result = await renderReadmeHtml(md, 'test-pkg') + + expect(result.html).toContain('href="https://stackblitz.com/edit/my-demo"') + expect(result.html).toContain('title="Open demo"') + expect(result.html).toContain('rel="nofollow noreferrer noopener"') + expect(result.html).toContain('target="_blank"') + }) + + it('preserves title when it appears before href (renderer.html path)', async () => { + const md = [ + '
', + ' Open in StackBlitz', + '
', + ].join('\n') + const result = await renderReadmeHtml(md, 'test-pkg') + + expect(result.html).toContain('title="Open demo"') + expect(result.html).toContain('href="https://stackblitz.com/edit/my-demo"') + expect(result.html).toContain('rel="nofollow noreferrer noopener"') + expect(result.html).toContain('target="_blank"') + }) + + it('overrides existing rel and target instead of duplicating them (renderer.html path)', async () => { + const md = [ + '
', + ' Open in StackBlitz', + '
', + ].join('\n') + const result = await renderReadmeHtml(md, 'test-pkg') + + expect(result.html).toContain('rel="nofollow noreferrer noopener"') + expect(result.html).toContain('target="_blank"') + expect(result.html).not.toContain('rel="bookmark"') + expect(result.html).not.toContain('target="_self"') + }) +}) + +/** + * Tests for issue #1323: single-pass markdown rendering behavior. + * + * The core concern is that mixing markdown headings and raw HTML headings + * must produce TOC entries, heading IDs, and duplicate-slug suffixes in + * exact document order — the same as GitHub does. + * + * If the implementation processes markdown headings in one pass and HTML + * headings in a separate (later) pass, the ordering will be wrong. + */ +describe('Issue #1323 — single-pass rendering correctness', () => { + describe('mixed markdown + HTML headings: TOC order and IDs', () => { + it('produces TOC entries in document order when markdown and HTML headings are interleaved', async () => { + // This is the core scenario from the issue: HTML headings appear + // between markdown headings. A two-pass approach would collect all + // markdown headings first, then HTML headings — scrambling the order. + const md = [ + '# First (markdown)', + '', + '

Second (html)

', + '', + '## Third (markdown)', + '', + '

Fourth (html)

', + '', + '## Fifth (markdown)', + ].join('\n') + + const result = await renderReadmeHtml(md, 'test-pkg') + + // TOC must reflect exact document order + expect(result.toc).toHaveLength(5) + expect(result.toc[0]!.text).toBe('First (markdown)') + expect(result.toc[1]!.text).toBe('Second (html)') + expect(result.toc[2]!.text).toBe('Third (markdown)') + expect(result.toc[3]!.text).toBe('Fourth (html)') + expect(result.toc[4]!.text).toBe('Fifth (markdown)') + }) + + it('assigns duplicate-slug suffixes in document order across mixed heading types', async () => { + // Two markdown "API" headings with an HTML "API" heading in between. + // Correct: api, api-1, api-2 in that order. + // If HTML headings are processed in a separate pass, the HTML one + // could get suffix -2 while the last markdown one gets -1. + const md = ['## API', '', '

API

', '', '## API'].join('\n') + + const result = await renderReadmeHtml(md, 'test-pkg') + + expect(result.toc).toHaveLength(3) + expect(result.toc[0]!.id).toBe('user-content-api') + expect(result.toc[1]!.id).toBe('user-content-api-1') + expect(result.toc[2]!.id).toBe('user-content-api-2') + + // The HTML output must also have these IDs in order + const ids = Array.from(result.html.matchAll(/id="(user-content-api(?:-\d+)?)"/g), m => m[1]) + expect(ids).toEqual(['user-content-api', 'user-content-api-1', 'user-content-api-2']) + }) + + it('does not collide when heading text already starts with user-content-', async () => { + const md = ['# Title', '', '# user-content-title'].join('\n') + + const result = await renderReadmeHtml(md, 'test-pkg') + + const ids = Array.from(result.html.matchAll(/id="(user-content-[^"]+)"/g), m => m[1]) + expect(ids).toEqual(['user-content-title', 'user-content-user-content-title']) + expect(new Set(ids).size).toBe(ids.length) + expect(result.toc.map(t => t.id)).toEqual(ids) + }) + + it('heading semantic levels are sequential even when mixing heading types', async () => { + // h1 (md) → h3, h3 (html) → should be h4 (max = lastSemantic + 1), + // not jump to h5 or h6 because it was processed in a later pass. + const md = ['# Title', '', '

Subsection

', '', '#### Deep'].join('\n') + + const result = await renderReadmeHtml(md, 'test-pkg') + + // Extract semantic tags in order from the HTML + const tags = Array.from(result.html.matchAll(/ Number(m[1])) + // h1→h3, h3→h4 (sequential after h3), h4→h5 (sequential after h4) + expect(tags).toEqual([3, 4, 5]) + }) + }) + + describe('HTML heading between markdown headings — ID and href consistency', () => { + it('heading id and its anchor href point to the same slug', async () => { + const md = ['# Introduction', '', '

Getting Started

', '', '## Installation'].join( + '\n', + ) + + const result = await renderReadmeHtml(md, 'test-pkg') + + // For every heading, the slug used in id="user-content-{slug}" must + // match the slug in the child anchor href="#user-content-{slug}" + // (resolveUrl prefixes # anchors with user-content-). + const headingPairs = [ + ...result.html.matchAll(/id="user-content-([^"]+)"[^>]*>/g), + ] + expect(headingPairs.length).toBeGreaterThan(0) + for (const match of headingPairs) { + // slug portion must be identical + expect(match[1]).toBe(match[2]) + } + }) + }) + + describe('playground links collected from HTML tags in single pass', () => { + it('collects playground links from raw HTML anchor tags', async () => { + // Some READMEs (like eslint's sponsor section) use raw HTML tags + // instead of markdown link syntax. These must also be picked up. + const md = [ + '# My Package', + '', + 'Open in StackBlitz', + '', + 'Some text with a [CodeSandbox link](https://codesandbox.io/s/example)', + ].join('\n') + + const result = await renderReadmeHtml(md, 'test-pkg') + + // Both playground links should be collected regardless of syntax + const providers = result.playgroundLinks.map(l => l.provider) + expect(providers).toContain('stackblitz') + expect(providers).toContain('codesandbox') + }) + }) + + describe('complex real-world interleaving (atproxy-like)', () => { + it('handles a README with HTML h1 followed by markdown h2 and mixed content', async () => { + // Simulates a pattern like atproxy's README where h1 is HTML + // and subsequent headings are markdown + const md = [ + '

atproxy

', + '

A cool proxy library

', + '', + '## Features', + '', + '- Fast', + '- Simple', + '', + '## Installation', + '', + '```bash', + 'npm install atproxy', + '```', + '', + '

Advanced Usage

', + '', + '## API', + '', + '### Methods', + ].join('\n') + + const result = await renderReadmeHtml(md, 'test-pkg') + + // TOC order must be: atproxy, Features, Installation, Advanced Usage, API, Methods + expect(result.toc.map(t => t.text)).toEqual([ + 'atproxy', + 'Features', + 'Installation', + 'Advanced Usage', + 'API', + 'Methods', + ]) + + // All IDs should be unique + const ids = result.toc.map(t => t.id) + expect(new Set(ids).size).toBe(ids.length) + }) + }) })