diff --git a/package.json b/package.json index 5e53637..eb507ba 100644 --- a/package.json +++ b/package.json @@ -1,8 +1,20 @@ { - "name": "powertools-mcp", - "version": "1.0.0", + "name": "@serverless-dna/powertools-mcp", + "version": "0.2.0", "description": "Powertools for AWS Lambda Documentation MCP Server", "main": "dist/bundle.js", + "bin": { + "powertools-mcp": "dist/bundle.js" + }, + "files": [ + "dist/", + "indexes/", + "README.md", + "LICENSE" + ], + "publishConfig": { + "access": "public" + }, "repository": { "type": "git", "url": "https://github.com/serverless-dna/powertools-mcp.git" @@ -14,6 +26,7 @@ "scripts": { "prebuild": "rimraf dist/* && pnpm lint", "build": "rollup -c", + "postbuild": "chmod +x dist/bundle.js", "test": "jest", "lint": "eslint --config eslint.config.mjs", "test:ci": "jest --ci", @@ -21,8 +34,16 @@ "postversion": "pnpm build", "release": "semantic-release" }, - "keywords": [], - "author": "", + "keywords": [ + "aws", + "lambda", + "powertools", + "documentation", + "mcp", + "model-context-protocol", + "llm" + ], + "author": "Serverless DNA", "license": "ISC", "packageManager": "pnpm@10.8.0", "dependencies": { @@ -33,6 +54,7 @@ "html-to-markdown": "^1.0.0", "lunr": "^2.3.9", "lunr-languages": "^1.14.0", + "turndown": "^7.2.0", "zod": "^3.24.3", "zod-to-json-schema": "^3.24.5" }, @@ -47,6 +69,7 @@ "@semantic-release/github": "^11.0.1", "@types/jest": "^29.5.14", "@types/lunr": "^2.3.7", + "@types/turndown": "^5.0.5", "@typescript-eslint/eslint-plugin": "^8.30.1", "@typescript-eslint/parser": "^8.30.1", "eslint": "^9.25.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index bde6972..d1c3703 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -29,6 +29,9 @@ importers: lunr-languages: specifier: ^1.14.0 version: 1.14.0 + turndown: + specifier: ^7.2.0 + version: 7.2.0 zod: specifier: ^3.24.3 version: 3.24.3 @@ -66,6 +69,9 @@ importers: '@types/lunr': specifier: ^2.3.7 version: 2.3.7 + '@types/turndown': + specifier: ^5.0.5 + version: 
5.0.5 '@typescript-eslint/eslint-plugin': specifier: ^8.30.1 version: 8.30.1(@typescript-eslint/parser@8.30.1(eslint@9.25.0)(typescript@5.8.3))(eslint@9.25.0)(typescript@5.8.3) @@ -836,6 +842,9 @@ packages: '@types/stack-utils@2.0.3': resolution: {integrity: sha512-9aEbYZ3TbYMznPdcdr3SmIrLXwC/AKZXQeCf9Pgao5CKb8CyHuEX5jzWPTkvregvhRJHcpRO6BFoGW9ycaOkYw==} + '@types/turndown@5.0.5': + resolution: {integrity: sha512-TL2IgGgc7B5j78rIccBtlYAnkuv8nUQqhQc+DSYV5j9Be9XOcm/SKOVRuA47xAVI3680Tk9B1d8flK2GWT2+4w==} + '@types/yargs-parser@21.0.3': resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==} @@ -3495,6 +3504,9 @@ packages: tslib@2.8.1: resolution: {integrity: sha512-oJFu94HQb+KVduSUQL7wnpmqnfmLsOA/nAh6b6EH0wCEoK0/mPeXU6c3wKDV83MkOuHPRHtSXKKU99IBazS/2w==} + turndown@7.2.0: + resolution: {integrity: sha512-eCZGBN4nNNqM9Owkv9HAtWRYfLA4h909E/WGAWWBpmB275ehNhZyk87/Tpvjbp0jjNl9XwCsbe6bm6CqFsgD+A==} + type-check@0.4.0: resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==} engines: {node: '>= 0.8.0'} @@ -4588,6 +4600,8 @@ snapshots: '@types/stack-utils@2.0.3': {} + '@types/turndown@5.0.5': {} + '@types/yargs-parser@21.0.3': {} '@types/yargs@17.0.33': @@ -7578,6 +7592,10 @@ snapshots: tslib@2.8.1: {} + turndown@7.2.0: + dependencies: + '@mixmark-io/domino': 2.2.0 + type-check@0.4.0: dependencies: prelude-ls: 1.2.1 diff --git a/rollup.config.js b/rollup.config.js index ca41951..0a9f683 100644 --- a/rollup.config.js +++ b/rollup.config.js @@ -2,13 +2,15 @@ const commonjs = require('@rollup/plugin-commonjs'); const json = require('@rollup/plugin-json'); const resolve = require('@rollup/plugin-node-resolve'); const typescript = require('@rollup/plugin-typescript'); +const terser = require('@rollup/plugin-terser'); module.exports = { input: 'src/index.ts', output: { file: 'dist/bundle.js', format: 'cjs', - sourcemap: true + sourcemap: true, + banner: 
/**
 * Build the Turndown service used to convert documentation HTML to Markdown.
 *
 * The service is configured for ATX headings, fenced code blocks, `*` for
 * emphasis and bullets, and `**` for strong text. Two custom rules are added:
 *
 * - `fencedCodeBlock`: a `<pre>` whose first child is `<code>` becomes a fenced
 *   block, tagged with the language found in a `language-xxx` class if present.
 * - `tableRule`: a non-empty `<table>` is emitted as raw HTML so that its
 *   row/column structure survives; an empty table is dropped.
 *
 * @returns Configured Turndown service
 */
function configureTurndown(): TurndownService {
  const turndownService = new TurndownService({
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
    emDelimiter: '*',
    bulletListMarker: '*',
    strongDelimiter: '**'
  });

  // Improve code block handling
  turndownService.addRule('fencedCodeBlock', {
    filter: (node): boolean =>
      node.nodeName === 'PRE' &&
      node.firstChild !== null &&
      node.firstChild.nodeName === 'CODE',
    replacement: (_content, node) => {
      const code = node.firstChild as HTMLElement;
      const className = code.getAttribute('class') ?? '';
      const language = className.match(/language-(\w+)/)?.[1] ?? '';
      // textContent is nullable; never interpolate the literal string "null"
      const source = code.textContent ?? '';
      return `\n\`\`\`${language}\n${source}\n\`\`\`\n\n`;
    }
  });

  // Improve table handling
  turndownService.addRule('tableRule', {
    filter: 'table',
    replacement: (content, node) => {
      // `content` is the already-converted text of the cells with all table
      // structure lost, so it is only used to detect an empty table.
      if (!content.trim()) {
        return '';
      }
      // Keep the table as HTML rather than flattening it into plain text.
      return `\n\n${(node as HTMLElement).outerHTML}\n\n`;
    }
  });

  return turndownService;
}

/**
 * Extract the title and main content from an HTML string using domino.
 *
 * `<script>` and `<style>` elements are removed first. The title is the
 * trimmed text of the first `<h1>`, falling back to `<title>`, or `''` when
 * neither exists. The content is the inner HTML of
 * `div.md-content[data-md-component="content"]`, falling back to `<body>`.
 *
 * When the title came from an `<h1>` located inside the content container,
 * that `<h1>` is removed from the returned content: fetchDocPage prepends
 * `# title` itself, so leaving it in place would render the heading twice.
 *
 * @param html The HTML string to process
 * @returns Object containing the title text and the main content as an HTML string
 */
function extractContent(html: string): { title: string, content: string } {
  // Create a DOM document using domino
  const doc = domino.createDocument(html);

  // Remove script and style tags
  const discarded = doc.querySelectorAll('script, style');
  discarded.forEach((node: Element) => node.parentNode?.removeChild(node));

  // Get the title
  const heading = doc.querySelector('h1');
  const titleElement = heading || doc.querySelector('title');
  const title: string = titleElement?.textContent?.trim() ?? '';

  // Extract the main content - specifically target the md-content container,
  // falling back to body when the container is absent
  const mainContent = doc.querySelector('div.md-content[data-md-component="content"]');
  const contentElement = mainContent || doc.body;

  // `heading` is the first <h1> in the document, so it lies inside the content
  // container exactly when it is also the first <h1> found from that container.
  if (title && heading && contentElement.querySelector('h1') === heading) {
    heading.parentNode?.removeChild(heading);
  }

  return {
    title,
    content: contentElement.innerHTML
  };
}
Process the stream in chunks - while (true) { - const { done, value } = await reader.read(); - - if (done) { - break; - } - - // Decode the chunk and append to HTML - const chunk = decoder.decode(value, { stream: true }); - html += chunk; - - // If we have enough HTML to start processing, do it in chunks - if (html.length > CHUNK_SIZE_THRESHOLD && html.includes('')) { - // Process this chunk - const $ = cheerio.load(html); - - // Remove script and style tags - $('script, style').remove(); - - // Extract the main content - specifically target the md-content container - const mainContent = $('div.md-content[data-md-component="content"]'); - const contentToProcess = mainContent.length > 0 ? mainContent : $('body'); - - // Process title only once - if (markdown === '') { - const title = $('h1').first().text().trim() || $('title').text().trim(); - if (title) { - markdown += `# ${title}\n\n`; - } - } - - // Process elements in batches - const elements = contentToProcess.children().toArray(); - markdown += await processDomElementsInBatches($, elements); - - // Reset HTML buffer to avoid reprocessing - html = ''; - } + // Add title if available + if (title) { + markdown = `# ${title}\n\n`; } - // Process any remaining HTML - if (html.length > 0) { - const $ = cheerio.load(html); - - // Remove script and style tags - $('script, style').remove(); - - // Extract the main content - specifically target the md-content container - const mainContent = $('div.md-content[data-md-component="content"]'); - const contentToProcess = mainContent.length > 0 ? 
mainContent : $('body'); - - // Process title if we haven't yet - if (markdown === '') { - const title = $('h1').first().text().trim() || $('title').text().trim(); - if (title) { - markdown += `# ${title}\n\n`; - } - } - - // Process elements in batches - const elements = contentToProcess.children().toArray(); - markdown += await processDomElementsInBatches($, elements); - } + // Convert the main content to Markdown + markdown += turndownService.turndown(content); // If we didn't extract much structured content, fall back to text if (markdown.length < 100) { - const $ = cheerio.load(html); - const bodyText = $('body').text().trim(); + const doc = domino.createDocument(html); + const bodyText = doc.body.textContent?.trim() || ''; if (bodyText) { markdown = bodyText; }