
Commit 314b663

Merge pull request #89 from EnsembleUI/generate-llms
Add functionality to extract descriptions and generate documentation …
2 parents bbe2d49 + 7b071bc commit 314b663

1 file changed (+243 -10 lines)

scripts/merge-docs.py

Lines changed: 243 additions & 10 deletions
@@ -50,6 +50,59 @@ def get_first_heading(file_path):
                break
     return heading_text
 
+def get_description_content(file_path):
+    """Extract description content from index file, starting from 'What is Ensemble?' section."""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        in_frontmatter = False
+        found_what_is = False
+        description_lines = []
+        for line in f:
+            line_stripped = line.strip()
+
+            # Skip frontmatter
+            if line_stripped.startswith("---"):
+                in_frontmatter = not in_frontmatter
+                continue
+            if in_frontmatter:
+                continue
+
+            # Skip HTML comments
+            if line_stripped.startswith("<!--"):
+                continue
+
+            # Look for "What is Ensemble?" section
+            if line_stripped.startswith("## What is Ensemble?"):
+                found_what_is = True
+                continue
+
+            # If we found the section, collect content until next section or end
+            if found_what_is:
+                if line_stripped.startswith("##"):  # Next section starts
+                    break
+                if line_stripped:  # Non-empty line
+                    description_lines.append(line_stripped)
+
+    return " ".join(description_lines) if description_lines else None
+
+def get_full_content(file_path):
+    """Extract the full content from a file, skipping YAML frontmatter."""
+    with open(file_path, 'r', encoding='utf-8') as f:
+        content_lines = []
+        in_frontmatter = False
+        for line in f:
+            if line.strip().startswith("---"):
+                in_frontmatter = not in_frontmatter
+                continue
+            if in_frontmatter:
+                continue
+            content_lines.append(line.rstrip())
+
+    # Join and clean up the content
+    content = '\n'.join(content_lines).strip()
+    # Remove any remaining HTML comments
+    content = re.sub(r'<!--.*?-->', '', content, flags=re.DOTALL)
+    return content
+
 def resolve_entry_path(dir_path, name):
     """
     Given a directory and an entry name from _meta.json, try to resolve it to an actual file or directory.
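
A minimal usage sketch of the two new helpers follows. The sample file contents, the temporary-file setup, and the expected output are illustrative only (they are not part of this commit), and the sketch assumes get_description_content, get_full_content, and re are importable from the script.

import os
import tempfile

# Illustrative index-style content; any page with a "What is Ensemble?" section works.
sample = """---
title: Home
---
<!-- hero banner -->
## What is Ensemble?
Ensemble builds apps from a declarative definition.
## Next steps
See the getting-started guide.
"""

with tempfile.NamedTemporaryFile("w", suffix=".mdx", delete=False) as tmp:
    tmp.write(sample)

# Collapses the "What is Ensemble?" body into a single description line.
print(get_description_content(tmp.name))
# -> Ensemble builds apps from a declarative definition.

# Returns the whole page with YAML frontmatter and HTML comments stripped.
print(get_full_content(tmp.name))

os.unlink(tmp.name)
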
@@ -71,16 +124,26 @@ def process_dir(dir_path, skip_index=False):
     """
     meta_file = os.path.join(dir_path, "_meta.json")
     entries = []
+    meta_descriptions = {}
+
     if os.path.isfile(meta_file):
         with open(meta_file, 'r', encoding='utf-8') as f:
             meta = json.load(f)
         for key, val in meta.items():
-            entries.append((key, val))
+            if isinstance(val, dict) and 'description' in val:
+                # Store description for this key
+                meta_descriptions[key] = val['description']
+                # Extract title if it exists
+                title = val.get('title', key)
+                entries.append((key, title))
+            else:
+                entries.append((key, val))
     else:
         for name in sorted(os.listdir(dir_path)):
             if name.startswith('_'):
                 continue
             entries.append((name, None))
+
     nodes = []
     for name, title in entries:
         resolved = resolve_entry_path(dir_path, name)
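
The hunk above changes how _meta.json entries are parsed: plain string values keep the old behaviour, while object values contribute both a title and a description. A self-contained sketch of that branch, using made-up keys and text:

import json

meta = json.loads("""
{
  "getting-started": {"title": "Getting Started", "description": "Install and run your first app."},
  "widgets": "Widgets"
}
""")

entries = []
meta_descriptions = {}
for key, val in meta.items():
    if isinstance(val, dict) and 'description' in val:
        meta_descriptions[key] = val['description']
        entries.append((key, val.get('title', key)))
    else:
        entries.append((key, val))

print(entries)            # [('getting-started', 'Getting Started'), ('widgets', 'Widgets')]
print(meta_descriptions)  # {'getting-started': 'Install and run your first app.'}
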
@@ -92,6 +155,9 @@ def process_dir(dir_path, skip_index=False):
         if base in ("training-videos.md", "training-videos.mdx"):
             continue
 
+        # Get description from _meta.json if available
+        meta_description = meta_descriptions.get(name)
+
         if os.path.isfile(resolved):
             if skip_index and base in ("index.md", "index.mdx"):
                 continue
@@ -101,10 +167,12 @@ def process_dir(dir_path, skip_index=False):
             page_title = heading_text if heading_text else name
             if heading_text is None:
                 heading_text = page_title
+
             nodes.append({
                 "title": page_title,
                 "path": resolved,
-                "heading": heading_text
+                "heading": heading_text,
+                "meta_description": meta_description  # Only from _meta.json
             })
         elif os.path.isdir(resolved):
             group_title = title if isinstance(title, str) else name
@@ -117,8 +185,10 @@ def process_dir(dir_path, skip_index=False):
             if index_node:
                 group_node["index_path"] = index_node["path"]
                 group_node["heading"] = index_node.get("heading", group_title)
+                group_node["meta_description"] = meta_description  # Only from _meta.json
             else:
                 group_node["heading"] = group_title
+                group_node["meta_description"] = meta_description  # Only from _meta.json
             nodes.append(group_node)
     return nodes
 
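
With the edits above, every node produced by process_dir() now carries a meta_description key that comes only from _meta.json (it stays None when no description is declared there). The resulting file-node shape, with illustrative values:

node = {
    "title": "Deploy to web",                 # first heading of the page, or the entry name
    "path": "pages/deploy/web.md",
    "heading": "Deploy to web",
    "meta_description": "Publish your app as a web build.",  # None if _meta.json has no description
}
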
@@ -145,6 +215,114 @@ def generate_toc(nodes, depth=0):
         toc_lines.append(f"{indent}- [{title}](#{anchor})")
     return toc_lines
 
+def generate_llms_toc(nodes, base_url="https://docs.ensembleui.com"):
+    """Generate table of contents in llms.txt format (Cursor style)."""
+    lines = []
+
+    for node in nodes:
+        if "children" in node:
+            # This is a section/group
+            section_title = to_sentence_case(node["title"])
+
+            # Add index page if it exists
+            if node.get("index_path"):
+                title = node.get("heading", section_title)
+                rel_path = os.path.relpath(node["index_path"], "pages")
+                url_path = rel_path.replace("\\", "/").replace(".mdx", "").replace(".md", "")
+                if url_path == "index":
+                    url_path = ""
+                elif url_path.endswith("/index"):
+                    url_path = url_path[:-6]
+
+                url = f"{base_url}/{url_path}" if url_path else base_url
+                meta_description = node.get("meta_description")
+                if meta_description:
+                    lines.append(f"- [{title}]({url}): {meta_description}")
+                else:
+                    lines.append(f"- [{title}]({url})")
+
+            # Add child pages
+            for child in node["children"]:
+                if "path" in child:
+                    title = child["title"]
+                    rel_path = os.path.relpath(child["path"], "pages")
+                    url_path = rel_path.replace("\\", "/").replace(".mdx", "").replace(".md", "")
+                    url = f"{base_url}/{url_path}"
+                    meta_description = child.get("meta_description")
+                    if meta_description:
+                        lines.append(f"- [{title}]({url}): {meta_description}")
+                    else:
+                        lines.append(f"- [{title}]({url})")
+        else:
+            # This is a standalone page at root level
+            title = node["title"]
+            rel_path = os.path.relpath(node["path"], "pages")
+            url_path = rel_path.replace("\\", "/").replace(".mdx", "").replace(".md", "")
+            url = f"{base_url}/{url_path}"
+            meta_description = node.get("meta_description")
+            if meta_description:
+                lines.append(f"- [{title}]({url}): {meta_description}")
+            else:
+                lines.append(f"- [{title}]({url})")
+
+    return lines
+
+def collect_all_pages(nodes):
+    """Collect all pages from the structure for full content generation."""
+    pages = []
+
+    for node in nodes:
+        if "children" in node:
+            # Add index page if it exists
+            if node.get("index_path"):
+                pages.append({
+                    "title": node.get("heading", node["title"]),
+                    "path": node["index_path"],
+                    "url_path": get_url_path(node["index_path"])
+                })
+
+            # Add child pages
+            pages.extend(collect_all_pages(node["children"]))
+        else:
+            # This is a standalone page
+            pages.append({
+                "title": node["title"],
+                "path": node["path"],
+                "url_path": get_url_path(node["path"])
+            })
+
+    return pages
+
+def get_url_path(file_path):
+    """Convert file path to URL path."""
+    rel_path = os.path.relpath(file_path, "pages")
+    url_path = rel_path.replace("\\", "/").replace(".mdx", "").replace(".md", "")
+    if url_path == "index":
+        url_path = ""
+    elif url_path.endswith("/index"):
+        url_path = url_path[:-6]
+    return url_path
+
+def generate_full_docs(pages, base_url="https://docs.ensembleui.com"):
+    """Generate full documentation content in llms-full.txt format (Cursor style)."""
+    content_blocks = []
+
+    for page in pages:
+        title = page["title"]
+        file_path = page["path"]
+        url_path = page["url_path"]
+
+        url = f"{base_url}/{url_path}" if url_path else base_url
+
+        # Get full content
+        full_content = get_full_content(file_path)
+
+        # Format as Cursor does: # Title \n Source: URL \n Content
+        block = f"# {title}\nSource: {url}\n{full_content}"
+        content_blocks.append(block)
+
+    return content_blocks
+
 def clean_content(lines):
     """
     Clean the content lines by:
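
To show what generate_llms_toc() emits, here is a sketch over a hand-built structure; the titles, paths, and descriptions are invented, and it assumes the functions above (including to_sentence_case) are in scope:

structure = [
    {
        "title": "deploy",
        "heading": "Deploy",
        "index_path": "pages/deploy/index.mdx",
        "meta_description": "Ship Ensemble apps to app stores and the web.",
        "children": [
            {
                "title": "Deploy to web",
                "path": "pages/deploy/web.md",
                "heading": "Deploy to web",
                "meta_description": "Publish your app as a web build.",
            },
        ],
    },
]

for line in generate_llms_toc(structure):
    print(line)
# - [Deploy](https://docs.ensembleui.com/deploy): Ship Ensemble apps to app stores and the web.
# - [Deploy to web](https://docs.ensembleui.com/deploy/web): Publish your app as a web build.
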
@@ -243,14 +421,6 @@ def collect_content(nodes, level=1):
         lines.append("")
     return lines
 
-# Base directory settings
-repo_root = os.getcwd()
-pages_dir = os.path.join(repo_root, "pages")
-
-# Process the pages directory.
-structure = process_dir(pages_dir, skip_index=True)
-
-# Read the root index.mdx content to place it at the beginning.
 def resolve_entry_path_custom(dir_path, name):
     """Helper to resolve an index entry from the given dir."""
     for candidate in [name, name + ".md", name + ".mdx"]:
@@ -259,13 +429,39 @@ def resolve_entry_path_custom(dir_path, name):
             return path
     return None
 
+# Base directory settings
+repo_root = os.getcwd()
+pages_dir = os.path.join(repo_root, "pages")
+public_dir = os.path.join(repo_root, "public")
+
+# Ensure public directory exists
+os.makedirs(public_dir, exist_ok=True)
+
+# Process the pages directory.
+structure = process_dir(pages_dir, skip_index=True)
+
+# Read the root index.mdx content to place it at the beginning.
 index_path = resolve_entry_path_custom(pages_dir, "index")
 index_lines = []
+main_title = "Ensemble"
+main_description = "Documentation for the Ensemble platform"
+
 if index_path and os.path.isfile(index_path):
     with open(index_path, 'r', encoding='utf-8') as f:
         raw_index = f.read().splitlines()
     index_lines = clean_content(raw_index)
+
+    # Extract title for llms.txt
+    title_from_index = get_first_heading(index_path)
+    if title_from_index:
+        main_title = title_from_index
+
+    # Extract description for llms.txt
+    description_content = get_description_content(index_path)
+    if description_content:
+        main_description = description_content
 
+# Generate README.md
 # Assemble the final README content.
 output_lines = []
 if index_lines:
@@ -287,3 +483,40 @@ def resolve_entry_path_custom(dir_path, name):
     out_file.write("\n".join(output_lines))
 
 print("Merged documentation written to README.md")
+
+# Generate llms.txt (table of contents)
+toc_lines = []
+toc_lines.append(f"# {main_title}")
+toc_lines.append("")
+toc_lines.append(f"{main_description}")
+toc_lines.append("")
+toc_lines.append("## Docs")
+toc_lines.append("")
+
+# Generate TOC links
+toc_content = generate_llms_toc(structure)
+toc_lines.extend(toc_content)
+
+# Add optional section at the end (like Cursor does)
+toc_lines.append("")
+toc_lines.append("## Optional")
+toc_lines.append("")
+toc_lines.append("- [Website](https://ensembleui.com/)")
+toc_lines.append("- [Ensemble Studio](https://studio.ensembleui.com/)")
+
+# Write llms.txt
+llms_txt_path = os.path.join(public_dir, "llms.txt")
+with open(llms_txt_path, 'w', encoding='utf-8') as f:
+    f.write("\n".join(toc_lines))
+
+# Generate llms-full.txt (full content)
+all_pages = collect_all_pages(structure)
+full_content_blocks = generate_full_docs(all_pages)
+
+llms_full_txt_path = os.path.join(public_dir, "llms-full.txt")
+with open(llms_full_txt_path, 'w', encoding='utf-8') as f:
+    f.write("\n".join(full_content_blocks))
+
+print(f"Generated {llms_txt_path} successfully!")
+print(f"Generated {llms_full_txt_path} successfully!")
+print(f"Total pages in full docs: {len(all_pages)}")
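
Taken together, the script now writes two new artifacts into public/ alongside README.md: llms.txt, a linked table of contents, and llms-full.txt, one "# Title / Source: URL / body" block per page. A rough sketch of the llms.txt layout (the Docs entry is invented; the defaults and the Optional links come from the code above):

llms_txt_sketch = """# Ensemble

Documentation for the Ensemble platform

## Docs

- [Deploy](https://docs.ensembleui.com/deploy): Ship Ensemble apps to app stores and the web.

## Optional

- [Website](https://ensembleui.com/)
- [Ensemble Studio](https://studio.ensembleui.com/)"""

# Each llms-full.txt block then follows this pattern:
#   # <page title>
#   Source: https://docs.ensembleui.com/<url_path>
#   <page body with frontmatter and HTML comments stripped>
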
