Skip to content

find uningested docs sites #753

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions packages/ingest-mongodb-public/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
"node": ">=18",
"npm": ">=8"
},
"module": "./build/index.js",
"main": "./build/index.js",
"types": "./build/index.d.ts",
"scripts": {
"preinstall": "npx playwright install chromium --with-deps",
"clean": "rm -rf build",
Expand Down
1 change: 1 addition & 0 deletions packages/ingest-mongodb-public/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from "./sources";
1 change: 0 additions & 1 deletion packages/ingest-mongodb-public/src/modules.d.ts

This file was deleted.

10 changes: 7 additions & 3 deletions packages/ingest-mongodb-public/src/sources/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@ import {
MakeMongoDbUniversityDataSourceParams,
makeMongoDbUniversityDataSource,
} from "./mongodb-university";
const { DEVCENTER_CONNECTION_URI, UNIVERSITY_DATA_API_KEY } = assertEnvVars(
PUBLIC_INGEST_ENV_VARS
);
import {
getUrlsFromSitemap,
initialWebSources,
Expand All @@ -38,6 +35,13 @@ import {
} from "./mongodbDotCom";
import { chromium } from "playwright";

const { DEVCENTER_CONNECTION_URI, UNIVERSITY_DATA_API_KEY } = assertEnvVars(
PUBLIC_INGEST_ENV_VARS
);

export { snootyProjectConfig };
export * from "./snooty/SnootyDataSource";

/**
Async constructor for specific data sources -- parameters baked in.
*/
Expand Down
10 changes: 8 additions & 2 deletions packages/ingest-mongodb-public/src/sources/snootySources.ts
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,14 @@ export const snootyProjectConfig: LocallySpecifiedSnootyProjectConfig[] = [
{
type: "snooty",
name: "mck",
tags: ["docs", "kubernetes", "k8s", "kubernetes-controllers", "kubernetes-operator"],
productName: "MongoDB Controllers for Kubernetes"
tags: [
"docs",
"kubernetes",
"k8s",
"kubernetes-controllers",
"kubernetes-operator",
],
productName: "MongoDB Controllers for Kubernetes",
},
{
type: "snooty",
Expand Down
4 changes: 4 additions & 0 deletions packages/ingest-mongodb-public/src/turndown-plugin-gfm.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Ambient module declaration that gives TypeScript a type for imports of
// "turndown-plugin-gfm".
declare module "turndown-plugin-gfm" {
import { Plugin as TurndownPlugin } from "turndown";
// Declared as a zero-argument function that returns a turndown `Plugin`.
// NOTE(review): the library's `gfm` export may itself be the plugin (passed
// directly to `turndownService.use`) -- confirm this signature against usage.
export function gfm(): TurndownPlugin;
}
5 changes: 4 additions & 1 deletion packages/ingest-mongodb-public/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,8 @@
"compilerOptions": {
"outDir": "./build"
},
"include": ["./src/**/*.ts"]
"include": [
"./src/**/*.ts",
"./src/**/*.d.ts"
]
}
2 changes: 2 additions & 0 deletions packages/scripts/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
"removeTestDatabases": "npm run build && node ./build/removeTestDatabases.js",
"getConversationText": "npm run build && node ./build/getConversationText.js",
"findPageTitles": "npm run build && node ./build/main/findPageTitlesMain.js",
"findUningestedDocsSites": "npm run build && node ./build/findUningestedDocsSites.js",
"listSlackMessages": "npm run build && node ./build/main/listSlackMessagesMain.js",
"removeSlackMessage": "npm run build && node ./build/main/removeSlackMessageMain.js",
"checkUrlsAgainstDB": "npm run build && node ./build/checkUrlsAgainstDB.js",
Expand All @@ -48,6 +49,7 @@
"mongodb-chatbot-server": "*",
"mongodb-rag-core": "*",
"chatbot-server-mongodb-public": "*",
"ingest-mongodb-public": "*",
"yaml": "^2.3.4",
"yargs": "^17.7.2"
},
Expand Down
74 changes: 74 additions & 0 deletions packages/scripts/src/findUningestedDocsSites.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import { type SnootyProject, snootyProjectConfig } from "ingest-mongodb-public";

/**
  Names of docs projects that are deprecated and intentionally not ingested.
 */
const deprecatedProjectNames = [
  "atlas-app-services",
  "atlas-open-service-broker",
  "datalake",
  "guides",
  "realm",
];

/**
  Every project that is deliberately left out of ingestion, with the reason.
  Deprecated projects come first (in the order listed above), followed by
  projects omitted for other reasons.
 */
const omittedProjects = deprecatedProjectNames
  .map((projectName) => ({
    name: projectName,
    deprecated: true,
    note: "Deprecated",
  }))
  .concat([
    {
      name: "mongoid-railsmdb",
      deprecated: false,
      note: "Supposed to be a repo for a new docset but the project got deprioritized so all that's in there right now is a (potentially outdated) Getting Started guide",
    },
  ]);

/**
  Looks up the entry in `omittedProjects` whose `name` equals `projectName`.
  Returns the first matching entry, or `undefined` when there is none.
 */
function getOmittedProject(projectName: string) {
  for (const candidate of omittedProjects) {
    if (candidate.name === projectName) {
      return candidate;
    }
  }
  return undefined;
}

/**
  Fetches the list of docs projects from the Snooty Data API.

  @returns The `data` array of the response body, cast to `SnootyProject[]`.
  Individual elements are not validated.
  @throws If the HTTP response is not OK, or if the response body is not an
  object with an array-valued `data` property.
 */
async function listDocsProjectsFromApi(): Promise<SnootyProject[]> {
  // The trailing slash matters: without it, `new URL("projects", base)`
  // replaces the last path segment ("prod") instead of appending to it.
  const apiBaseUrl = "https://snooty-data-api.mongodb.com/prod/";
  const listProjectsUrl = new URL("projects", apiBaseUrl);
  const response = await fetch(listProjectsUrl);
  if (!response.ok) {
    // Include the numeric status because `statusText` can be empty.
    throw new Error(
      `Failed to list projects: ${response.status} ${response.statusText}`
    );
  }
  const responseBody: unknown = await response.json();
  // Optional chaining keeps a `null` or primitive body from raising a
  // TypeError before we can report a useful message.
  const data = (responseBody as { data?: unknown } | null)?.data;
  if (!Array.isArray(data)) {
    // The second argument of `Error` is an options bag, not extra message
    // text, so the body has to be interpolated into the message itself.
    throw new Error(
      `Invalid response body. Received: ${JSON.stringify(responseBody)}`
    );
  }
  return data as SnootyProject[];
}

/**
  Compares the projects listed by the API against `snootyProjectConfig` and
  groups every project that is not configured for ingestion into one of:

  - `deprecated`: names with an omitted-project entry flagged as deprecated
  - `omitted`: `"<name> :: <note>"` strings for other omitted-project entries
  - `ingestable`: names with no omitted-project entry at all
 */
async function findUningestedDocsSites() {
  const apiProjects = await listDocsProjectsFromApi();
  // The Set de-duplicates project names while preserving first-seen order.
  const candidateNames = new Set(apiProjects.map((p) => p.project));
  const configuredNames = new Set(
    snootyProjectConfig.map((config) => config.name)
  );

  const report: {
    deprecated: string[];
    omitted: string[];
    ingestable: string[];
  } = { deprecated: [], omitted: [], ingestable: [] };

  for (const projectName of Array.from(candidateNames)) {
    // Already configured for ingestion, so there is nothing to report.
    if (configuredNames.has(projectName)) {
      continue;
    }
    const omission = getOmittedProject(projectName);
    if (omission === undefined) {
      report.ingestable.push(projectName);
    } else if (omission.deprecated) {
      report.deprecated.push(projectName);
    } else {
      report.omitted.push(`${omission.name} :: ${omission.note}`);
    }
  }
  return report;
}

// Script entry point: print the categorized report. On failure, log the error
// and set a non-zero exit code so a caller can detect that the script failed.
findUningestedDocsSites()
  .then((projects) => {
    console.log(projects);
  })
  .catch((error: unknown) => {
    console.error("Failed to find uningested docs sites:", error);
    process.exitCode = 1;
  });
4 changes: 4 additions & 0 deletions packages/scripts/src/turndown-plugin-gfm.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Ambient module declaration that gives TypeScript a type for imports of
// "turndown-plugin-gfm".
declare module "turndown-plugin-gfm" {
import { Plugin as TurndownPlugin } from "turndown";
// Declared as a zero-argument function that returns a turndown `Plugin`.
// NOTE(review): the library's `gfm` export may itself be the plugin (passed
// directly to `turndownService.use`) -- confirm this signature against usage.
export function gfm(): TurndownPlugin;
}
6 changes: 5 additions & 1 deletion packages/scripts/tsconfig.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
{
"extends": "../../tsconfig.json",
"compilerOptions": {
"lib": ["ESNext", "DOM"],
"outDir": "./build"
},
"include": ["./src/**/*.ts"]
"include": [
"./src/**/*.ts",
"./src/**/*.d.ts"
]
}