Skip to content

Commit

Permalink
move main ingest logic to core lib + mdb specific stuff to our implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
mongodben committed Aug 27, 2024
1 parent cec769c commit ef74536
Show file tree
Hide file tree
Showing 36 changed files with 49 additions and 51 deletions.
1 change: 1 addition & 0 deletions packages/ingest-mongodb-public/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
"dependencies": {
"@release-it/bumper": "^5.1.0",
"dotenv": "^16",
"langchain": "^0.2.17",
"mongodb-rag-core": "*",
"mongodb-rag-ingest": "*",
"striptags": "^3.2.0"
Expand Down
2 changes: 1 addition & 1 deletion packages/ingest-mongodb-public/src/meta.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import {
AzureKeyCredential,
} from "mongodb-rag-core";
import { snootyDataApiBaseUrl } from "./sources/snooty";
import { makeSnootyDataSource } from "mongodb-rag-ingest/sources/snooty";
import { makeSnootyDataSource } from "./sources/snooty/SnootyDataSource";

const {
OPENAI_ENDPOINT,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import { strict as assert } from "assert";
import { convert } from "html-to-text";
import { removeMarkdownImagesAndLinks } from "./removeMarkdownImagesAndLinks";
import { DataSource } from "./DataSource";
import { ProjectBase } from "./ProjectBase";
import { removeMarkdownImagesAndLinks } from "mongodb-rag-core/ingest";
import { DataSource } from "mongodb-rag-core/ingest";
import { ProjectBase } from "mongodb-rag-core/ingest";
import { MongoClient } from "mongodb";
import { assertEnvVars } from "../../assertEnvVars";
import { Page } from "../../Page";
import { logger } from "../../services/logger";
import { assertEnvVars } from "mongodb-rag-core";
import { Page } from "mongodb-rag-core";
import { logger } from "mongodb-rag-core";

export type DevCenterProjectConfig = ProjectBase & {
type: "devcenter";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@ import { snootyAstToMd } from "./snooty/snootyAstToMd";
import {
MakeGitHubDataSourceArgs,
makeGitHubDataSource,
} from "./GitHubDataSource";
import { extractMarkdownH1 } from "./extractMarkdownH1";
} from "mongodb-rag-core/ingest";
import { extractMarkdownH1 } from "mongodb-rag-core/ingest";

/**
Loads an rST docs site from a GitHub repo.
Expand Down
18 changes: 10 additions & 8 deletions packages/ingest-mongodb-public/src/sources/index.ts
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
import { strict as assert } from "assert";
import { Page, extractFrontMatter } from "mongodb-rag-core";
import {
DataSource,
makeDevCenterDataSource,
DevCenterProjectConfig,
makeGitDataSource,
HandleHtmlPageFuncOptions,
handleHtmlDocument,
MakeMdOnGithubDataSourceParams,
Page,
extractFrontMatter,
makeGitDataSource,
makeMdOnGithubDataSource,
removeMarkdownImagesAndLinks,
} from "mongodb-rag-core";
import {
makeDevCenterDataSource,
DevCenterProjectConfig,
} from "./DevCenterDataSource";
import {
MakeMongoDbUniversityDataSourceParams,
makeMongoDbUniversityDataSource,
filterOnlyPublicActiveTiCatalogItems,
} from "mongodb-rag-ingest/sources";
import { prepareSnootySources } from "mongodb-rag-ingest/sources/snooty";
} from "./mongodb-university";
import { prismaSourceConstructor } from "./prisma";
import { wiredTigerSourceConstructor } from "./wiredTiger";
import { mongooseSourceConstructor } from "./mongoose";
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import { PageMetadata } from "../../../Page";
import { DataSource } from "../DataSource";
import { PageMetadata, DataSource } from "mongodb-rag-core";
import { makeUniversityPages } from "./makeUniversityPages";
import {
TiCatalogItem,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { PageMetadata, Page } from "../../../Page";
import { PageMetadata, Page } from "mongodb-rag-core";
import {
TiCatalogItem,
UniversityVideo,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import { createInterface } from "readline";
import fetch from "node-fetch";
import { DataSource } from "../DataSource";
import { DataSource } from "mongodb-rag-core/ingest";
import { snootyAstToMd, getTitleFromSnootyAst } from "./snootyAstToMd";
import { ProjectBase } from "../ProjectBase";
import { ProjectBase } from "mongodb-rag-core/ingest";
import {
getTitleFromSnootyOpenApiSpecAst,
snootyAstToOpenApiSpec,
} from "./snootyAstToOpenApiSpec";
import { Page } from "../../../Page";
import { PageFormat } from "../../../PageFormat";
import { logger } from "../../../services/logger";
import { Page } from "mongodb-rag-core";
import { PageFormat } from "mongodb-rag-core";
import { logger } from "mongodb-rag-core";

// These types are what's in the snooty manifest jsonl file.
export type SnootyManifestEntry = {
Expand All @@ -28,10 +28,10 @@ export type SnootyPageEntry = SnootyManifestEntry & {
/**
Represents metadata in a Snooty manifest file.
*/
export type SnootyMetadataEntry = SnootyManifestEntry & {
type: "metadata";
data: {title?: string};
};
export type SnootyMetadataEntry = SnootyManifestEntry & {
type: "metadata";
data: { title?: string };
};

/**
A node in the Snooty AST.
Expand Down Expand Up @@ -65,9 +65,9 @@ export type SnootyPageData = {
/**
A Snooty Data API metadata object. This contains project-level information, such as the site name.
*/
export type SnootyMetadata = {
title?: string;
};
export type SnootyMetadata = {
title?: string;
};

export type SnootyProjectConfig = ProjectBase & {
type: "snooty";
Expand Down Expand Up @@ -168,7 +168,7 @@ export const makeSnootyDataSource = ({
const stream = createInterface(body);
const linePromises: Promise<void>[] = [];
const pages: Page[] = [];
let siteTitle: string | undefined = undefined
let siteTitle: string | undefined = undefined;
await new Promise<void>((resolve, reject) => {
stream.on("line", async (line) => {
const entry = JSON.parse(line) as SnootyManifestEntry;
Expand Down Expand Up @@ -231,11 +231,11 @@ export const makeSnootyDataSource = ({
});
await Promise.allSettled(linePromises);
// add metadata to all the pages
for(const page of pages){
for (const page of pages) {
if (!page.metadata) {
page.metadata = {}
page.metadata = {};
}
page.metadata.siteTitle = siteTitle
page.metadata.siteTitle = siteTitle;
}
return pages;
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,7 @@ import {
Branch,
LocallySpecifiedSnootyProjectConfig,
} from "./SnootyDataSource";
import { filterFulfilled } from "../../../arrayFilters";
import { logger } from "../../../services/logger";
import { filterFulfilled, logger } from "mongodb-rag-core";

/** Schema for API response from https://snooty-data-api.mongodb.com/prod/projects */
export type GetSnootyProjectsResponse = {
Expand Down
4 changes: 2 additions & 2 deletions packages/mongodb-rag-core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,13 +24,13 @@
"build",
"README.md"
],
"main": "./build/index.js",
"exports": {
".": "./build/index.js",
"./ingest": "./build/ingest/index.js",
"./ingest/embed": "./build/embed/index.js",
"./ingest/pages": "./build/pages/index.js",
"./ingest/sources": "./build/sources/index.js",
"./ingest/sources/snooty": "./build/sources/snooty/index.js"
"./ingest/sources": "./build/sources/index.js"
},
"scripts": {
"clean": "rm -rf build",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ import {
SomeTokenizer,
} from "./chunkPage";
import { Page } from "../../Page";
import { logger } from "../../services/logger";
import { updateFrontMatter } from "../../updateFrontMatter";
import { logger } from "../../logger";

export const defaultOpenApiSpecYamlChunkOptions: ChunkOptions = {
maxChunkSize: 1250,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { chunkPage, ChunkFunc, ChunkOptions } from "./chunkPage";
import { EmbeddedContentStore, EmbeddedContent } from "../../EmbeddedContent";
import { Embedder } from "../../Embedder";
import { PageStore, PersistedPage } from "../../Page";
import { logger } from "../../services/logger";
import { logger } from "../../logger";

/**
(Re-)embeds the pages in the page store that have changed since the given date
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import deepEqual from "deep-equal";
import { PersistedPage, Page } from "../../Page";
import { logger } from "../../services/logger";
import { logger } from "../../logger";

/**
Given sets of old and new pages, returns the pages that need to be created,
Expand Down
2 changes: 1 addition & 1 deletion packages/mongodb-rag-core/src/ingest/pages/updatePages.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { getChangedPages } from "./getChangedPages";
import { DataSource } from "../sources/DataSource";
import { PageStore, Page } from "../../Page";
import { logger } from "../../services/logger";
import { logger } from "../../logger";

/**
Fetches pages from data sources and stores those that have changed in the data
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ import acquit from "acquit";
import { removeMarkdownImagesAndLinks } from "./removeMarkdownImagesAndLinks";
import { extractMarkdownH1 } from "./extractMarkdownH1";
import { PageMetadata, Page } from "../../Page";
import { logger } from "../../services/logger";
import { logger } from "../../logger";

/**
Loads an MD/Acquit docs site from a GitHub repo.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import { rimrafSync } from "rimraf";
import { DataSource } from "./DataSource";
import { filterDefined, filterFulfilled } from "../../arrayFilters";
import { Page, PageMetadata } from "../../Page";
import { logger } from "../../services/logger";
import { logger } from "../../logger";

/**
Function to convert a file in the repo into a `Page` or `Page[]`.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import TurndownService from "turndown";
import * as turndownPluginGfm from "turndown-plugin-gfm";
import { JSDOM } from "jsdom";
import { logger } from "../../services/logger";
import { logger } from "../../logger";
import { PageMetadata, Page } from "../../Page";
export type HandleHtmlPageFuncOptions = {
/** Returns an array of DOM elements to be removed from the parsed document. */
Expand Down
5 changes: 1 addition & 4 deletions packages/mongodb-rag-core/src/ingest/sources/index.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
export * from "./AcquitRequireMdOnGithubDataSource";
export * from "./DataSource";
export * from "./DevCenterDataSource";
export * from "./extractMarkdownH1";
export * from "./GitDataSource";
export * from "./GitHubDataSource";
export * from "./MdOnGithubDataSource";
export * from "./ProjectBase";
export * from "./RstOnGitHubDataSource";
export * from "./handleHtmlDocument";
export * from "./removeMarkdownImagesAndLinks";
export * from "./snooty";
export * from "./mongodb-university";
export * from "./LangchainDocumentLoaderDataSource";
export * from "./CodeOnGithubTextDataSource";

0 comments on commit ef74536

Please sign in to comment.