Skip to content

Commit

Permalink
Nick:
Browse files Browse the repository at this point in the history
  • Loading branch information
nickscamara committed Apr 6, 2024
1 parent b6aed88 commit f7d4965
Show file tree
Hide file tree
Showing 5 changed files with 60 additions and 6 deletions.
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@mendable/data-connectors",
"version": "0.0.49-beta.2",
"version": "0.0.49-beta.6",
"description": "Data connectors for LLMs. Made by Mendable.ai",
"main": "dist/index.js",
"module": "dist/index.mjs",
Expand Down Expand Up @@ -82,6 +82,7 @@
"puppeteer": "^21.10.0",
"scrapingbee": "^1.7.4",
"tsup": "^8.0.1",
"turndown": "^7.1.3",
"xml2js": "^0.6.2",
"youtube-transcript": "^1.0.6"
},
Expand Down
13 changes: 13 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions src/example.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { createDataConnector } from "./DataConnector";

/**
 * Example: create a "web-scraper" data connector, point it at a single
 * URL, and log the documents it extracts.
 *
 * @returns Resolves once the scraped documents have been logged.
 */
async function runWebScraperExample(): Promise<void> {
  const webScraper = createDataConnector({
    provider: "web-scraper",
  });

  await webScraper.setOptions({
    mode: "single_urls",
    urls: ["https://mendable.ai"],
  });

  const documents = await webScraper.getDocuments();
  console.log(documents);
}

// Entry point: handle the rejection explicitly instead of firing and
// forgetting — an unhandled rejection would otherwise crash newer Node
// versions with no context.
runWebScraperExample().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});

2 changes: 1 addition & 1 deletion src/providers/WebScraper/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ export class WebScraperDataProvider implements DataProvider<WebScraperOptions> {
for (let i = 0; i < urls.length; i += this.concurrentRequests) {
const batchUrls = urls.slice(i, i + this.concurrentRequests);
await Promise.all(batchUrls.map(async (url, index) => {
const result = await scrapSingleUrl(url);
const result = await scrapSingleUrl(url, true);
processedUrls++;
if (inProgress) {
inProgress({
Expand Down
27 changes: 23 additions & 4 deletions src/providers/WebScraper/single_url.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ async function scrapWithScrapingBee(url: string): Promise<string | null> {
}
}

export async function scrapSingleUrl(urlToScrap: string): Promise<Document> {
export async function scrapSingleUrl(urlToScrap: string, toMarkdown: boolean = true): Promise<Document> {
urlToScrap = urlToScrap.trim();

try {
Expand All @@ -43,8 +43,20 @@ export async function scrapSingleUrl(urlToScrap: string): Promise<Document> {
}
content = res;
}
var TurndownService = require('turndown')

const turndownService = new TurndownService();
let markdownContent = '';
if (toMarkdown) {
markdownContent = turndownService.turndown(content);
}


const soup2 = cheerio.load(content);
const metadata = extractMetadata(soup2, urlToScrap);
const soup = cheerio.load(markdownContent);


const soup = cheerio.load(content);
soup("script, style, iframe, noscript").remove();
let formattedText = '';
soup('body').children().each(function() {
Expand All @@ -61,17 +73,24 @@ export async function scrapSingleUrl(urlToScrap: string): Promise<Document> {
});

const text = sanitizeText(formattedText.trim());
const metadata = extractMetadata(soup, urlToScrap);

if (metadata) {
console.log(markdownContent)
console.log("here", toMarkdown)
return {
content: text,
provider: "web-scraper",
metadata: { ...metadata, sourceURL: urlToScrap },
} as Document;
} else {
return {
content: text,
provider: "web-scraper",
metadata: { sourceURL: urlToScrap },
} as Document;
}
return {
content: text,
content: markdownContent,
provider: "web-scraper",
metadata: { sourceURL: urlToScrap },
} as Document;
Expand Down

0 comments on commit f7d4965

Please sign in to comment.