Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

refactor(core): minor internal changes #18

Merged
merged 7 commits into from
Dec 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions bookworm_genai/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ def main():
logger.info("[bold green]Starting Bookworm 📖")
logger.debug("Running on platform '%s'", sys.platform)

argparser = argparse.ArgumentParser(description="Bookworm - A LLM-powered bookmark search engine")
arg_parser = argparse.ArgumentParser(description="LLM-powered bookmark search engine")

sub_parsers = argparser.add_subparsers(dest="command", help="Available commands", required=True)
sub_parsers = arg_parser.add_subparsers(dest="command", help="Available commands", required=True)

sync_parser = sub_parsers.add_parser("sync", help="Sync the bookmark database with the latest changes")
sync_parser.add_argument("--estimate-cost", action="store_true", default=False, help="Estimate the cost of syncing the bookmark database")
Expand All @@ -25,7 +25,7 @@ def main():
ask_parser.add_argument("-n", "--top-n", type=int, default=3, help="Number of bookmarks to return")
ask_parser.add_argument("-q", "--query", help="The Search Query")

args = argparser.parse_args(sys.argv[1:])
args = arg_parser.parse_args(sys.argv[1:])

logger.debug("Arguments: %s", args)

Expand Down
41 changes: 33 additions & 8 deletions bookworm_genai/commands/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
import glob
import logging
import shutil
from typing import Optional, Union

import tiktoken
from langchain_core.documents import Document

from bookworm_genai.integrations import Browser
from bookworm_genai.integrations import Browser, browsers, BrowserManifest
from bookworm_genai.storage import store_documents, _get_embedding_store


logger = logging.getLogger(__name__)


def sync(browsers: dict, estimate_cost: bool = False, browser_filter: list[str] = []):
def sync(browsers: BrowserManifest = browsers, estimate_cost: bool = False, browser_filter: list[str] = []) -> Union[None, float]:
docs: list[Document] = []

for browser, config in browsers.items():
Expand All @@ -31,8 +32,11 @@ def sync(browsers: dict, estimate_cost: bool = False, browser_filter: list[str]
continue
else:
if "copy" in platform_config:
_copy(platform_config["copy"])

try:
_copy(platform_config["copy"])
except BrowserBookmarkFileNotFound as e:
logger.warning(f"🔄 browser {browser.value} skipped due to missing file '{e.file}'")
continue

_log_bookmark_source(browser, platform_config)

Expand All @@ -54,11 +58,18 @@ def sync(browsers: dict, estimate_cost: bool = False, browser_filter: list[str]
store_documents(docs)




def _copy(config: dict):
logger.debug(f"Copying {config['from']} to {config['to']}")

source = glob.glob(config["from"])
source = source[0]

try:
source = source[0]
except IndexError as e:
logger.debug(f"source {config['from']} not found")
raise BrowserBookmarkFileNotFound(config["from"]) from e

directory = os.path.dirname(config["to"])
os.makedirs(directory, exist_ok=True)
Expand Down Expand Up @@ -89,10 +100,10 @@ def _log_bookmark_source(browser: Browser, platform_config: dict):
logger.debug("Loading bookmarks from %s", path)


def _estimate_cost(docs: list[Document]) -> float:
def _estimate_cost(docs: list[Document], cost_per_million: Optional[float] = None) -> float:
embedding = _get_embedding_store()

# using _get_embedding_store here means that it's more likely that the model we are using
# NOTE: using _get_embedding_store here means that it's more likely that the model we are using
# in the actual embedding is the one we use for cost estimation
# however note that .model here is not part of the contract for Embeddings
# so this is a bit of a hack
Expand All @@ -105,7 +116,11 @@ def _estimate_cost(docs: list[Document]) -> float:
for doc in docs:
tokens += len(encoding.encode(doc.page_content))

price = float(input(f"what is the current cost for {embedding.model} per million? (non-batch) "))
if not cost_per_million:
# https://openai.com/api/#/
price = float(input(f"what is the current cost for {embedding.model} per million? (non-batch) "))
else:
price = cost_per_million

# price is often advertised per million; so find the price per token
price_per_token = price / 1_000_000
Expand All @@ -116,3 +131,13 @@ def _estimate_cost(docs: list[Document]) -> float:
logger.info(f"Estimated cost: ${cost} (tokens: {tokens}) ")

return cost


class BrowserBookmarkFileNotFound(Exception):
    """Raised when a bookmark file on the local file system could not be found.

    For example, a configuration may be defined with a glob expression such as
    ``/my/path/*.sqlite`` that resolves to nothing.

    Attributes:
        file: The path or glob expression that could not be resolved.
    """

    def __init__(self, file: str) -> None:
        # Keep the unresolved path on the instance so handlers can report it
        # without having to parse the exception message.
        self.file = file
        super().__init__(f"Could not resolve file: {file}")
6 changes: 5 additions & 1 deletion bookworm_genai/integrations.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os
from enum import Enum
from typing import Any

from langchain_community.document_loaders import JSONLoader
from langchain_community.document_loaders.sql_database import SQLDatabaseLoader
from langchain_community.utilities.sql_database import SQLDatabase
Expand All @@ -23,13 +25,15 @@ def list(cls):
select(.type == "url")
"""

BrowserManifest = dict[Browser, dict[str, dict[str, Any]]]

# Configuration for various browsers and details about them
# The bookmark_file_path is the path to the browser's bookmarks file. Before it can be used it must be passed through
# os.path.expanduser, as it may contain environment variables
#
# The platform configuration is keyed off the values from https://docs.python.org/3/library/sys.html#sys.platform
#
browsers = {
browsers: BrowserManifest = {
Browser.BRAVE: {
"linux": {
"bookmark_loader": JSONLoader,
Expand Down
7 changes: 4 additions & 3 deletions bookworm_genai/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from platformdirs import PlatformDirs
from langchain_community.vectorstores import DuckDB as DuckDBVectorStore
from langchain_community.vectorstores.duckdb import DEFAULT_TABLE_NAME
from langchain_core.documents import Document
from langchain_core.embeddings.embeddings import Embeddings
from langchain_openai.embeddings import OpenAIEmbeddings
Expand All @@ -18,10 +19,10 @@ def store_documents(docs: list[Document]):

logger.info(f"vectorizing and storing {len(docs)} documents into {full_database_path}")
with duckdb.connect(full_database_path) as conn:
logger.debug("dropping existing embeddings table if exists")
conn.execute("DROP TABLE IF EXISTS embeddings")
logger.debug(f"dropping existing embeddings table '{DEFAULT_TABLE_NAME}' if exists")
conn.execute(f"DROP TABLE IF EXISTS {DEFAULT_TABLE_NAME}")

logger.debug("loading documents")
logger.debug(f"loading {len(docs)} documents")
DuckDBVectorStore.from_documents(docs, embeddings, connection=conn)


Expand Down
66 changes: 64 additions & 2 deletions tests/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import pytest

from bookworm_genai.commands.sync import sync
from bookworm_genai.commands.sync import _estimate_cost, sync
from bookworm_genai.integrations import Browser, browsers


Expand Down Expand Up @@ -209,6 +209,24 @@ def test_sync_estimate_cost(

assert cost == 0.0005700000000000001

@patch.dict(os.environ, {"OPENAI_API_KEY": "secret"}, clear=True)
@patch("builtins.input")
@patch("bookworm_genai.commands.sync.tiktoken")
def test_sync_estimate_cost_non_interactive(mock_tiktoken: Mock, mock_input: Mock):
    """Passing cost_per_million to _estimate_cost must not prompt the user for a price."""
    mocked_documents = [
        Mock(page_content="mocked_page_content"),
    ]

    mock_encoding = Mock()
    # _estimate_cost counts tokens via len(encoding.encode(...)), so the length of this
    # return value is the token count; the multiplier just simulates a larger document.
    mock_encoding.encode.return_value = "mocked_page_content" * 100
    mock_tiktoken.encoding_for_model.return_value = mock_encoding

    cost = _estimate_cost(mocked_documents, cost_per_million=0.100)

    # The cost is the result of float arithmetic, so compare with a tolerance
    # rather than relying on an exact binary representation.
    assert cost == pytest.approx(0.00019)
    mock_input.assert_not_called()


@patch("bookworm_genai.commands.sync.glob")
@patch("bookworm_genai.commands.sync.shutil")
@patch("bookworm_genai.commands.sync.os.makedirs")
Expand All @@ -230,4 +248,48 @@ def test_sync_browser_filter(
sync(browsers, browser_filter=browser_filter)

assert browsers[Browser.CHROME][platform]['bookmark_loader'].called
assert not browsers[Browser.FIREFOX][platform]['bookmark_loader'].called
assert not browsers[Browser.FIREFOX][platform]['bookmark_loader'].called



@patch('bookworm_genai.commands.sync.store_documents')
@patch('bookworm_genai.commands.sync.os')
@patch('bookworm_genai.commands.sync.shutil')
@patch('bookworm_genai.commands.sync.glob')
def test_sync_copy_source_missing(mock_glob: Mock, mock_shutil: Mock, mock_os: Mock, mock_store_documents: Mock):
    """A browser whose copy source resolves to nothing is skipped; later browsers are still synced."""
    path_to_missing_file = "/path/to/missing/file"

    # The glob lookup finds nothing, which is what marks the source file as missing.
    mock_glob.glob.return_value = []

    working_loader = Mock()
    working_loader.return_value.lazy_load.return_value = ["DOC1", "DOC2"]

    # This entry fails and is skipped because its copy source is missing.
    firefox_config = {
        sys.platform: {
            "bookmark_loader": Mock(),
            "bookmark_loader_kwargs": {},
            "copy": {
                "from": path_to_missing_file,
                "to": "/path/to/destination",
            },
        }
    }
    # This entry has no copy step and must still be processed after the failure above.
    chrome_config = {
        sys.platform: {
            "bookmark_loader": working_loader,
            "bookmark_loader_kwargs": {},
        }
    }
    manifest = {
        Browser.FIREFOX: firefox_config,
        Browser.CHROME: chrome_config,
    }

    sync(browsers=manifest)

    mock_glob.glob.assert_called_once_with(path_to_missing_file)

    # Even though the first browser fails, the second one still extracts docs and submits them to storage.
    mock_store_documents.assert_called_once_with(["DOC1", "DOC2"])
Loading