Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Track data-diff usage in MotherDuck #800

Merged
merged 14 commits into from
Dec 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ jobs:
DATADIFF_CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse'
DATADIFF_VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica'
DATADIFF_REDSHIFT_URI: '${{ secrets.DATADIFF_REDSHIFT_URI }}'
MOTHERDUCK_TOKEN: '${{ secrets.MOTHERDUCK_TOKEN }}'
run: |
chmod +x tests/waiting_for_stack_up.sh
./tests/waiting_for_stack_up.sh && TEST_ACROSS_ALL_DBS=0 poetry run unittest-parallel -j 16
1 change: 1 addition & 0 deletions .github/workflows/ci_full.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ jobs:
DATADIFF_VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica'
# DATADIFF_BIGQUERY_URI: '${{ secrets.DATADIFF_BIGQUERY_URI }}'
DATADIFF_REDSHIFT_URI: '${{ secrets.DATADIFF_REDSHIFT_URI }}'
MOTHERDUCK_TOKEN: '${{ secrets.MOTHERDUCK_TOKEN }}'
run: |
chmod +x tests/waiting_for_stack_up.sh
./tests/waiting_for_stack_up.sh && poetry run unittest-parallel -j 16
16 changes: 15 additions & 1 deletion data_diff/databases/duckdb.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Any, ClassVar, Dict, Union, Type

import attrs
from packaging.version import parse as parse_version

from data_diff.utils import match_regexps
from data_diff.abcs.database_types import (
Expand All @@ -27,6 +28,7 @@
CHECKSUM_OFFSET,
)
from data_diff.databases.base import MD5_HEXDIGITS, CHECKSUM_HEXDIGITS
from data_diff.version import __version__


@import_helper("duckdb")
Expand Down Expand Up @@ -148,9 +150,21 @@ def close(self):
def create_connection(self):
    """Open a DuckDB connection to the configured ``filepath``.

    On duckdb >= 0.9.2 the connection is opened with a ``custom_user_agent``
    of ``data-diff/v<version>``, and the value reported back by
    ``PRAGMA USER_AGENT`` is checked to contain it.

    Returns:
        The connection object returned by ``duckdb.connect``.

    Raises:
        ConnectError: if duckdb raises ``OperationalError`` while connecting
            or querying, or if the reported user agent does not contain the
            expected ``data-diff`` tag.
    """
    ddb = import_duckdb()
    filepath = self._args["filepath"]
    try:
        # custom_user_agent is only available in duckdb >= 0.9.2
        if parse_version(ddb.__version__) < parse_version("0.9.2"):
            return ddb.connect(database=filepath)

        custom_user_agent = f"data-diff/v{__version__}"
        connection = ddb.connect(database=filepath, config={"custom_user_agent": custom_user_agent})
        try:
            rows = connection.sql("PRAGMA USER_AGENT;").fetchall()
        except Exception:
            # Do not leak the freshly opened connection if the probe query fails.
            connection.close()
            raise

        reported_user_agent = rows[0][0] if rows and rows[0] else ""
        # Explicit check instead of `assert`: asserts are removed under `python -O`,
        # which would silently disable this validation.
        if not isinstance(reported_user_agent, str) or custom_user_agent not in reported_user_agent:
            connection.close()
            raise ConnectError("Assertion failed: Custom user agent is invalid.")
        return connection
    except ddb.OperationalError as e:
        raise ConnectError(*e.args) from e

def select_table_schema(self, path: DbPath) -> str:
database, schema, table = self._normalize_table_path(path)
Expand Down
2,020 changes: 1,068 additions & 952 deletions poetry.lock

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -56,10 +56,10 @@ psycopg2 = "*"
snowflake-connector-python = ">=3.0.2,<4.0.0"
cryptography = "*"
trino = "^0.314.0"
presto-python-client = "*"
presto-python-client = "0.8.3"
clickhouse-driver = "*"
vertica-python = "*"
duckdb = "^0.7.0"
duckdb = "^0.9.0"
dbt-core = "^1.0.0"
ruff = "^0.1.4"
# google-cloud-bigquery = "*"
Expand Down
Binary file modified tests/dbt_artifacts/jaffle_shop.duckdb
Binary file not shown.
7 changes: 7 additions & 0 deletions tests/dbt_artifacts/motherduck/profiles.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# dbt profile used by the MotherDuck integration test.
jaffle_shop:
# Name of the output entry (below) that dbt should use.
target: dev_motherduck
outputs:
dev_motherduck:
type: duckdb
# The token is not stored here; it is templated in from the MOTHERDUCK_TOKEN
# environment variable via env_var().
# NOTE(review): the "md:" prefix presumably selects a MotherDuck-hosted database — confirm.
path: 'md:jaffle_shop?motherduck_token={{ env_var("MOTHERDUCK_TOKEN") }}'
schema: dev
4 changes: 0 additions & 4 deletions tests/dbt_artifacts/profiles.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,3 @@ jaffle_shop:
type: duckdb
path: "./tests/dbt_artifacts/jaffle_shop.duckdb"
schema: dev
different_dev:
type: duckdb
path: "./tests/dbt_artifacts/jaffle_shop.duckdb"
schema: "{{ env_var('some_env_var') }}"
18 changes: 18 additions & 0 deletions tests/test_dbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,24 @@ def test_integration_basic_dbt(self):
# 1 with a diff
assert diff_string.count(" Rows Added Rows Removed") == 1

def test_integration_motherduck_dbt(self):
    """Run the ``--dbt`` CLI flow against the ``motherduck`` profiles directory.

    ``DATA_DIFF_DBT_PROJ``, when set, overrides both the project and the
    profiles directory; otherwise the bundled ``tests/dbt_artifacts`` tree is
    used and the summary output is checked against its known diff counts.
    """
    default_root = os.getcwd() + "/tests/dbt_artifacts"
    override = os.environ.get("DATA_DIFF_DBT_PROJ")
    project_dir = override or default_root
    profiles_dir = override or default_root + "/motherduck"

    cli_args = ("--dbt", "--dbt-project-dir", project_dir, "--dbt-profiles-dir", profiles_dir)
    output_chunks = run_datadiff_cli(*cli_args)

    # The expected counts only apply to the bundled artifacts.
    if project_dir != default_root:
        return

    report = b"".join(output_chunks).decode("utf-8")
    expected_counts = (
        ("<>", 5),  # 5 diffs were run
        ("No row differences", 4),  # 4 with no diffs
        (" Rows Added Rows Removed", 1),  # 1 with a diff
    )
    for marker, expected in expected_counts:
        assert report.count(marker) == expected

def test_integration_cloud_dbt(self):
project_dir = os.environ.get("DATA_DIFF_DBT_PROJ")
if project_dir is not None:
Expand Down