From 2cd7c45b43549b8e8eef1606a43f22c2de389484 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Wed, 2 Aug 2023 18:37:23 -0400 Subject: [PATCH 1/4] Re-implement fix from #326 behind an environment variable flag Signed-off-by: Jesse Whitehouse --- CHANGELOG.md | 4 ++++ dbt/adapters/databricks/impl.py | 20 +++++++++++++++++++- tests/unit/test_adapter.py | 23 +++++++++++++++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb90832ba..a2554fe3e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ ## dbt-databricks 1.6.x (Release TBD) +### Features + +- Follow up: re-implement fix for issue where the show tables extended command is limited to 2048 characters. ([#326](https://github.com/databricks/dbt-databricks/pull/326)). Set `DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS` to `true` to enable this behaviour. + ## dbt-databricks 1.6.1 (August 2, 2023) ### Fixes diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index 95e4cf5dd..197bcf15e 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -2,6 +2,7 @@ from contextlib import contextmanager from itertools import chain from dataclasses import dataclass +import os import re from typing import ( Any, @@ -79,6 +80,22 @@ def check_not_found_error(errmsg: str) -> bool: return new_error or old_error is not None +def get_identifier_list_string(table_names: set[str]) -> str: + """Returns "|".join(table_names) by default. + + Returns "*" if DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS == "true" + and the joined string exceeds 2048 characters + + This is for AWS Glue Catalog users See issue #325. + """ + + _identifier = "|".join(table_names) + bypass_2048_char_limit = os.environ.get("DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS", "false") + if bypass_2048_char_limit == "true": + _identifier = _identifier if len(_identifier) < 2048 else "*" + return _identifier + + @undefined_proof class DatabricksAdapter(SparkAdapter): Relation = DatabricksRelation @@ -448,11 +465,12 @@ def _get_one_catalog( table_names.add(relation.identifier) columns: List[Dict[str, Any]] = [] + if len(table_names) > 0: schema_relation = self.Relation.create( database=database, schema=schema, - identifier="|".join(table_names), + identifier=get_identifier_list_string(table_names), quote_policy=self.config.quoting, ) for relation, information in self._list_relations_with_information(schema_relation): diff --git a/tests/unit/test_adapter.py b/tests/unit/test_adapter.py index 3cb49a910..d4043f5c7 100644 --- a/tests/unit/test_adapter.py +++ b/tests/unit/test_adapter.py @@ -8,6 +8,7 @@ from dbt.adapters.databricks import __version__ from dbt.adapters.databricks import DatabricksAdapter, DatabricksRelation from dbt.adapters.databricks.impl import check_not_found_error +from dbt.adapters.databricks.impl import get_identifier_list_string from dbt.adapters.databricks.connections import ( CATALOG_KEY_IN_SESSION_PROPERTIES, DBT_DATABRICKS_INVOCATION_ENV, @@ -947,6 +948,28 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel }, ) + def test_describe_table_extended_2048_char_limit(self): + """GIVEN a list of table_names whos total character length exceeds 2048 characters + WHEN the environment variable DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS is "true" + THEN the identifier list is replaced with "*" + """ + + table_names: set(str) = set([f"customers_{i}" for i in range(200)]) + + # By default, don't limit the number of characters + self.assertEqual(get_identifier_list_string(table_names), "|".join(table_names)) + + # If environment variable is set, then limit the number of characters + with mock.patch.dict("os.environ", **{"DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS": "true"}): + + # Long list of table names is capped + self.assertEqual(get_identifier_list_string(table_names), "*") + + # Short list of table names is not capped + self.assertEqual( + get_identifier_list_string(list(table_names)[:5]), "|".join(list(table_names)[:5]) + ) + class TestCheckNotFound(unittest.TestCase): def test_prefix(self): From 75300ec390449250111b34dd0be54bab946d8dd7 Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Wed, 2 Aug 2023 18:39:53 -0400 Subject: [PATCH 2/4] Improve docstring Signed-off-by: Jesse Whitehouse --- dbt/adapters/databricks/impl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index 197bcf15e..85f794c22 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -81,12 +81,12 @@ def check_not_found_error(errmsg: str) -> bool: def get_identifier_list_string(table_names: set[str]) -> str: - """Returns "|".join(table_names) by default. + """Returns `"|".join(table_names)` by default. - Returns "*" if DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS == "true" + Returns `"*"` if `DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS` == `"true"` and the joined string exceeds 2048 characters - This is for AWS Glue Catalog users See issue #325. + This is for AWS Glue Catalog users. See issue #325. """ _identifier = "|".join(table_names) From bdc10d892d6d599bec8f515ba1aa259ea2c94a2d Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Wed, 2 Aug 2023 18:58:18 -0400 Subject: [PATCH 3/4] Add compatibility for Python 3.8 type checks Signed-off-by: Jesse Whitehouse --- dbt/adapters/databricks/impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/databricks/impl.py b/dbt/adapters/databricks/impl.py index 85f794c22..38d8488f1 100644 --- a/dbt/adapters/databricks/impl.py +++ b/dbt/adapters/databricks/impl.py @@ -80,7 +80,7 @@ def check_not_found_error(errmsg: str) -> bool: return new_error or old_error is not None -def get_identifier_list_string(table_names: set[str]) -> str: +def get_identifier_list_string(table_names: Set[str]) -> str: """Returns `"|".join(table_names)` by default. Returns `"*"` if `DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS` == `"true"` From 2fb7247f05f22c198120852647e7ee66258213ff Mon Sep 17 00:00:00 2001 From: Jesse Whitehouse Date: Wed, 2 Aug 2023 19:11:52 -0400 Subject: [PATCH 4/4] Feedback: split test conditions into multiple methods Signed-off-by: Jesse Whitehouse --- tests/unit/test_adapter.py | 41 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/unit/test_adapter.py b/tests/unit/test_adapter.py index d4043f5c7..7772b6598 100644 --- a/tests/unit/test_adapter.py +++ b/tests/unit/test_adapter.py @@ -970,6 +970,47 @@ def test_describe_table_extended_2048_char_limit(self): get_identifier_list_string(list(table_names)[:5]), "|".join(list(table_names)[:5]) ) + def test_describe_table_extended_should_not_limit(self): + """GIVEN a list of table_names whos total character length exceeds 2048 characters + WHEN the environment variable DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS is not set + THEN the identifier list is not truncated + """ + + table_names: set(str) = set([f"customers_{i}" for i in range(200)]) + + # By default, don't limit the number of characters + self.assertEqual(get_identifier_list_string(table_names), "|".join(table_names)) + + def test_describe_table_extended_should_limit(self): + """GIVEN a list of table_names whos total character length exceeds 2048 characters + WHEN the environment variable DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS is "true" + THEN the identifier list is replaced with "*" + """ + + table_names: set(str) = set([f"customers_{i}" for i in range(200)]) + + # If environment variable is set, then limit the number of characters + with mock.patch.dict("os.environ", **{"DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS": "true"}): + + # Long list of table names is capped + self.assertEqual(get_identifier_list_string(table_names), "*") + + def test_describe_table_extended_may_limit(self): + """GIVEN a list of table_names whos total character length does not 2048 characters + WHEN the environment variable DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS is "true" + THEN the identifier list is not truncated + """ + + table_names: set(str) = set([f"customers_{i}" for i in range(200)]) + + # If environment variable is set, then we may limit the number of characters + with mock.patch.dict("os.environ", **{"DBT_DESCRIBE_TABLE_2048_CHAR_BYPASS": "true"}): + + # But a short list of table names is not capped + self.assertEqual( + get_identifier_list_string(list(table_names)[:5]), "|".join(list(table_names)[:5]) + ) + class TestCheckNotFound(unittest.TestCase): def test_prefix(self):