diff --git a/latest_requirements.txt b/latest_requirements.txt index 0e81bab43..9c7939ee0 100644 --- a/latest_requirements.txt +++ b/latest_requirements.txt @@ -3,9 +3,9 @@ copulas==0.11.1 ctgan==0.10.2 deepecho==0.6.1 graphviz==0.20.3 -numpy==1.26.4 +numpy==2.0.2 pandas==2.2.3 platformdirs==4.3.6 rdt==1.13.0 sdmetrics==0.16.0 -tqdm==4.66.5 +tqdm==4.67.0 diff --git a/sdv/data_processing/data_processor.py b/sdv/data_processing/data_processor.py index f3e0ef158..0d43e6ebe 100644 --- a/sdv/data_processing/data_processor.py +++ b/sdv/data_processing/data_processor.py @@ -22,13 +22,12 @@ ) from sdv.data_processing.datetime_formatter import DatetimeFormatter from sdv.data_processing.errors import InvalidConstraintsError, NotFittedError -from sdv.data_processing.numerical_formatter import INTEGER_BOUNDS, NumericalFormatter +from sdv.data_processing.numerical_formatter import NumericalFormatter from sdv.data_processing.utils import load_module_from_path from sdv.errors import SynthesizerInputError, log_exc_stacktrace from sdv.metadata.single_table import SingleTableMetadata LOGGER = logging.getLogger(__name__) -INTEGER_BOUNDS = {str(key).lower(): value for key, value in INTEGER_BOUNDS.items()} class DataProcessor: @@ -70,8 +69,6 @@ class DataProcessor: 'M': 'datetime', } - _COLUMN_RELATIONSHIP_TO_TRANSFORMER = {'address': 'RandomLocationGenerator', 'gps': 'GPSNoiser'} - def _update_numerical_transformer(self, enforce_rounding, enforce_min_max_values): custom_float_formatter = rdt.transformers.FloatFormatter( missing_value_replacement='mean', @@ -124,6 +121,10 @@ def __init__( self._constraints = [] self._constraints_to_reverse = [] self._custom_constraint_classes = {} + self._COLUMN_RELATIONSHIP_TO_TRANSFORMER = { + 'address': 'RandomLocationGenerator', + 'gps': 'GPSNoiser', + } self._transformers_by_sdtype = deepcopy(get_default_transformers()) self._transformers_by_sdtype['id'] = rdt.transformers.RegexGenerator() @@ -575,11 +576,11 @@ def _create_config(self, data, columns_created_by_constraints): if is_numeric: function_name = 'random_int' column_dtype = str(column_dtype).lower() - function_kwargs = {'min': 0, 'max': 9999999} - for key in INTEGER_BOUNDS: - if key in column_dtype: - _, max_value = INTEGER_BOUNDS[key] - function_kwargs = {'min': 0, 'max': max_value} + function_kwargs = {'min': 0, 'max': 16777216} + if 'int8' in column_dtype: + function_kwargs['max'] = 127 + elif 'int16' in column_dtype: + function_kwargs['max'] = 32767 else: function_kwargs = {'text': 'sdv-id-??????'} diff --git a/tests/integration/single_table/test_copulas.py b/tests/integration/single_table/test_copulas.py index af6ac7234..96250ca67 100644 --- a/tests/integration/single_table/test_copulas.py +++ b/tests/integration/single_table/test_copulas.py @@ -348,16 +348,16 @@ def test_numerical_columns_gets_pii(): # Assert expected_sampled = pd.DataFrame({ 'id': [ - 1089619006166876142, - 8373046707753416652, - 9070705361670139280, - 7227045982112645011, - 3461931576753619633, - 1005734164466301683, - 3312031189447929384, - 82456842876428117, - 1819741328868365520, - 8019169766233150107, + 1982005, + 15967014, + 10406639, + 15230483, + 14028549, + 16499516, + 9244156, + 13145920, + 10106629, + 6297216, ], 'city': [ 'Danielfort', diff --git a/tests/unit/data_processing/test_data_processor.py b/tests/unit/data_processing/test_data_processor.py index f9bd39c75..e591d532c 100644 --- a/tests/unit/data_processing/test_data_processor.py +++ b/tests/unit/data_processing/test_data_processor.py @@ -1258,7 +1258,7 @@ def test__create_config(self): assert id_numeric_int_32_transformer.function_name == 'random_int' assert id_numeric_int_32_transformer.function_kwargs == { 'min': 0, - 'max': 2147483647, + 'max': 16777216, } id_column_transformer = config['transformers']['id_column']