diff --git a/fedot/core/data/data.py b/fedot/core/data/data.py index c869f6bf95..db94286819 100644 --- a/fedot/core/data/data.py +++ b/fedot/core/data/data.py @@ -124,8 +124,7 @@ def from_csv_time_series(cls, if isinstance(task, str): task = Task(TaskTypesEnum(task)) - df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_drop=columns_to_drop, - parse_index_as_datetime=True) + df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_drop=columns_to_drop) idx = df.index.to_numpy() if target_column is not None: @@ -185,8 +184,7 @@ def from_csv_multi_time_series(cls, An instance of :class:`InputData`. """ - df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use, - parse_index_as_datetime=True) + df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use) idx = df.index.to_numpy() if columns_to_use is not None: actual_df = df[columns_to_use] @@ -605,11 +603,20 @@ def autodetect_data_type(task: Task) -> DataTypesEnum: def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Union[str, int]] = None, - possible_idx_keywords: Optional[List[str]] = None, - *, columns_to_drop: Optional[List[Union[str, int]]] = None, - columns_to_use: Optional[List[Union[str, int]]] = None, parse_index_as_datetime: bool = False): - columns_to_drop = columns_to_drop or [] - columns_to_use = columns_to_use or [] + possible_idx_keywords: Optional[List[str]] = None, *, + columns_to_drop: Optional[List[Union[str, int]]] = None, + columns_to_use: Optional[List[Union[str, int]]] = None): + + def define_index_column(candidate_columns: List[str]) -> Optional[str]: + for column_name in candidate_columns: + if is_column_name_suitable_for_index(column_name): + return column_name + + def is_column_name_suitable_for_index(column_name: str) -> bool: + return any(key in column_name.lower() for key in possible_idx_keywords) + + 
columns_to_drop = copy(columns_to_drop) or [] + columns_to_use = copy(columns_to_use) or [] possible_idx_keywords = possible_idx_keywords or [] logger = default_log('CSV data extraction') @@ -623,16 +630,16 @@ def get_df_from_csv(file_path: PathType, delimiter: str, index_col: Optional[Uni if columns_to_drop: columns_to_use = [col for col in columns if col not in columns_to_drop] elif not columns_to_use: - columns_to_use = columns + columns_to_use = list(columns) + candidate_idx_cols = [columns_to_use[0], columns[0]] if index_col is None: - first_column = columns_to_use[0] - if any(key in first_column.lower() for key in possible_idx_keywords): - logger.message(f'Used the column as index: "{first_column}".') - index_col = first_column + defined_index = define_index_column(candidate_idx_cols) + if defined_index is not None: + index_col = defined_index + logger.message(f'Used the column as index: "{index_col}".') - df = pd.read_csv(file_path, sep=delimiter, index_col=index_col, usecols=columns_to_use) + if (index_col is not None) and (index_col not in columns_to_use): + columns_to_use.append(index_col) - if parse_index_as_datetime and index_col: - df.index = pd.to_datetime(df.index).astype(str) - return df + return pd.read_csv(file_path, sep=delimiter, index_col=index_col, usecols=columns_to_use) diff --git a/fedot/core/data/multi_modal.py b/fedot/core/data/multi_modal.py index eb511da79a..fc26696b1e 100644 --- a/fedot/core/data/multi_modal.py +++ b/fedot/core/data/multi_modal.py @@ -194,8 +194,7 @@ def from_csv_time_series(cls, if isinstance(task, str): task = Task(TaskTypesEnum(task)) - df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use, - parse_index_as_datetime=True) + df = get_df_from_csv(file_path, delimiter, index_col, possible_idx_keywords, columns_to_use=columns_to_use) idx = df.index.to_numpy() if not columns_to_use: columns_to_use = list(set(df.columns) - set(index_col)) diff --git 
a/test/data/remote/remote_config_ts_multivar b/test/data/remote/remote_config_ts_multivar index 64a7206229..1956adb91a 100644 --- a/test/data/remote/remote_config_ts_multivar +++ b/test/data/remote/remote_config_ts_multivar @@ -3,7 +3,7 @@ pipeline_template = { "total_pipeline_operations": [ "data_source_ts/velocity train_data = {fedot_base_path}/test/data/multivar_ts.csv task = Task(TaskTypesEnum.ts_forecasting, TsForecastingParams(forecast_length=1)) output_path = ./test_ts_multivar -train_data_idx = ["2019-05-01 00:00:00","2019-05-02 00:00:00","2019-05-03 00:00:00","2019-05-04 00:00:00","2019-05-05 00:00:00","2019-05-06 00:00:00","2019-05-07 00:00:00","2019-05-08 00:00:00","2019-05-09 00:00:00","2019-05-10 00:00:00","2019-05-11 00:00:00","2019-05-12 00:00:00","2019-05-13 00:00:00","2019-05-14 00:00:00","2019-05-15 00:00:00"] +train_data_idx = ["2019-05-01","2019-05-02","2019-05-03","2019-05-04","2019-05-05","2019-05-06","2019-05-07","2019-05-08","2019-05-09","2019-05-10","2019-05-11","2019-05-12","2019-05-13","2019-05-14","2019-05-15"] var_names = ["diesel_fuel_kWh","wind_power_kWh","diesel_time_h","wind_time_h","velocity_max_msec","velocity_mean_msec","tmp_grad"] is_multi_modal = "True" target = "diesel_fuel_kWh"