From 4fba6a951ddd0e222ebe8d17892e149b8e3ae773 Mon Sep 17 00:00:00 2001 From: americast Date: Thu, 5 Oct 2023 16:26:26 -0400 Subject: [PATCH 01/21] Add feedback for flat predictions --- .../source/reference/ai/model-forecasting.rst | 9 ++++--- evadb/functions/forecast.py | 26 ++++++++++++++++--- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst index 8285ad76b6..0e6adf4f5c 100644 --- a/docs/source/reference/ai/model-forecasting.rst +++ b/docs/source/reference/ai/model-forecasting.rst @@ -28,16 +28,18 @@ Next, we create a function of `TYPE Forecasting`. We must enter the column name CREATE FUNCTION IF NOT EXISTS Forecast FROM (SELECT y FROM AirData) TYPE Forecasting + HORIZON 12 PREDICT 'y'; This trains a forecasting model. The model can be called by providing the horizon for forecasting. .. code-block:: sql - SELECT Forecast(12); + SELECT Forecast(); -Here, the horizon is `12`, which represents the forecast 12 steps into the future. +.. note:: + `Forecasting` function also provides suggestions by default. If you wish to turn it off, send "False" as an optional argument while calling the function. Eg. `SELECT Forecast("False");` Forecast Parameters ------------------- @@ -90,4 +92,5 @@ Below is an example query with `neuralforecast` with `trend` column as exogenous PREDICT 'y' LIBRARY 'neuralforecast' AUTO 'f' - FREQUENCY 'M'; \ No newline at end of file + FREQUENCY 'M'; + diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 1571f6c4fc..835b7cd127 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -48,13 +48,33 @@ def setup( self.id_column_rename = id_column_rename self.horizon = int(horizon) self.library = library + self.suggestion_dict = { + 1: "Predictions are flat. Consider using LIBRARY 'neuralforecast' for more accrate predictions.", + } def forward(self, data) -> pd.DataFrame: if self.library == "statsforecast": - forecast_df = self.model.predict(h=self.horizon) + forecast_df = self.model.predict(h=self.horizon).reset_index() else: - forecast_df = self.model.predict() - forecast_df.reset_index(inplace=True) + forecast_df = self.model.predict().reset_index() + + # Suggestions + if len(data) == 0 or list(data[0])[0].lower()[0] == "t": + suggestion_list = [] + # 1: Flat predictions + if self.library == "statsforecast": + for type_here in forecast_df["unique_id"].unique(): + if ( + forecast_df.loc[forecast_df["unique_id"] == type_here][ + self.model_name + ].nunique() + == 1 + ): + suggestion_list.append(1) + + for suggestion in set(suggestion_list): + print("\nSUGGESTION: " + self.suggestion_dict[suggestion]) + forecast_df = forecast_df.rename( columns={ "unique_id": self.id_column_rename, From dbe6d0269f374f28b295ceb706cc695e603e815f Mon Sep 17 00:00:00 2001 From: americast Date: Fri, 20 Oct 2023 14:42:37 -0400 Subject: [PATCH 02/21] update suggestion handle to bool --- docs/source/reference/ai/model-forecasting.rst | 2 +- evadb/functions/forecast.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst index e8f2a615a4..8dcd32aea9 100644 --- a/docs/source/reference/ai/model-forecasting.rst +++ b/docs/source/reference/ai/model-forecasting.rst @@ -39,7 +39,7 @@ This trains a forecasting model. The model can be called by providing the horizo .. note:: - `Forecasting` function also provides suggestions by default. If you wish to turn it off, send "False" as an optional argument while calling the function. Eg. `SELECT Forecast("False");` + `Forecasting` function also provides suggestions by default. If you wish to turn it off, send `FALSE` as an optional argument while calling the function. Eg. `SELECT Forecast(FALSE);` Forecast Parameters ------------------- diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 493739f4f6..4da8c32a18 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -65,7 +65,7 @@ def forward(self, data) -> pd.DataFrame: forecast_df = self.model.predict().reset_index() # Suggestions - if len(data) == 0 or list(data[0])[0].lower()[0] == "t": + if len(data) == 0 or list(data[0])[0] is True: suggestion_list = [] # 1: Flat predictions if self.library == "statsforecast": From c118203eb2f536fa40ba45cff7f2e63fbe442858 Mon Sep 17 00:00:00 2001 From: americast Date: Tue, 24 Oct 2023 00:45:04 -0400 Subject: [PATCH 03/21] wip: add confidence --- evadb/executor/create_function_executor.py | 19 +++++++++- evadb/functions/forecast.py | 44 +++++++++++++++++----- 2 files changed, 53 insertions(+), 10 deletions(-) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index d045205a65..52286ab790 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -336,6 +336,15 @@ def handle_forecasting_function(self): aggregated_batch.rename(columns={arg_map["time"]: "ds"}) if "id" in arg_map.keys(): aggregated_batch.rename(columns={arg_map["id"]: "unique_id"}) + if "conf" in arg_map.keys(): + try: + conf = round(arg_map["conf"]) + except Exception: + err_msg = "Confidence must be a number" + logger.error(err_msg) + raise FunctionIODefinitionError(err_msg) + else: + conf = 90 data = aggregated_batch.frames if "unique_id" not in list(data.columns): @@ -379,6 +388,7 @@ def handle_forecasting_function(self): try_to_import_neuralforecast() from neuralforecast import NeuralForecast from neuralforecast.auto import AutoNBEATS, AutoNHITS + from neuralforecast.losses.pytorch import MQLoss from neuralforecast.models import NBEATS, NHITS model_dict = { @@ -430,6 +440,8 @@ def get_optuna_config(trial): model_args["backend"] = "optuna" model_args["h"] = horizon + if conf > 0: + model_args["loss"] = MQLoss(level=[conf]) model = NeuralForecast( [model_here(**model_args)], @@ -474,7 +486,11 @@ def get_optuna_config(trial): data["ds"] = pd.to_datetime(data["ds"]) - model_save_dir_name = library + "_" + arg_map["model"] + "_" + new_freq + model_save_dir_name = ( + library + "_" + arg_map["model"] + "_" + new_freq + if "statsforecast" in library + else library + "_" + conf + "_" + arg_map["model"] + "_" + new_freq + ) if len(data.columns) >= 4 and library == "neuralforecast": model_save_dir_name += "_exogenous_" + str(sorted(exogenous_columns)) @@ -545,6 +561,7 @@ def get_optuna_config(trial): ), FunctionMetadataCatalogEntry("horizon", horizon), FunctionMetadataCatalogEntry("library", library), + FunctionMetadataCatalogEntry("conf", conf), ] return ( diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 4da8c32a18..8638837d18 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -37,6 +37,7 @@ def setup( id_column_rename: str, horizon: int, library: str, + conf: int, ): self.library = library if "neuralforecast" in self.library: @@ -57,15 +58,21 @@ def setup( self.suggestion_dict = { 1: "Predictions are flat. Consider using LIBRARY 'neuralforecast' for more accrate predictions.", } + self.conf = conf def forward(self, data) -> pd.DataFrame: if self.library == "statsforecast": - forecast_df = self.model.predict(h=self.horizon).reset_index() + if self.conf > 0: + forecast_df = self.model.predict( + h=self.horizon, level=[self.conf] + ).reset_index() + else: + forecast_df = self.model.predict(h=self.horizon).reset_index() else: forecast_df = self.model.predict().reset_index() # Suggestions - if len(data) == 0 or list(data[0])[0] is True: + if len(data) == 0 or list(list(data.iloc[0]))[0] is True: suggestion_list = [] # 1: Flat predictions if self.library == "statsforecast": @@ -81,11 +88,30 @@ def forward(self, data) -> pd.DataFrame: for suggestion in set(suggestion_list): print("\nSUGGESTION: " + self.suggestion_dict[suggestion]) - forecast_df = forecast_df.rename( - columns={ - "unique_id": self.id_column_rename, - "ds": self.time_column_rename, - self.model_name: self.predict_column_rename, - } - )[: self.horizon * forecast_df["unique_id"].nunique()] + if self.conf > 0: + forecast_df = forecast_df.rename( + columns={ + "unique_id": self.id_column_rename, + "ds": self.time_column_rename, + self.model_name: self.predict_column_rename, + self.model_name + + "-lo-" + + str(self.conf): self.predict_column_rename + + "-lo-" + + str(self.conf), + self.model_name + + "-hi-" + + str(self.conf): self.predict_column_rename + + "-hi-" + + str(self.conf), + } + )[: self.horizon * forecast_df["unique_id"].nunique()] + else: + forecast_df = forecast_df.rename( + columns={ + "unique_id": self.id_column_rename, + "ds": self.time_column_rename, + self.model_name: self.predict_column_rename, + } + )[: self.horizon * forecast_df["unique_id"].nunique()] return forecast_df From d208bf9c829faf3014cf1948f5e438b1e9c30f20 Mon Sep 17 00:00:00 2001 From: americast Date: Tue, 24 Oct 2023 02:11:32 -0400 Subject: [PATCH 04/21] confidence done --- evadb/binder/statement_binder.py | 49 ++++++++------------- evadb/executor/create_function_executor.py | 3 +- evadb/functions/forecast.py | 50 ++++++++-------------- 3 files changed, 36 insertions(+), 66 deletions(-) diff --git a/evadb/binder/statement_binder.py b/evadb/binder/statement_binder.py index 128e6e7eed..1a1501f70b 100644 --- a/evadb/binder/statement_binder.py +++ b/evadb/binder/statement_binder.py @@ -30,11 +30,9 @@ resolve_alias_table_value_expression, ) from evadb.binder.statement_binder_context import StatementBinderContext -from evadb.catalog.catalog_type import ColumnType, TableType +from evadb.catalog.catalog_type import ColumnType, TableType, VideoColumnName from evadb.catalog.catalog_utils import get_metadata_properties, is_document_table -from evadb.catalog.sql_config import RESTRICTED_COL_NAMES from evadb.configuration.constants import EvaDB_INSTALLATION_DIR -from evadb.executor.execution_context import Context from evadb.expression.abstract_expression import AbstractExpression, ExpressionType from evadb.expression.function_expression import FunctionExpression from evadb.expression.tuple_value_expression import TupleValueExpression @@ -100,12 +98,11 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement): for column in all_column_list: if column.name in predict_columns: column.name = column.name + "_predictions" + outputs.append(column) else: inputs.append(column) - elif string_comparison_case_insensitive( - node.function_type, "sklearn" - ) or string_comparison_case_insensitive(node.function_type, "XGBoost"): + elif string_comparison_case_insensitive(node.function_type, "sklearn"): assert ( "predict" in arg_map ), f"Creating {node.function_type} functions expects 'predict' metadata." @@ -140,6 +137,7 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement): assert ( len(node.inputs) == 0 and len(node.outputs) == 0 ), f"{node.function_type} functions' input and output are auto assigned" + outputs.extend([ColumnDefinition(arg_map.get("predict", "y")+"-lo", ColumnType.INTEGER, None, None), ColumnDefinition(arg_map.get("predict", "y")+"-hi", ColumnType.INTEGER, None, None)]) node.inputs, node.outputs = inputs, outputs @bind.register(SelectStatement) @@ -205,12 +203,6 @@ def _bind_delete_statement(self, node: DeleteTableStatement): @bind.register(CreateTableStatement) def _bind_create_statement(self, node: CreateTableStatement): - # we don't allow certain keywords in the column_names - for col in node.column_list: - assert ( - col.name.lower() not in RESTRICTED_COL_NAMES - ), f"EvaDB does not allow to create a table with column name {col.name}" - if node.query is not None: self.bind(node.query) @@ -268,17 +260,19 @@ def _bind_tableref(self, node: TableRef): @bind.register(TupleValueExpression) def _bind_tuple_expr(self, node: TupleValueExpression): - from evadb.binder.tuple_value_expression_binder import bind_tuple_expr - - bind_tuple_expr(self, node) + table_alias, col_obj = self._binder_context.get_binded_column( + node.name, node.table_alias + ) + node.table_alias = table_alias + if node.name == VideoColumnName.audio: + self._binder_context.enable_audio_retrieval() + if node.name == VideoColumnName.data: + self._binder_context.enable_video_retrieval() + node.col_alias = "{}.{}".format(table_alias, node.name.lower()) + node.col_object = col_obj @bind.register(FunctionExpression) def _bind_func_expr(self, node: FunctionExpression): - # setup the context - # we read the GPUs from the catalog and populate in the context - gpus_ids = self._catalog().get_configuration_catalog_value("gpu_ids") - node._context = Context(gpus_ids) - # handle the special case of "extract_object" if node.name.upper() == str(FunctionType.EXTRACT_OBJECT): handle_bind_extract_object_function(node, self) @@ -346,18 +340,9 @@ def _bind_func_expr(self, node: FunctionExpression): ) # certain functions take additional inputs like yolo needs the model_name # these arguments are passed by the user as part of metadata - # we also handle the special case of ChatGPT where we need to send the - # OpenAPI key as part of the parameter if not provided by the user - properties = get_metadata_properties(function_obj) - if string_comparison_case_insensitive(node.name, "CHATGPT"): - # if the user didn't provide any API_KEY, check if we have one in the catalog - if "OPENAI_API_KEY" not in properties.keys(): - openapi_key = self._catalog().get_configuration_catalog_value( - "OPENAI_API_KEY" - ) - properties["openai_api_key"] = openapi_key - - node.function = lambda: function_class(**properties) + node.function = lambda: function_class( + **get_metadata_properties(function_obj) + ) except Exception as e: err_msg = ( f"{str(e)}. Please verify that the function class name in the " diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 52286ab790..1f056a5e92 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -440,8 +440,7 @@ def get_optuna_config(trial): model_args["backend"] = "optuna" model_args["h"] = horizon - if conf > 0: - model_args["loss"] = MQLoss(level=[conf]) + model_args["loss"] = MQLoss(level=[conf]) model = NeuralForecast( [model_here(**model_args)], diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 8638837d18..7accc552fc 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -62,12 +62,9 @@ def setup( def forward(self, data) -> pd.DataFrame: if self.library == "statsforecast": - if self.conf > 0: - forecast_df = self.model.predict( - h=self.horizon, level=[self.conf] - ).reset_index() - else: - forecast_df = self.model.predict(h=self.horizon).reset_index() + forecast_df = self.model.predict( + h=self.horizon, level=[self.conf] + ).reset_index() else: forecast_df = self.model.predict().reset_index() @@ -88,30 +85,19 @@ def forward(self, data) -> pd.DataFrame: for suggestion in set(suggestion_list): print("\nSUGGESTION: " + self.suggestion_dict[suggestion]) - if self.conf > 0: - forecast_df = forecast_df.rename( - columns={ - "unique_id": self.id_column_rename, - "ds": self.time_column_rename, - self.model_name: self.predict_column_rename, - self.model_name - + "-lo-" - + str(self.conf): self.predict_column_rename - + "-lo-" - + str(self.conf), - self.model_name - + "-hi-" - + str(self.conf): self.predict_column_rename - + "-hi-" - + str(self.conf), - } - )[: self.horizon * forecast_df["unique_id"].nunique()] - else: - forecast_df = forecast_df.rename( - columns={ - "unique_id": self.id_column_rename, - "ds": self.time_column_rename, - self.model_name: self.predict_column_rename, - } - )[: self.horizon * forecast_df["unique_id"].nunique()] + forecast_df = forecast_df.rename( + columns={ + "unique_id": self.id_column_rename, + "ds": self.time_column_rename, + self.model_name: self.predict_column_rename, + self.model_name + + "-lo-" + + str(self.conf): self.predict_column_rename + + "-lo", + self.model_name + + "-hi-" + + str(self.conf): self.predict_column_rename + + "-hi", + } + )[: self.horizon * forecast_df["unique_id"].nunique()] return forecast_df From 883c59881b3cc569e61836f50b0ef5f853c75e49 Mon Sep 17 00:00:00 2001 From: americast Date: Tue, 24 Oct 2023 02:16:29 -0400 Subject: [PATCH 05/21] add docs --- .../source/reference/ai/model-forecasting.rst | 2 ++ evadb/binder/statement_binder.py | 19 +++++++++++++++++-- evadb/executor/create_function_executor.py | 7 ++++++- 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst index 8dcd32aea9..d5c7e41833 100644 --- a/docs/source/reference/ai/model-forecasting.rst +++ b/docs/source/reference/ai/model-forecasting.rst @@ -63,6 +63,8 @@ EvaDB's default forecast framework is `statsforecast `_ to learn details about these models. If LIBRARY is `neuralforecast`, we can select one of NHITS or NBEATS. The default is NBEATS. Check `NBEATS docs `_ for details. * - AUTO (str, default: 'T') - If set to 'T', it enables automatic hyperparameter optimization. Must be set to 'T' for `statsforecast` library. One may set this parameter to `false` if LIBRARY is `neuralforecast` for faster (but less reliable) results. + * - CONF (int, default: 90) + - Sets the confidence interval in percentage for the forecast. Must be a number between 0 and 100. The lower and upper bounds of the confidence interval are returned in two separate columns, named as the PREDICT column with `-lo` and `-hi` suffixes. * - Frequency (str, default: 'auto') - A string indicating the frequency of the data. The common used ones are D, W, M, Y, which respectively represents day-, week-, month- and year- end frequency. The default value is M. Check `pandas available frequencies `_ for all available frequencies. If it is not provided, the frequency is attempted to be determined automatically. diff --git a/evadb/binder/statement_binder.py b/evadb/binder/statement_binder.py index 1a1501f70b..0af675e110 100644 --- a/evadb/binder/statement_binder.py +++ b/evadb/binder/statement_binder.py @@ -98,7 +98,7 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement): for column in all_column_list: if column.name in predict_columns: column.name = column.name + "_predictions" - + outputs.append(column) else: inputs.append(column) @@ -137,7 +137,22 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement): assert ( len(node.inputs) == 0 and len(node.outputs) == 0 ), f"{node.function_type} functions' input and output are auto assigned" - outputs.extend([ColumnDefinition(arg_map.get("predict", "y")+"-lo", ColumnType.INTEGER, None, None), ColumnDefinition(arg_map.get("predict", "y")+"-hi", ColumnType.INTEGER, None, None)]) + outputs.extend( + [ + ColumnDefinition( + arg_map.get("predict", "y") + "-lo", + ColumnType.INTEGER, + None, + None, + ), + ColumnDefinition( + arg_map.get("predict", "y") + "-hi", + ColumnType.INTEGER, + None, + None, + ), + ] + ) node.inputs, node.outputs = inputs, outputs @bind.register(SelectStatement) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 1f056a5e92..c31a1fb8bf 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -340,12 +340,17 @@ def handle_forecasting_function(self): try: conf = round(arg_map["conf"]) except Exception: - err_msg = "Confidence must be a number" + err_msg = "Confidence must be a number." logger.error(err_msg) raise FunctionIODefinitionError(err_msg) else: conf = 90 + if conf > 100: + err_msg = "Confidence must <= 100." + logger.error(err_msg) + raise FunctionIODefinitionError(err_msg) + data = aggregated_batch.frames if "unique_id" not in list(data.columns): data["unique_id"] = [1 for x in range(len(data))] From 4f6ea4a3a5260a4aa6ab8e537a67b3c97fac4c17 Mon Sep 17 00:00:00 2001 From: americast Date: Tue, 24 Oct 2023 10:28:47 -0400 Subject: [PATCH 06/21] fix tests --- evadb/binder/statement_binder.py | 32 ++++++++-------- evadb/executor/create_function_executor.py | 2 +- evadb/functions/forecast.py | 5 ++- .../long/test_model_forecasting.py | 25 +++++++++++-- .../binder/test_statement_binder.py | 37 ++++++++++++++++++- 5 files changed, 77 insertions(+), 24 deletions(-) diff --git a/evadb/binder/statement_binder.py b/evadb/binder/statement_binder.py index 0af675e110..1e1faac1f2 100644 --- a/evadb/binder/statement_binder.py +++ b/evadb/binder/statement_binder.py @@ -130,6 +130,22 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement): assert ( len(required_columns) == 0 ), f"Missing required {required_columns} columns for forecasting function." + outputs.extend( + [ + ColumnDefinition( + arg_map.get("predict", "y") + "-lo", + ColumnType.FLOAT, + None, + None, + ), + ColumnDefinition( + arg_map.get("predict", "y") + "-hi", + ColumnType.FLOAT, + None, + None, + ), + ] + ) else: raise BinderError( f"Unsupported type of function: {node.function_type}." @@ -137,22 +153,6 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement): assert ( len(node.inputs) == 0 and len(node.outputs) == 0 ), f"{node.function_type} functions' input and output are auto assigned" - outputs.extend( - [ - ColumnDefinition( - arg_map.get("predict", "y") + "-lo", - ColumnType.INTEGER, - None, - None, - ), - ColumnDefinition( - arg_map.get("predict", "y") + "-hi", - ColumnType.INTEGER, - None, - None, - ), - ] - ) node.inputs, node.outputs = inputs, outputs @bind.register(SelectStatement) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index c31a1fb8bf..8538b11056 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -493,7 +493,7 @@ def get_optuna_config(trial): model_save_dir_name = ( library + "_" + arg_map["model"] + "_" + new_freq if "statsforecast" in library - else library + "_" + conf + "_" + arg_map["model"] + "_" + new_freq + else library + "_" + str(conf) + "_" + arg_map["model"] + "_" + new_freq ) if len(data.columns) >= 4 and library == "neuralforecast": model_save_dir_name += "_exogenous_" + str(sorted(exogenous_columns)) diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 7accc552fc..56be87dcc9 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -84,12 +84,13 @@ def forward(self, data) -> pd.DataFrame: for suggestion in set(suggestion_list): print("\nSUGGESTION: " + self.suggestion_dict[suggestion]) - forecast_df = forecast_df.rename( columns={ "unique_id": self.id_column_rename, "ds": self.time_column_rename, - self.model_name: self.predict_column_rename, + self.model_name + if self.library == "statsforecast" + else self.model_name + "-median": self.predict_column_rename, self.model_name + "-lo-" + str(self.conf): self.predict_column_rename diff --git a/test/integration_tests/long/test_model_forecasting.py b/test/integration_tests/long/test_model_forecasting.py index 47ffe65a83..76e5562357 100644 --- a/test/integration_tests/long/test_model_forecasting.py +++ b/test/integration_tests/long/test_model_forecasting.py @@ -94,7 +94,14 @@ def test_forecast(self): result = execute_query_fetch_all(self.evadb, predict_query) self.assertEqual(len(result), 12) self.assertEqual( - result.columns, ["airforecast.unique_id", "airforecast.ds", "airforecast.y"] + result.columns, + [ + "airforecast.unique_id", + "airforecast.ds", + "airforecast.y", + "airforecast.y-lo", + "airforecast.y-hi", + ], ) create_predict_udf = """ @@ -116,7 +123,13 @@ def test_forecast(self): self.assertEqual(len(result), 24) self.assertEqual( result.columns, - ["airpanelforecast.unique_id", "airpanelforecast.ds", "airpanelforecast.y"], + [ + "airpanelforecast.unique_id", + "airpanelforecast.ds", + "airpanelforecast.y", + "airpanelforecast.y-lo", + "airpanelforecast.y-hi", + ], ) @forecast_skip_marker @@ -143,7 +156,13 @@ def test_forecast_with_column_rename(self): self.assertEqual(len(result), 24) self.assertEqual( result.columns, - ["homeforecast.type", "homeforecast.saledate", "homeforecast.ma"], + [ + "homeforecast.type", + "homeforecast.saledate", + "homeforecast.ma", + "homeforecast.ma-lo", + "homeforecast.ma-hi", + ], ) diff --git a/test/unit_tests/binder/test_statement_binder.py b/test/unit_tests/binder/test_statement_binder.py index d6642ea9a2..9e593faf23 100644 --- a/test/unit_tests/binder/test_statement_binder.py +++ b/test/unit_tests/binder/test_statement_binder.py @@ -475,6 +475,16 @@ def test_bind_create_function_should_bind_forecast_with_default_columns(self): array_type=MagicMock(), array_dimensions=MagicMock(), ) + y_lo_col_obj = ColumnCatalogEntry( + name="y-lo", + type=ColumnType.FLOAT, + array_type=None, + ) + y_hi_col_obj = ColumnCatalogEntry( + name="y-hi", + type=ColumnType.FLOAT, + array_type=None, + ) create_function_statement.query.target_list = [ TupleValueExpression( name=id_col_obj.name, table_alias="a", col_object=id_col_obj @@ -506,9 +516,16 @@ def test_bind_create_function_should_bind_forecast_with_default_columns(self): col_obj.array_type, col_obj.array_dimensions, ) - for col_obj in (id_col_obj, ds_col_obj, y_col_obj) + for col_obj in ( + id_col_obj, + ds_col_obj, + y_col_obj, + y_lo_col_obj, + y_hi_col_obj, + ) ] ) + print(create_function_statement.outputs) self.assertEqual(create_function_statement.inputs, expected_inputs) self.assertEqual(create_function_statement.outputs, expected_outputs) @@ -534,6 +551,16 @@ def test_bind_create_function_should_bind_forecast_with_renaming_columns(self): array_type=MagicMock(), array_dimensions=MagicMock(), ) + y_lo_col_obj = ColumnCatalogEntry( + name="ma-lo", + type=ColumnType.FLOAT, + array_type=None, + ) + y_hi_col_obj = ColumnCatalogEntry( + name="ma-hi", + type=ColumnType.FLOAT, + array_type=None, + ) create_function_statement.query.target_list = [ TupleValueExpression( name=id_col_obj.name, table_alias="a", col_object=id_col_obj @@ -569,7 +596,13 @@ def test_bind_create_function_should_bind_forecast_with_renaming_columns(self): col_obj.array_type, col_obj.array_dimensions, ) - for col_obj in (id_col_obj, ds_col_obj, y_col_obj) + for col_obj in ( + id_col_obj, + ds_col_obj, + y_col_obj, + y_lo_col_obj, + y_hi_col_obj, + ) ] ) self.assertEqual(create_function_statement.inputs, expected_inputs) From 70088d2c9ab3622f53020f6a0180b5ded1d1f129 Mon Sep 17 00:00:00 2001 From: americast Date: Thu, 26 Oct 2023 20:34:45 -0400 Subject: [PATCH 07/21] metrics for statsforecast --- evadb/executor/create_function_executor.py | 7 ++-- evadb/functions/forecast.py | 32 ++++++++++++++++++- .../binder/test_statement_binder.py | 1 - 3 files changed, 36 insertions(+), 4 deletions(-) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 8538b11056..67dfa8c584 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -485,7 +485,7 @@ def get_optuna_config(trial): raise FunctionIODefinitionError(err_msg) model = StatsForecast( - [model_here(season_length=season_length)], freq=new_freq + [model_here(season_length=season_length)], freq=new_freq, n_jobs=-1 ) data["ds"] = pd.to_datetime(data["ds"]) @@ -550,7 +550,7 @@ def get_optuna_config(trial): model_path = os.path.join(model_dir, existing_model_files[-1]) io_list = self._resolve_function_io(None) - + data["ds"] = data.ds.astype(str) metadata_here = [ FunctionMetadataCatalogEntry("model_name", arg_map["model"]), FunctionMetadataCatalogEntry("model_path", model_path), @@ -566,6 +566,9 @@ def get_optuna_config(trial): FunctionMetadataCatalogEntry("horizon", horizon), FunctionMetadataCatalogEntry("library", library), FunctionMetadataCatalogEntry("conf", conf), + FunctionMetadataCatalogEntry( + "data", data.to_json(path_or_buf=None, orient="split") + ), ] return ( diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 56be87dcc9..0cb1e4af3c 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -16,7 +16,9 @@ import pickle +import numpy as np import pandas as pd +from sklearn.metrics import mean_squared_error from evadb.functions.abstract.abstract_function import AbstractFunction from evadb.functions.decorators.decorators import setup @@ -38,6 +40,7 @@ def setup( horizon: int, library: str, conf: int, + data: pd.DataFrame, ): self.library = library if "neuralforecast" in self.library: @@ -59,6 +62,8 @@ def setup( 1: "Predictions are flat. Consider using LIBRARY 'neuralforecast' for more accrate predictions.", } self.conf = conf + self.training_data = pd.read_json(data, orient="split") + self.training_data.ds = pd.to_datetime(self.training_data.ds) def forward(self, data) -> pd.DataFrame: if self.library == "statsforecast": @@ -68,8 +73,9 @@ def forward(self, data) -> pd.DataFrame: else: forecast_df = self.model.predict().reset_index() - # Suggestions + # Feedback if len(data) == 0 or list(list(data.iloc[0]))[0] is True: + # Suggestions suggestion_list = [] # 1: Flat predictions if self.library == "statsforecast": @@ -84,6 +90,30 @@ def forward(self, data) -> pd.DataFrame: for suggestion in set(suggestion_list): print("\nSUGGESTION: " + self.suggestion_dict[suggestion]) + + # Metrics + if self.library == "statsforecast": + crossvalidation_df = self.model.cross_validation( + df=self.training_data[["ds", "y", "unique_id"]], + h=self.horizon, + step_size=24, + n_windows=1, + ).reset_index() + rmses = [] + for uid in crossvalidation_df.unique_id.unique(): + crossvalidation_df_here = crossvalidation_df[ + crossvalidation_df.unique_id == uid + ] + rmses.append( + mean_squared_error( + crossvalidation_df_here.y, + crossvalidation_df_here[self.model_name], + squared=False, + ) + / np.mean(crossvalidation_df_here.y) + ) + print("\nMean normalized RMSE: " + str(np.mean(rmses))) + forecast_df = forecast_df.rename( columns={ "unique_id": self.id_column_rename, diff --git a/test/unit_tests/binder/test_statement_binder.py b/test/unit_tests/binder/test_statement_binder.py index 9e593faf23..3d3ebff3ad 100644 --- a/test/unit_tests/binder/test_statement_binder.py +++ b/test/unit_tests/binder/test_statement_binder.py @@ -525,7 +525,6 @@ def test_bind_create_function_should_bind_forecast_with_default_columns(self): ) ] ) - print(create_function_statement.outputs) self.assertEqual(create_function_statement.inputs, expected_inputs) self.assertEqual(create_function_statement.outputs, expected_outputs) From 619255462f744140a37415524c804c525e1e8d12 Mon Sep 17 00:00:00 2001 From: americast Date: Sun, 29 Oct 2023 05:29:32 -0400 Subject: [PATCH 08/21] neuralforecast rmse --- evadb/executor/create_function_executor.py | 46 ++++++++++++++++++++-- evadb/functions/forecast.py | 29 ++------------ 2 files changed, 45 insertions(+), 30 deletions(-) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 67dfa8c584..d9016dcc73 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -21,7 +21,9 @@ from pathlib import Path from typing import Dict, List +import numpy as np import pandas as pd +from sklearn.metrics import mean_squared_error from evadb.catalog.catalog_utils import get_metadata_properties from evadb.catalog.models.function_catalog import FunctionCatalogEntry @@ -526,6 +528,7 @@ def get_optuna_config(trial): data[column] = data.apply( lambda x: self.convert_to_numeric(x[column]), axis=1 ) + rmses = [] if library == "neuralforecast": cuda_devices_here = "0" if "CUDA_VISIBLE_DEVICES" in os.environ: @@ -534,6 +537,24 @@ def get_optuna_config(trial): with set_env(CUDA_VISIBLE_DEVICES=cuda_devices_here): model.fit(df=data, val_size=horizon) model.save(model_path, overwrite=True) + crossvalidation_df = model.cross_validation( + df=data, val_size=horizon + ) + for uid in crossvalidation_df.unique_id.unique(): + crossvalidation_df_here = crossvalidation_df[ + crossvalidation_df.unique_id == uid + ] + rmses.append( + mean_squared_error( + crossvalidation_df_here.y, + crossvalidation_df_here[arg_map["model"] + "-median"], + squared=False, + ) + / np.mean(crossvalidation_df_here.y) + ) + mean_rmse = np.mean(rmses) + with open(model_path + "_rmse", "w") as f: + f.write(str(mean_rmse) + "\n") else: # The following lines of code helps eliminate the math error encountered in statsforecast when only one datapoint is available in a time series for col in data["unique_id"].unique(): @@ -546,9 +567,29 @@ def get_optuna_config(trial): f = open(model_path, "wb") pickle.dump(model, f) f.close() + crossvalidation_df = model.cross_validation( + df=data[["ds", "y", "unique_id"]], + h=horizon, + step_size=24, + n_windows=1, + ).reset_index() + for uid in crossvalidation_df.unique_id.unique(): + crossvalidation_df_here = crossvalidation_df[ + crossvalidation_df.unique_id == uid + ] + rmses.append( + mean_squared_error( + crossvalidation_df_here.y, + crossvalidation_df_here[arg_map["model"]], + squared=False, + ) + / np.mean(crossvalidation_df_here.y) + ) + mean_rmse = np.mean(rmses) + with open(model_path + "_rmse", "w") as f: + f.write(str(mean_rmse) + "\n") elif not Path(model_path).exists(): model_path = os.path.join(model_dir, existing_model_files[-1]) - io_list = self._resolve_function_io(None) data["ds"] = data.ds.astype(str) metadata_here = [ @@ -566,9 +607,6 @@ def get_optuna_config(trial): FunctionMetadataCatalogEntry("horizon", horizon), FunctionMetadataCatalogEntry("library", library), FunctionMetadataCatalogEntry("conf", conf), - FunctionMetadataCatalogEntry( - "data", data.to_json(path_or_buf=None, orient="split") - ), ] return ( diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 0cb1e4af3c..5062385261 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -16,9 +16,7 @@ import pickle -import numpy as np import pandas as pd -from sklearn.metrics import mean_squared_error from evadb.functions.abstract.abstract_function import AbstractFunction from evadb.functions.decorators.decorators import setup @@ -40,7 +38,6 @@ def setup( horizon: int, library: str, conf: int, - data: pd.DataFrame, ): self.library = library if "neuralforecast" in self.library: @@ -62,8 +59,8 @@ def setup( 1: "Predictions are flat. Consider using LIBRARY 'neuralforecast' for more accrate predictions.", } self.conf = conf - self.training_data = pd.read_json(data, orient="split") - self.training_data.ds = pd.to_datetime(self.training_data.ds) + with open(model_path + "_rmse", "r") as f: + self.rmse = float(f.readline()) def forward(self, data) -> pd.DataFrame: if self.library == "statsforecast": @@ -92,27 +89,7 @@ def forward(self, data) -> pd.DataFrame: print("\nSUGGESTION: " + self.suggestion_dict[suggestion]) # Metrics - if self.library == "statsforecast": - crossvalidation_df = self.model.cross_validation( - df=self.training_data[["ds", "y", "unique_id"]], - h=self.horizon, - step_size=24, - n_windows=1, - ).reset_index() - rmses = [] - for uid in crossvalidation_df.unique_id.unique(): - crossvalidation_df_here = crossvalidation_df[ - crossvalidation_df.unique_id == uid - ] - rmses.append( - mean_squared_error( - crossvalidation_df_here.y, - crossvalidation_df_here[self.model_name], - squared=False, - ) - / np.mean(crossvalidation_df_here.y) - ) - print("\nMean normalized RMSE: " + str(np.mean(rmses))) + print("\nMean normalized RMSE: " + str(self.rmse)) forecast_df = forecast_df.rename( columns={ From ed8de98b6f0350403b56bee7cfc2e5c3b9e68e47 Mon Sep 17 00:00:00 2001 From: americast Date: Wed, 1 Nov 2023 12:18:36 -0400 Subject: [PATCH 09/21] reveal hyperparams --- evadb/executor/create_function_executor.py | 6 ++++++ evadb/functions/forecast.py | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index d9016dcc73..b1aef3e027 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -564,6 +564,11 @@ def get_optuna_config(trial): ) model.fit(df=data[["ds", "y", "unique_id"]]) + hypers = "" + if "arima" in arg_map["model"].lower(): + from statsforecast.arima import arima_string + + hypers += arima_string(model.fitted_[0, 0].model_) f = open(model_path, "wb") pickle.dump(model, f) f.close() @@ -588,6 +593,7 @@ def get_optuna_config(trial): mean_rmse = np.mean(rmses) with open(model_path + "_rmse", "w") as f: f.write(str(mean_rmse) + "\n") + f.write(hypers + "\n") elif not Path(model_path).exists(): model_path = os.path.join(model_dir, existing_model_files[-1]) io_list = self._resolve_function_io(None) diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 5062385261..6d9ee1288b 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -59,8 +59,11 @@ def setup( 1: "Predictions are flat. Consider using LIBRARY 'neuralforecast' for more accrate predictions.", } self.conf = conf + self.hypers = None with open(model_path + "_rmse", "r") as f: self.rmse = float(f.readline()) + if "arima" in model_name.lower(): + self.hypers = "p,d,q: " + f.readline() def forward(self, data) -> pd.DataFrame: if self.library == "statsforecast": @@ -90,6 +93,8 @@ def forward(self, data) -> pd.DataFrame: # Metrics print("\nMean normalized RMSE: " + str(self.rmse)) + if self.hypers is not None: + print("Hyperparameters: " + self.hypers) forecast_df = forecast_df.rename( columns={ From a83c30d8e965b159da474bb690a1d48a2313425f Mon Sep 17 00:00:00 2001 From: americast Date: Wed, 1 Nov 2023 12:20:58 -0400 Subject: [PATCH 10/21] update docs --- docs/source/reference/ai/model-forecasting.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst index d5c7e41833..2f188fde4a 100644 --- a/docs/source/reference/ai/model-forecasting.rst +++ b/docs/source/reference/ai/model-forecasting.rst @@ -3,6 +3,7 @@ Time Series Forecasting ======================== +A Time Series is a series of data points recorded at different time intervals. Time series forecasting involves estimating future values of a time series by analyzing historical data. You can train a forecasting model easily in EvaDB. .. note:: From 01719269f4e512e8beb7799e1af89d362ca4c720 Mon Sep 17 00:00:00 2001 From: americast Date: Thu, 2 Nov 2023 00:52:27 -0400 Subject: [PATCH 11/21] Add more DL models --- evadb/executor/create_function_executor.py | 52 +++++++++++++++++++--- evadb/functions/forecast.py | 14 +++--- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index b1aef3e027..60f8dc522d 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -394,19 +394,55 @@ def handle_forecasting_function(self): if library == "neuralforecast": try_to_import_neuralforecast() from neuralforecast import NeuralForecast - from neuralforecast.auto import AutoNBEATS, AutoNHITS + from neuralforecast.auto import ( + AutoDeepAR, + AutoFEDformer, + AutoInformer, + AutoNBEATS, + AutoNHITS, + AutoPatchTST, + AutoTFT, + AutoTimesNet, + ) + + # from neuralforecast.auto import AutoAutoformer as AutoAFormer from neuralforecast.losses.pytorch import MQLoss - from neuralforecast.models import NBEATS, NHITS + from neuralforecast.models import ( + NBEATS, + NHITS, + TFT, + DeepAR, + FEDformer, + Informer, + PatchTST, + TimesNet, + ) + + # from neuralforecast.models import Autoformer as AFormer model_dict = { "AutoNBEATS": AutoNBEATS, "AutoNHITS": AutoNHITS, "NBEATS": NBEATS, "NHITS": NHITS, + "PatchTST": PatchTST, + "AutoPatchTST": AutoPatchTST, + "DeepAR": DeepAR, + "AutoDeepAR": AutoDeepAR, + "FEDformer": FEDformer, + "AutoFEDformer": AutoFEDformer, + # "AFormer": AFormer, + # "AutoAFormer": AutoAFormer, + "Informer": Informer, + "AutoInformer": AutoInformer, + "TimesNet": TimesNet, + "AutoTimesNet": AutoTimesNet, + "TFT": TFT, + "AutoTFT": AutoTFT, } if "model" not in arg_map.keys(): - arg_map["model"] = "NBEATS" + arg_map["model"] = "TFT" if "auto" not in arg_map.keys() or ( arg_map["auto"].lower()[0] == "t" @@ -440,11 +476,13 @@ def handle_forecasting_function(self): else: model_args_config["hist_exog_list"] = exogenous_columns - def get_optuna_config(trial): - return model_args_config + if "auto" in arg_map["model"].lower(): + + def get_optuna_config(trial): + return model_args_config - model_args["config"] = get_optuna_config - model_args["backend"] = "optuna" + model_args["config"] = get_optuna_config + model_args["backend"] = "optuna" model_args["h"] = horizon model_args["loss"] = MQLoss(level=[conf]) diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 6d9ee1288b..8f682ba3f3 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -14,6 +14,7 @@ # limitations under the License. +import os import pickle import pandas as pd @@ -60,10 +61,12 @@ def setup( } self.conf = conf self.hypers = None - with open(model_path + "_rmse", "r") as f: - self.rmse = float(f.readline()) - if "arima" in model_name.lower(): - self.hypers = "p,d,q: " + f.readline() + self.rmse = None + if os.path.isfile(model_path + "_rmse"): + with open(model_path + "_rmse", "r") as f: + self.rmse = float(f.readline()) + if "arima" in model_name.lower(): + self.hypers = "p,d,q: " + f.readline() def forward(self, data) -> pd.DataFrame: if self.library == "statsforecast": @@ -92,7 +95,8 @@ def forward(self, data) -> pd.DataFrame: print("\nSUGGESTION: " + self.suggestion_dict[suggestion]) # Metrics - print("\nMean normalized RMSE: " + str(self.rmse)) + if self.rmse is not None: + print("\nMean normalized RMSE: " + str(self.rmse)) if self.hypers is not None: print("Hyperparameters: " + self.hypers) From fc48003f26811c9f9994634d33e193446fc22581 Mon Sep 17 00:00:00 2001 From: americast Date: Thu, 2 Nov 2023 20:09:17 -0400 Subject: [PATCH 12/21] metrics optional --- .../source/reference/ai/model-forecasting.rst | 4 +- evadb/executor/create_function_executor.py | 80 ++++++++++--------- 2 files changed, 45 insertions(+), 39 deletions(-) diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst index 2f188fde4a..bd095a68fb 100644 --- a/docs/source/reference/ai/model-forecasting.rst +++ b/docs/source/reference/ai/model-forecasting.rst @@ -66,8 +66,10 @@ EvaDB's default forecast framework is `statsforecast `_ for all available frequencies. If it is not provided, the frequency is attempted to be determined automatically. + * - METRICS (str, default: 'True') + - Compute NRMSE by performing cross-validation. It is `False` by default if `LIBRARY` is `neuralforecast` as it can take an extensively long time. Note: If columns other than the ones required as mentioned above are passed while creating the function, they will be treated as exogenous variables if LIBRARY is `neuralforecast`. Otherwise, they would be ignored. diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 60f8dc522d..dc8d0dcc54 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -575,24 +575,27 @@ def get_optuna_config(trial): with set_env(CUDA_VISIBLE_DEVICES=cuda_devices_here): model.fit(df=data, val_size=horizon) model.save(model_path, overwrite=True) - crossvalidation_df = model.cross_validation( - df=data, val_size=horizon - ) - for uid in crossvalidation_df.unique_id.unique(): - crossvalidation_df_here = crossvalidation_df[ - crossvalidation_df.unique_id == uid - ] - rmses.append( - mean_squared_error( - crossvalidation_df_here.y, - crossvalidation_df_here[arg_map["model"] + "-median"], - squared=False, - ) - / np.mean(crossvalidation_df_here.y) + if "metrics" in arg_map and arg_map["metrics"].lower()[0] == "t": + crossvalidation_df = model.cross_validation( + df=data, val_size=horizon ) - mean_rmse = np.mean(rmses) - with open(model_path + "_rmse", "w") as f: - f.write(str(mean_rmse) + "\n") + for uid in crossvalidation_df.unique_id.unique(): + crossvalidation_df_here = crossvalidation_df[ + crossvalidation_df.unique_id == uid + ] + rmses.append( + mean_squared_error( + crossvalidation_df_here.y, + crossvalidation_df_here[ + arg_map["model"] + "-median" + ], + squared=False, + ) + / np.mean(crossvalidation_df_here.y) + ) + mean_rmse = np.mean(rmses) + with open(model_path + "_rmse", "w") as f: + f.write(str(mean_rmse) + "\n") else: # The following lines of code helps eliminate the math error encountered in statsforecast when only one datapoint is available in a time series for col in data["unique_id"].unique(): @@ -610,28 +613,29 @@ def get_optuna_config(trial): f = open(model_path, "wb") pickle.dump(model, f) f.close() - crossvalidation_df = model.cross_validation( - df=data[["ds", "y", "unique_id"]], - h=horizon, - step_size=24, - n_windows=1, - ).reset_index() - for uid in crossvalidation_df.unique_id.unique(): - crossvalidation_df_here = crossvalidation_df[ - crossvalidation_df.unique_id == uid - ] - rmses.append( - mean_squared_error( - crossvalidation_df_here.y, - crossvalidation_df_here[arg_map["model"]], - squared=False, + if "metrics" in arg_map and arg_map["metrics"].lower()[0] == "f": + crossvalidation_df = model.cross_validation( + df=data[["ds", "y", "unique_id"]], + h=horizon, + step_size=24, + n_windows=1, + ).reset_index() + for uid in crossvalidation_df.unique_id.unique(): + crossvalidation_df_here = crossvalidation_df[ + crossvalidation_df.unique_id == uid + ] + rmses.append( + mean_squared_error( + crossvalidation_df_here.y, + crossvalidation_df_here[arg_map["model"]], + squared=False, + ) + / np.mean(crossvalidation_df_here.y) ) - / np.mean(crossvalidation_df_here.y) - ) - mean_rmse = np.mean(rmses) - with open(model_path + "_rmse", "w") as f: - f.write(str(mean_rmse) + "\n") - f.write(hypers + "\n") + mean_rmse = np.mean(rmses) + with open(model_path + "_rmse", "w") as f: + f.write(str(mean_rmse) + "\n") + f.write(hypers + "\n") elif not Path(model_path).exists(): model_path = os.path.join(model_dir, existing_model_files[-1]) io_list = self._resolve_function_io(None) From 56a401a5c989ca76439d475160b851d5a1f100f8 Mon Sep 17 00:00:00 2001 From: americast Date: Tue, 7 Nov 2023 18:09:42 -0500 Subject: [PATCH 13/21] fix binder error --- evadb/binder/statement_binder.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/evadb/binder/statement_binder.py b/evadb/binder/statement_binder.py index 89ec980424..edc49677a1 100644 --- a/evadb/binder/statement_binder.py +++ b/evadb/binder/statement_binder.py @@ -29,7 +29,6 @@ from evadb.binder.statement_binder_context import StatementBinderContext from evadb.catalog.catalog_type import ColumnType, TableType from evadb.catalog.catalog_utils import is_document_table -from evadb.catalog.sql_config import RESTRICTED_COL_NAMES from evadb.expression.abstract_expression import AbstractExpression, ExpressionType from evadb.expression.function_expression import FunctionExpression from evadb.expression.tuple_value_expression import TupleValueExpression @@ -266,16 +265,9 @@ def _bind_tableref(self, node: TableRef): @bind.register(TupleValueExpression) def _bind_tuple_expr(self, node: TupleValueExpression): - table_alias, col_obj = self._binder_context.get_binded_column( - node.name, node.table_alias - ) - node.table_alias = table_alias - if node.name == VideoColumnName.audio: - self._binder_context.enable_audio_retrieval() - if node.name == VideoColumnName.data: - self._binder_context.enable_video_retrieval() - node.col_alias = "{}.{}".format(table_alias, node.name.lower()) - node.col_object = col_obj + from evadb.binder.tuple_value_expression_binder import bind_tuple_expr + + bind_tuple_expr(self, node) @bind.register(FunctionExpression) def _bind_func_expr(self, node: FunctionExpression): From 30b0fb1424a9570635b47362fcdada36486f602c Mon Sep 17 00:00:00 2001 From: americast Date: Tue, 7 Nov 2023 18:16:58 -0500 Subject: [PATCH 14/21] update docs --- .circleci/config.yml | 2 +- docs/source/reference/ai/model-forecasting.rst | 15 ++++++++++----- evadb/functions/forecast.py | 9 ++++++--- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index af2c5ea078..50c966fb3d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -480,7 +480,7 @@ jobs: - checkout - run: name: Install EvaDB package from GitHub repo and run tests - no_output_timeout: 30m # 30 minute timeout + no_output_timeout: 40m # 40 minute timeout command: | python -m venv test_evadb source test_evadb/bin/activate diff --git a/docs/source/reference/ai/model-forecasting.rst b/docs/source/reference/ai/model-forecasting.rst index bd095a68fb..589d5b23e7 100644 --- a/docs/source/reference/ai/model-forecasting.rst +++ b/docs/source/reference/ai/model-forecasting.rst @@ -38,9 +38,6 @@ This trains a forecasting model. The model can be called by providing the horizo SELECT Forecast(); -.. note:: - - `Forecasting` function also provides suggestions by default. If you wish to turn it off, send `FALSE` as an optional argument while calling the function. Eg. `SELECT Forecast(FALSE);` Forecast Parameters ------------------- @@ -69,9 +66,17 @@ EvaDB's default forecast framework is `statsforecast `_ for all available frequencies. If it is not provided, the frequency is attempted to be determined automatically. * - METRICS (str, default: 'True') - - Compute NRMSE by performing cross-validation. It is `False` by default if `LIBRARY` is `neuralforecast` as it can take an extensively long time. + - Compute NRMSE by performing cross-validation. It is `False` by default if `LIBRARY` is `neuralforecast` as it can take an extensively long time. The metrics are logged locally. + +.. note:: + + If columns other than the ones required as mentioned above are passed while creating the function, they will be treated as exogenous variables if LIBRARY is `neuralforecast`. Otherwise, they would be ignored. + + +.. note:: + + `Forecasting` function also logs suggestions. Logged information, such as metrics and suggestions, is sent to STDOUT by default. If you wish not to print it, please send `FALSE` as an optional argument while calling the function. Eg. `SELECT Forecast(FALSE);` -Note: If columns other than the ones required as mentioned above are passed while creating the function, they will be treated as exogenous variables if LIBRARY is `neuralforecast`. Otherwise, they would be ignored. Below is an example query specifying the above parameters: diff --git a/evadb/functions/forecast.py b/evadb/functions/forecast.py index 8f682ba3f3..6041e6b499 100644 --- a/evadb/functions/forecast.py +++ b/evadb/functions/forecast.py @@ -69,6 +69,7 @@ def setup( self.hypers = "p,d,q: " + f.readline() def forward(self, data) -> pd.DataFrame: + log_str = "" if self.library == "statsforecast": forecast_df = self.model.predict( h=self.horizon, level=[self.conf] @@ -92,13 +93,15 @@ def forward(self, data) -> pd.DataFrame: suggestion_list.append(1) for suggestion in set(suggestion_list): - print("\nSUGGESTION: " + self.suggestion_dict[suggestion]) + log_str += "\nSUGGESTION: " + self.suggestion_dict[suggestion] # Metrics if self.rmse is not None: - print("\nMean normalized RMSE: " + str(self.rmse)) + log_str += "\nMean normalized RMSE: " + str(self.rmse) if self.hypers is not None: - print("Hyperparameters: " + self.hypers) + log_str += "\nHyperparameters: " + self.hypers + + print(log_str) forecast_df = forecast_df.rename( columns={ From f3f02bebffb8c5f3efb53f2b89e922b07dcc0bcf Mon Sep 17 00:00:00 2001 From: americast Date: Wed, 8 Nov 2023 15:36:52 -0500 Subject: [PATCH 15/21] fix metrics logic --- evadb/executor/create_function_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index ef782b225a..0570f10412 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -619,7 +619,7 @@ def get_optuna_config(trial): f = open(model_path, "wb") pickle.dump(model, f) f.close() - if "metrics" in arg_map and arg_map["metrics"].lower()[0] == "f": + if "metrics" not in arg_map or arg_map["metrics"].lower()[0] == "t": crossvalidation_df = model.cross_validation( df=data[["ds", "y", "unique_id"]], h=horizon, From 3f92594a72ac7f24856e4dfbb1b8025ee4395454 Mon Sep 17 00:00:00 2001 From: americast Date: Fri, 10 Nov 2023 01:54:01 -0500 Subject: [PATCH 16/21] restore column name check in binder --- evadb/binder/statement_binder.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/evadb/binder/statement_binder.py b/evadb/binder/statement_binder.py index edc49677a1..c950df9473 100644 --- a/evadb/binder/statement_binder.py +++ b/evadb/binder/statement_binder.py @@ -29,6 +29,7 @@ from evadb.binder.statement_binder_context import StatementBinderContext from evadb.catalog.catalog_type import ColumnType, TableType from evadb.catalog.catalog_utils import is_document_table +from evadb.catalog.sql_config import RESTRICTED_COL_NAMES from evadb.expression.abstract_expression import AbstractExpression, ExpressionType from evadb.expression.function_expression import FunctionExpression from evadb.expression.tuple_value_expression import TupleValueExpression @@ -208,6 +209,12 @@ def _bind_delete_statement(self, node: DeleteTableStatement): @bind.register(CreateTableStatement) def _bind_create_statement(self, node: CreateTableStatement): + # we don't allow certain keywords in the column_names + for col in node.column_list: + assert ( + col.name.lower() not in RESTRICTED_COL_NAMES + ), f"EvaDB does not allow to create a table with column name {col.name}" + if node.query is not None: self.bind(node.query) From 823e3fed8f8e56f5fba0d98a03a983e99016fd95 Mon Sep 17 00:00:00 2001 From: americast Date: Fri, 10 Nov 2023 13:49:59 -0500 Subject: [PATCH 17/21] sklearn converted to numpy --- evadb/executor/create_function_executor.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 6b5fae1de7..773e0b94bc 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -24,7 +24,6 @@ import numpy as np import pandas as pd -from sklearn.metrics import mean_squared_error from evadb.catalog.catalog_utils import get_metadata_properties from evadb.catalog.models.function_catalog import FunctionCatalogEntry @@ -57,6 +56,10 @@ from evadb.utils.logging_manager import logger +def root_mean_squared_error(y_true, y_pred): + return np.sqrt(np.mean(np.square(y_pred - y_true))) + + # From https://stackoverflow.com/a/34333710 @contextlib.contextmanager def set_env(**environ): @@ -602,12 +605,11 @@ def get_optuna_config(trial): crossvalidation_df.unique_id == uid ] rmses.append( - mean_squared_error( + root_mean_squared_error( crossvalidation_df_here.y, crossvalidation_df_here[ arg_map["model"] + "-median" ], - squared=False, ) / np.mean(crossvalidation_df_here.y) ) @@ -643,10 +645,9 @@ def get_optuna_config(trial): crossvalidation_df.unique_id == uid ] rmses.append( - mean_squared_error( + root_mean_squared_error( crossvalidation_df_here.y, crossvalidation_df_here[arg_map["model"]], - squared=False, ) / np.mean(crossvalidation_df_here.y) ) From addd2cfb33a2d523602f55f6efe00b90c8f3665c Mon Sep 17 00:00:00 2001 From: americast Date: Sat, 11 Nov 2023 02:54:02 -0500 Subject: [PATCH 18/21] attempt fix test --- .circleci/config.yml | 2 +- evadb/executor/create_function_executor.py | 6 +----- test/integration_tests/long/test_model_forecasting.py | 3 ++- 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 50c966fb3d..af2c5ea078 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -480,7 +480,7 @@ jobs: - checkout - run: name: Install EvaDB package from GitHub repo and run tests - no_output_timeout: 40m # 40 minute timeout + no_output_timeout: 30m # 30 minute timeout command: | python -m venv test_evadb source test_evadb/bin/activate diff --git a/evadb/executor/create_function_executor.py b/evadb/executor/create_function_executor.py index 773e0b94bc..9488195927 100644 --- a/evadb/executor/create_function_executor.py +++ b/evadb/executor/create_function_executor.py @@ -423,7 +423,6 @@ def handle_forecasting_function(self): AutoNHITS, AutoPatchTST, AutoTFT, - AutoTimesNet, ) # from neuralforecast.auto import AutoAutoformer as AutoAFormer @@ -436,7 +435,6 @@ def handle_forecasting_function(self): FEDformer, Informer, PatchTST, - TimesNet, ) # from neuralforecast.models import Autoformer as AFormer @@ -456,8 +454,6 @@ def handle_forecasting_function(self): # "AutoAFormer": AutoAFormer, "Informer": Informer, "AutoInformer": AutoInformer, - "TimesNet": TimesNet, - "AutoTimesNet": AutoTimesNet, "TFT": TFT, "AutoTFT": AutoTFT, } @@ -546,7 +542,7 @@ def get_optuna_config(trial): raise FunctionIODefinitionError(err_msg) model = StatsForecast( - [model_here(season_length=season_length)], freq=new_freq, n_jobs=-1 + [model_here(season_length=season_length)], freq=new_freq ) data["ds"] = pd.to_datetime(data["ds"]) diff --git a/test/integration_tests/long/test_model_forecasting.py b/test/integration_tests/long/test_model_forecasting.py index 76e5562357..8a99bd90ab 100644 --- a/test/integration_tests/long/test_model_forecasting.py +++ b/test/integration_tests/long/test_model_forecasting.py @@ -79,6 +79,7 @@ def tearDownClass(cls): @forecast_skip_marker def test_forecast(self): + create_predict_udf = """ CREATE FUNCTION AirForecast FROM (SELECT unique_id, ds, y FROM AirData) @@ -131,7 +132,7 @@ def test_forecast(self): "airpanelforecast.y-hi", ], ) - + @forecast_skip_marker def test_forecast_with_column_rename(self): create_predict_udf = """ From 2eb51b2b91cf1be12012cd1aa181ce4a19d38446 Mon Sep 17 00:00:00 2001 From: americast Date: Sat, 11 Nov 2023 14:45:30 -0500 Subject: [PATCH 19/21] remove nf from test --- test/integration_tests/long/test_model_forecasting.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/test/integration_tests/long/test_model_forecasting.py b/test/integration_tests/long/test_model_forecasting.py index 8a99bd90ab..d309700d30 100644 --- a/test/integration_tests/long/test_model_forecasting.py +++ b/test/integration_tests/long/test_model_forecasting.py @@ -79,7 +79,6 @@ def tearDownClass(cls): @forecast_skip_marker def test_forecast(self): - create_predict_udf = """ CREATE FUNCTION AirForecast FROM (SELECT unique_id, ds, y FROM AirData) @@ -111,8 +110,6 @@ def test_forecast(self): TYPE Forecasting HORIZON 12 PREDICT 'y' - LIBRARY 'neuralforecast' - AUTO 'false' FREQUENCY 'M'; """ execute_query_fetch_all(self.evadb, create_predict_udf) @@ -132,7 +129,7 @@ def test_forecast(self): "airpanelforecast.y-hi", ], ) - + @forecast_skip_marker def test_forecast_with_column_rename(self): create_predict_udf = """ From 7955011e4f4cea17a025bd70629a88c59410e2a3 Mon Sep 17 00:00:00 2001 From: Andy Xu Date: Thu, 16 Nov 2023 03:34:08 -0500 Subject: [PATCH 20/21] Skip neuralforecast testcases --- test/integration_tests/long/test_model_forecasting.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/integration_tests/long/test_model_forecasting.py b/test/integration_tests/long/test_model_forecasting.py index d309700d30..c376f7610c 100644 --- a/test/integration_tests/long/test_model_forecasting.py +++ b/test/integration_tests/long/test_model_forecasting.py @@ -104,12 +104,19 @@ def test_forecast(self): ], ) + @pytest.mark.skip( + reason="Neuralforecast intergration test takes too long to complete without GPU." + ) + @forecast_skip_marker + def test_forecast_neuralforecast(self): create_predict_udf = """ CREATE FUNCTION AirPanelForecast FROM (SELECT unique_id, ds, y, trend FROM AirDataPanel) TYPE Forecasting HORIZON 12 PREDICT 'y' + LIBRARY 'neuralforecast' + AUTO 'false' FREQUENCY 'M'; """ execute_query_fetch_all(self.evadb, create_predict_udf) From cfbdf6752dc79b439090d7257f235869a375382f Mon Sep 17 00:00:00 2001 From: Andy Xu Date: Thu, 16 Nov 2023 04:54:51 -0500 Subject: [PATCH 21/21] Fix xgboost --- evadb/binder/statement_binder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/evadb/binder/statement_binder.py b/evadb/binder/statement_binder.py index c950df9473..07e720335d 100644 --- a/evadb/binder/statement_binder.py +++ b/evadb/binder/statement_binder.py @@ -93,7 +93,9 @@ def _bind_create_function_statement(self, node: CreateFunctionStatement): outputs.append(column) else: inputs.append(column) - elif string_comparison_case_insensitive(node.function_type, "sklearn"): + elif string_comparison_case_insensitive( + node.function_type, "sklearn" + ) or string_comparison_case_insensitive(node.function_type, "XGBoost"): assert ( "predict" in arg_map ), f"Creating {node.function_type} functions expects 'predict' metadata."