Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Preprocessing data is missing, raising a KeyError: the columns that AutoML created by itself are not reproduced. #1278

Open
731315163 opened this issue Feb 13, 2024 · 1 comment

Comments

@731315163
Copy link

731315163 commented Feb 13, 2024

= X[self.regressors]
File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/frame.py", line 3899, in __getitem__
indexer = self.columns._get_indexer_strict(key, "columns")[1]
File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 6115, in _get_indexer_strict
self._raise_if_missing(keyarr, indexer, axis_name)
File "/usr/app/regression/venv/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 6179, in _raise_if_missing
raise KeyError(f"{not_found} not in index")
KeyError: "['index_sin1', 'index_sin6', 'index_cos5', 'index_second_cos', 'index_sin2', 'index_second_sin', 'index_cos4', 'index_minute_cos', 'index_dayofweek_sin', 'index_cos1', 'index_month_cos', 'index_dayofyear_sin', 'index_sin3', 'index_sin5', 'index_cos3', 'index_hour_sin', 'index_cos6', 'index_hour_cos', 'index_month_sin', 'index_minute_sin', 'index_sin4', 'index_cos2', 'index_quarter_sin', 'index_quarter_cos', 'index_dayofyear_cos', 'index_dayofweek_cos'] not in index"

import pandas as pd
from flaml.automl import AutoML, logger_formatter
from flaml.tune.searcher import CFO, BlendSearch, FLOW2, BlendSearchTuner
import numpy as np
from libX import PreprocessingData as data
import pickle
import os.path as path

# File the fitted AutoML object is pickled to and loaded from. The path is
# built by the project helper PreprocessingData.JoinCurDir, whose behaviour is
# not shown here (presumably joins the name onto the current directory —
# TODO confirm).
savepath = data.JoinCurDir("automl.pkl")
# Name of the timestamp column.
datename = "DATE"
# Name of the label column handed to AutoML.fit.
openen = "Open"


def train():
    """Fit a FLAML ``ts_forecast`` AutoML on the raw data and pickle it.

    Loads the DATE / WM2NS / UNRATE / Open columns through the project helper,
    parses DATE as datetimes and Open as numbers, moves DATE into the index,
    runs ``AutoML.fit`` and finally pickles the fitted object to ``savepath``.

    Returns:
        The fitted ``AutoML`` instance.
    """

    traindata = data.get_rawdata_df(["DATE", "WM2NS", "UNRATE", "Open"])
    # NOTE(review): reset_index without drop=True keeps the previous index as
    # a column; if that index was unnamed pandas calls the new column "index".
    # The KeyError in this report lists "index_*" features — confirm whether
    # this column is what FLAML picked up as the time column.
    traindata.reset_index(inplace=True)
    traindata[datename] = pd.to_datetime(traindata[datename], format="%Y-%m-%d")
    traindata[openen] = pd.to_numeric(traindata[openen])
    # NOTE(review): asfreq is applied to the DATE column as a Series, whose
    # index is the freshly reset one rather than the dates — confirm this
    # does what is intended.
    traindata[datename] = traindata[datename].asfreq("D")
    # From here on DATE is the index and no longer an ordinary column.
    # NOTE(review): FLAML's documentation for ts_forecast describes the
    # timestamp as the first *column* of the dataframe — confirm that an
    # index is accepted.
    traindata.set_index(keys=datename, inplace=True)

    print(traindata.head(3))
    # print(trainx[0:2])
    # print(trainy[0:2])
    automl = AutoML()
    automl_settings = {
        "task": "ts_forecast",
        "time_budget": 60 * 10,  # 600; FLAML documents time_budget in seconds
        # "estimator_list": ["prophet", "arima", "sarimax"],
        "log_file_name": "ts_forecast.log",
        "period": 14,  # forecast horizon
    }

    automl.fit(
        dataframe=traindata,  # training frame, indexed by DATE at this point
        label=openen,  # label column ("Open")
        # the forecast horizon is supplied through "period" in automl_settings

        # split_type="time",

        ensemble=True,

        early_stop=True,

        # skip_transform=True,
        **automl_settings
    )
    # NOTE(review): the best config and the pickle below are written to the
    # same savepath. The pickle is written second with mode "wb", which
    # truncates the file, so whatever save_best_config wrote is replaced.
    automl.save_best_config(savepath)
    with open(savepath, "wb") as f:
        pickle.dump(automl, f, pickle.HIGHEST_PROTOCOL)
    return automl


def findmodel():
    """Return the AutoML pickled at ``savepath``, training one if absent.

    When no file exists at ``savepath`` the result of ``train()`` is returned
    instead; otherwise the object is unpickled from that file.
    """
    if not path.exists(savepath):
        # Nothing has been saved yet, so fit a fresh model.
        return train()

    # At prediction time
    with open(savepath, "rb") as fh:
        model = pickle.load(fh)
    return model


# Hand-built frame used as prediction input: DATE plus the WM2NS and UNRATE
# columns, without the "Open" label column.
datetimetest = pd.DataFrame(
    {
        # NOTE(review): plain integers here, whereas train() parses DATE with
        # pd.to_datetime — confirm this is the intended prediction input.
        datename: [1, 2, 3],
        "WM2NS": [20966.057142857142, 20963.77142857143, 20961.485714285714],
        "UNRATE": [3.7, 3.7, 3.7],
    }
)
# Loads the pickled model when one exists at savepath, otherwise trains one.
automl = findmodel()
# NOTE(review): set_index is called without inplace=True and its return value
# is discarded, so datetimetest still has DATE as an ordinary column when it
# is passed to predict — unlike the training frame, where DATE is the index.
datetimetest.set_index(datename)
pred = automl.predict(datetimetest)
print(pred)
@thinkall
Copy link
Collaborator

Hi @731315163, the error message is related to pandas and the data itself. Can you check that the index handling works without involving flaml?
Besides, you're using the same savepath for both the best config and the automl instance.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants