iterative · mike0sv · Oct 8, 2021 · Sep 30, 2021 · Sep 30, 2021 · Oct 1, 2021
diff --git a/mlem/config.py b/mlem/config.py
@@ -34,6 +34,8 @@ class MlemConfig(BaseSettings):
     )
     AUTOLOAD_EXTS: bool = True
     DEFAULT_BRANCH: str = "main"
+    LOG_LEVEL: str = "INFO"
+    DEBUG: bool = False
 
     @property
     def ADDITIONAL_EXTENSIONS(self) -> List[str]:

diff --git a/mlem/constants.py b/mlem/constants.py
@@ -1 +1,5 @@
 MLEM_DIR = ".mlem"
+
+PREDICT_METHOD_NAME = "predict"
+PREDICT_PROBA_METHOD_NAME = "predict_proba"
+PREDICT_ARG_NAME = "data"
diff --git a/mlem/contrib/catboost.py b/mlem/contrib/catboost.py
@@ -1,15 +1,14 @@
 import os
 import tempfile
-from typing import Any, ClassVar
+from typing import Any, ClassVar, Optional
 
 import catboost
-from catboost import CatBoostClassifier, CatBoostRegressor
+from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor
 from fsspec import AbstractFileSystem
 
 from mlem.core.artifacts import Artifacts
-from mlem.core.dataset_type import UnspecifiedDatasetType
-from mlem.core.model import Argument, ModelHook, ModelIO, ModelType, Signature
-from mlem.core.requirements import LibRequirementsMixin
+from mlem.core.model import ModelHook, ModelIO, ModelType, Signature
+from mlem.core.requirements import InstallableRequirement, Requirements
 
 
 class CatBoostModelIO(ModelIO):
@@ -52,36 +51,62 @@ def _get_model_file_name(self, model):
         return self.regressor_file_name
 
 
-class CatBoostModel(ModelType, ModelHook, LibRequirementsMixin):
+class CatBoostModel(ModelType, ModelHook):
     """
     :class:`mlem.core.model.ModelType` for CatBoost models.
     `.model` attribute is a `catboost.CatBoostClassifier` or `catboost.CatBoostRegressor` instance
     """
 
-    libraries: ClassVar = [catboost]
     type: ClassVar[str] = "catboost"
     io: ModelIO = CatBoostModelIO()
+    model: ClassVar[Optional[CatBoost]]
 
     @classmethod
     def is_object_valid(cls, obj: Any) -> bool:
         return isinstance(obj, (CatBoostClassifier, CatBoostRegressor))
 
     @classmethod
-    def process(cls, obj: Any, **kwargs) -> ModelType:
+    def process(
+        cls, obj: Any, sample_data: Optional[Any] = None, **kwargs
+    ) -> ModelType:
+        model = CatBoostModel(model=obj, methods={})
         methods = {
-            "predict": Signature(
-                name="predict",
-                args=[
-                    Argument(key="data", type=UnspecifiedDatasetType())
-                ],  # TODO: https://github.com/iterative/mlem/issues/21
-                returns=UnspecifiedDatasetType(),
-            )
+            "predict": Signature.from_method(
+                model.predict,
+                auto_infer=sample_data is not None,
+                data=sample_data,
+            ),
+            "catboost_predict": Signature.from_method(
+                obj.predict,
+                auto_infer=sample_data is not None,
+                data=sample_data,
+            ),
         }
         if isinstance(obj, CatBoostClassifier):
-            methods["predict_proba"] = Signature(
-                name="predict_proba",
-                args=[Argument(key="data", type=UnspecifiedDatasetType())],
-                # TODO: https://github.com/iterative/mlem/issues/21
-                returns=UnspecifiedDatasetType(),
+            methods["predict_proba"] = Signature.from_method(
+                model.predict_proba,
+                auto_infer=sample_data is not None,
+                data=sample_data,
+            )
+            methods["catboost_predict_proba"] = Signature.from_method(
+                obj.predict_proba,
+                auto_infer=sample_data is not None,
+                X=sample_data,
             )
-        return CatBoostModel(model=obj, methods=methods)
+        model.methods = methods
+        return model
+
+    def predict(self, data):
+        return self.model.predict(data)
+
+    def predict_proba(self, data):
+        if not isinstance(self.model, CatBoostClassifier):
+            raise ValueError(
+                "Not valid type of model for predict_proba method"
+            )
+        return self.model.predict_proba(data)
+
+    def get_requirements(self) -> Requirements:
+        return super().get_requirements() + InstallableRequirement.from_module(
+            catboost
+        )
diff --git a/mlem/contrib/fastapi.py b/mlem/contrib/fastapi.py
@@ -32,7 +32,7 @@ def _create_handler(
         cls, method_name: str, signature: Signature, executor: Callable
     ):
         serializers = {
-            arg.key: arg.type.get_serializer() for arg in signature.args
+            arg.name: arg.type_.get_serializer() for arg in signature.args
         }
         kwargs = {
             key: (serializer.get_model(), ...)
@@ -47,8 +47,8 @@ def _create_handler(
 
         def handler(model: payload_model):  # type: ignore[valid-type]
             kwargs = {
-                a.key: serializers[a.key].deserialize(
-                    getattr(model, a.key).dict()
+                a.name: serializers[a.name].deserialize(
+                    getattr(model, a.name).dict()
                 )
                 for a in signature.args
             }

diff --git a/mlem/contrib/lightgbm.py b/mlem/contrib/lightgbm.py
@@ -1,21 +1,21 @@
 import os
 import tempfile
-from typing import Any, ClassVar
+from typing import Any, ClassVar, Optional
 
 import lightgbm as lgb
 from fsspec import AbstractFileSystem
 
+from mlem.constants import PREDICT_METHOD_NAME
 from mlem.core.artifacts import Artifacts
 from mlem.core.dataset_type import (
     DatasetAnalyzer,
     DatasetHook,
     DatasetType,
     DatasetWriter,
-    UnspecifiedDatasetType,
 )
 from mlem.core.errors import DeserializationError, SerializationError
 from mlem.core.hooks import IsInstanceHookMixin
-from mlem.core.model import Argument, ModelHook, ModelIO, ModelType, Signature
+from mlem.core.model import ModelHook, ModelIO, ModelType, Signature
 from mlem.core.requirements import (
     InstallableRequirement,
     Requirements,
@@ -100,25 +100,50 @@ class LightGBMModel(ModelType, ModelHook, IsInstanceHookMixin):
     io: ModelIO = LightGBMModelIO()
 
     @classmethod
-    def process(cls, obj: Any, **kwargs) -> ModelType:
-        return LightGBMModel(
-            model=obj,
-            methods={
-                "predict": Signature(
-                    name="_predict",
-                    args=[Argument(key="data", type=UnspecifiedDatasetType())],
-                    returns=UnspecifiedDatasetType(),  # TODO: https://github.com/iterative/mlem/issues/21
-                )
-            },
-        )
-
-    def _predict(self, data):
+    def process(
+        cls, obj: Any, sample_data: Optional[Any] = None, **kwargs
+    ) -> ModelType:
+        """Wrap a LightGBM model and describe its callable methods.
+
+        :param obj: model object to wrap; passed as ``model=`` to
+            ``LightGBMModel``
+        :param sample_data: optional example input; when given, signatures
+            are built with ``auto_infer`` enabled
+        :param kwargs: accepted and ignored here
+        :return: ``LightGBMModel`` with ``predict`` and
+            ``lightgbm_predict`` entries in ``methods``
+        """
+        gbm_model = LightGBMModel(model=obj, methods={})
+        # The raw ``obj.predict`` gets none of the ``lgb.Dataset`` unwrapping
+        # that ``LightGBMModel.predict`` performs, so unwrap the sample here.
+        raw_sample = sample_data
+        if isinstance(raw_sample, lgb.Dataset):
+            raw_sample = raw_sample.data
+        gbm_model.methods = {
+            PREDICT_METHOD_NAME: Signature.from_method(
+                gbm_model.predict,
+                auto_infer=sample_data is not None,
+                data=sample_data,
+            ),
+            # Request inference only when a sample exists to infer from.
+            # NOTE(review): assumes ``auto_infer`` uses the passed sample.
+            "lightgbm_predict": Signature.from_method(
+                obj.predict,
+                auto_infer=raw_sample is not None,
+                data=raw_sample,
+            ),
+        }
+        return gbm_model
+
+    def predict(self, data):
+        """Run the wrapped model, unwrapping ``lgb.Dataset`` input first."""
         if isinstance(data, lgb.Dataset):
             data = data.data
         return self.model.predict(data)
 
     def get_requirements(self) -> Requirements:
         return (
-            Requirements.new(InstallableRequirement.from_module(mod=lgb))
+            super().get_requirements()
+            + InstallableRequirement.from_module(mod=lgb)
             + LGB_REQUIREMENT
         )
diff --git a/mlem/contrib/sklearn.py b/mlem/contrib/sklearn.py
@@ -1,11 +1,14 @@
-from typing import Any, ClassVar, Dict
+from typing import Any, ClassVar, Optional
 
 import sklearn
 from sklearn.base import ClassifierMixin, RegressorMixin
 
-from mlem.core.dataset_type import DatasetAnalyzer, UnspecifiedDatasetType
+from mlem.constants import (
+    PREDICT_ARG_NAME,
+    PREDICT_METHOD_NAME,
+    PREDICT_PROBA_METHOD_NAME,
+)
 from mlem.core.model import (
-    Argument,
     ModelHook,
     ModelIO,
     ModelType,
@@ -29,37 +32,28 @@ def is_object_valid(cls, obj: Any) -> bool:
         return isinstance(obj, (RegressorMixin, ClassifierMixin))
 
     @classmethod
-    def process(cls, obj: Any, **kwargs) -> "SklearnModel":
-        test_data = kwargs.get("test_data")
-        method_names = ["predict"]
-        if isinstance(obj, ClassifierMixin):
-            method_names.append("predict_proba")
-        if test_data is None:
-
-            methods: Dict[str, Signature] = {
-                m: Signature(
-                    name=m,
-                    args=[Argument(key="X", type=UnspecifiedDatasetType())],
-                    returns=UnspecifiedDatasetType(),
-                )
-                for m in method_names
-            }
-
-        else:
-            methods = {
-                m: Signature(
-                    name=m,
-                    args=[
-                        Argument(
-                            key="X", type=DatasetAnalyzer.analyze(test_data)
-                        )
-                    ],
-                    returns=DatasetAnalyzer.analyze(
-                        getattr(obj, m)(test_data)
-                    ),
-                )
-                for m in method_names
-            }
+    def process(
+        cls, obj: Any, sample_data: Optional[Any] = None, **kwargs
+    ) -> ModelType:
+        sklearn_predict = Signature.from_method(
+            obj.predict, sample_data is not None, X=sample_data
+        )
+        predict = sklearn_predict.copy()
+        predict.args = [predict.args[0].copy()]
+        predict.args[0].name = PREDICT_ARG_NAME
+        methods = {
+            "sklearn_predict": sklearn_predict,
+            PREDICT_METHOD_NAME: predict,
+        }
+        if hasattr(obj, "predict_proba"):
+            sklearn_predict_proba = Signature.from_method(
+                obj.predict_proba, sample_data is not None, X=sample_data
+            )
+            predict_proba = sklearn_predict_proba.copy()
+            predict_proba.args = [predict_proba.args[0].copy()]
+            predict_proba.args[0].name = PREDICT_ARG_NAME
+            methods["sklearn_predict_proba"] = sklearn_predict_proba
+            methods[PREDICT_PROBA_METHOD_NAME] = predict_proba
 
         return SklearnModel(io=SimplePickleIO(), methods=methods).bind(obj)
 

diff --git a/mlem/contrib/xgboost.py b/mlem/contrib/xgboost.py
@@ -5,17 +5,13 @@
 import xgboost
 from fsspec import AbstractFileSystem
 
+from mlem.constants import PREDICT_METHOD_NAME
 from mlem.contrib.numpy import python_type_from_np_string_repr
 from mlem.core.artifacts import Artifacts
-from mlem.core.dataset_type import (
-    DatasetHook,
-    DatasetType,
-    DatasetWriter,
-    UnspecifiedDatasetType,
-)
+from mlem.core.dataset_type import DatasetHook, DatasetType, DatasetWriter
 from mlem.core.errors import DeserializationError, SerializationError
 from mlem.core.hooks import IsInstanceHookMixin
-from mlem.core.model import Argument, ModelHook, ModelIO, ModelType, Signature
+from mlem.core.model import ModelHook, ModelIO, ModelType, Signature
 from mlem.core.requirements import (
     InstallableRequirement,
     Requirements,
@@ -135,9 +131,7 @@ def load(self, fs: AbstractFileSystem, path):
         return model
 
 
-class XGBoostModel(
-    XGBoostRequirement, ModelType, ModelHook, IsInstanceHookMixin
-):
+class XGBoostModel(ModelType, ModelHook, IsInstanceHookMixin):
     """
     :class:`~.ModelType` implementation for XGBoost models
     """
@@ -148,17 +142,53 @@ class XGBoostModel(
     io: ModelIO = XGBoostModelIO()
 
     @classmethod
-    def process(cls, obj: Any, **kwargs) -> ModelType:
+    def process(
+        cls, obj: Any, sample_data: Optional[Any] = None, **kwargs
+    ) -> ModelType:
+        """Wrap an XGBoost model and describe its callable methods.
+
+        :param obj: model object to wrap; passed as ``model=`` to
+            ``XGBoostModel``
+        :param sample_data: optional example input; when given, signatures
+            are built with ``auto_infer`` enabled
+        :param kwargs: accepted and ignored here
+        :return: ``XGBoostModel`` with ``predict`` and ``xgboost_predict``
+            entries in ``methods``
+        """
+        model = XGBoostModel(model=obj, methods={})
+        # The raw ``obj.predict`` gets none of the ``DMatrix`` conversion that
+        # ``XGBoostModel.predict`` performs, so convert the sample here.
+        raw_sample = sample_data
+        if raw_sample is not None and not isinstance(
+            raw_sample, xgboost.DMatrix
+        ):
+            raw_sample = xgboost.DMatrix(raw_sample)
         methods = {
-            "predict": Signature(
-                name="_predict",
-                args=[Argument(key="data", type=UnspecifiedDatasetType())],
-                returns=UnspecifiedDatasetType(),  # TODO: https://github.com/iterative/mlem/issues/21
-            )
+            PREDICT_METHOD_NAME: Signature.from_method(
+                model.predict,
+                auto_infer=sample_data is not None,
+                data=sample_data,
+            ),
+            # Request inference only when a sample exists to infer from.
+            # NOTE(review): assumes ``auto_infer`` uses the passed sample.
+            "xgboost_predict": Signature.from_method(
+                obj.predict,
+                auto_infer=raw_sample is not None,
+                data=raw_sample,
+            ),
         }
-        return XGBoostModel(model=obj, methods=methods)
+        model.methods = methods
+        return model
 
-    def _predict(self, data):
+    def predict(self, data):
+        """Predict, converting input to ``DMatrix`` when needed."""
         if not isinstance(data, xgboost.DMatrix):
             data = xgboost.DMatrix(data)
         return self.model.predict(data)
+
+    def get_requirements(self) -> Requirements:
+        return (
+            super().get_requirements()
+            + InstallableRequirement.from_module(xgboost)
+            + XGB_REQUIREMENT
+        )
diff --git a/mlem/core/dataset_type.py b/mlem/core/dataset_type.py
@@ -84,7 +84,7 @@ class PrimitiveType(DatasetType, DatasetHook):
     DatasetType for int, str, bool, complex and float types
     """
 
-    PRIMITIVES: ClassVar[set] = {int, str, bool, complex, float}
+    PRIMITIVES: ClassVar[set] = {int, str, bool, complex, float, type(None)}
     type: ClassVar[str] = "primitive"
 
     ptype: str
@@ -104,9 +104,6 @@ def to_type(self):
     def deserialize(self, obj):
         return self.to_type(obj)
 
-    # def get_spec(self) -> ArgList:
-    #     return [Field(None, self.to_type, False)]
-
     def serialize(self, instance):
         self.check_type(instance, self.to_type, ValueError)
         return instance

diff --git a/mlem/core/hooks.py b/mlem/core/hooks.py
@@ -190,6 +190,6 @@ def _find_hook(cls, obj) -> Type[Hook[T]]:
                 )
             raise ValueError(
                 f"No suitable {cls.base_hook_class.__name__} for object of type "
-                f"[{type(obj).__name__}]. Registered hooks: {cls.hooks}"
+                f'"{type(obj).__name__}". Registered hooks: {cls.hooks}'
             )
         return max(hooks, key=lambda x: x[0])[1]
diff --git a/mlem/core/meta_io.py b/mlem/core/meta_io.py
@@ -96,7 +96,7 @@ def serialize(
 ):  # pylint: disable=unused-argument # todo remove later
     if not isinstance(obj, MlemObject):
         raise ValueError(f"{type(obj)} is not a subclass of MlemObject")
-    return obj.dict(exclude_unset=True)
+    return obj.dict(exclude_unset=True, exclude_defaults=True)
 
 
 T = TypeVar("T")

diff --git a/mlem/core/metadata.py b/mlem/core/metadata.py
@@ -19,7 +19,7 @@ def get_object_metadata(obj: Any, tmp_sample_data=None) -> MlemMeta:
     try:
         return DatasetMeta.from_data(obj)
     except ValueError:  # TODO need separate analysis exception
-        return ModelMeta.from_obj(obj, test_data=tmp_sample_data)
+        return ModelMeta.from_obj(obj, sample_data=tmp_sample_data)
 
 
 def save(