From 3fa1f55b21bad243298abf1f40173c4f07efdefb Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 2 Apr 2019 13:31:23 +0200 Subject: [PATCH 01/44] =?UTF-8?q?Bump=20version:=200.1.1=20=E2=86=92=200.1?= =?UTF-8?q?.2-dev?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- atm/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/atm/__init__.py b/atm/__init__.py index 6d9914a..1e5023a 100644 --- a/atm/__init__.py +++ b/atm/__init__.py @@ -12,7 +12,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.1.1' +__version__ = '0.1.2-dev' # this defines which modules will be imported by "from atm import *" __all__ = ['config', 'classifier', 'constants', 'database', 'enter_data', diff --git a/setup.cfg b/setup.cfg index 6b17b69..ab06c92 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.1 +current_version = 0.1.2-dev commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? diff --git a/setup.py b/setup.py index d3ae04d..c5a2939 100644 --- a/setup.py +++ b/setup.py @@ -102,6 +102,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/HDI-project/ATM', - version='0.1.1', + version='0.1.2-dev', zip_safe=False, ) From e44f8b6e96252e8bc8a5809a721d490255ed9be3 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 11 Apr 2019 12:22:11 +0200 Subject: [PATCH 02/44] Flask-restless-swagger over ATM --- atm/api.py | 32 ++++++++++++++++++++++++++++++++ atm/cli.py | 11 +++++++++++ setup.py | 11 ++++++++++- 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 atm/api.py diff --git a/atm/api.py b/atm/api.py new file mode 100644 index 0000000..3dfea9a --- /dev/null +++ b/atm/api.py @@ -0,0 +1,32 @@ +import os + +from flask import Flask +from flask_restless_swagger import SwagAPIManager as APIManager +from flask_sqlalchemy import SQLAlchemy + + +def make_absolute(url): + if str(url).startswith('sqlite:///'): + url = 'sqlite:///' + os.path.abspath(url.database) + + return url + + +def create_app(atm): + app = Flask(__name__) + app.config['DEBUG'] = True + app.config['SQLALCHEMY_DATABASE_URI'] = make_absolute(atm.db.engine.url) + db = SQLAlchemy(app) + + # Create the Flask-Restless API manager. + manager = APIManager(app, flask_sqlalchemy_db=db) + + # Create API endpoints, which will be available at /api/ by + # default. Allowed HTTP methods can be specified as well. 
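+    # Each call below exposes one ModelHub table as a collection, e.g.
+    # GET /api/datasets lists all the datasets and GET /api/datasets/1
+    # returns the dataset with id 1.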
+ + manager.create_api(atm.db.Dataset, methods=['GET', 'POST', 'DELETE']) + manager.create_api(atm.db.Datarun, methods=['GET', 'POST', 'DELETE']) + manager.create_api(atm.db.Hyperpartition, methods=['GET', 'POST', 'DELETE']) + manager.create_api(atm.db.Classifier, methods=['GET', 'POST', 'DELETE']) + + return app diff --git a/atm/cli.py b/atm/cli.py index 76e739b..dd560cd 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -5,6 +5,7 @@ import os import shutil +from atm.api import create_app from atm.config import ( add_arguments_aws_s3, add_arguments_datarun, add_arguments_logging, add_arguments_sql) from atm.models import ATM @@ -26,6 +27,12 @@ def _work(args): ) +def _serve(args): + atm = ATM(**vars(args)) + app = create_app(atm) + app.run() + + def _enter_data(args): atm = ATM(**vars(args)) atm.enter_data() @@ -83,6 +90,10 @@ def _get_parser(): action='store_const', const=False, help="don't save models and metrics at all") + # Server + server = subparsers.add_parser('server', parents=[parent]) + server.set_defaults(action=_serve) + # Make Config make_config = subparsers.add_parser('make_config', parents=[parent]) make_config.set_defaults(action=_make_config) diff --git a/setup.py b/setup.py index c5a2939..ef7929d 100644 --- a/setup.py +++ b/setup.py @@ -25,6 +25,14 @@ 'sqlalchemy>=1.1.14', ] +api_requires = [ + 'flask>=1.0.2', + 'flask-restless>=0.17.0', + 'flask-sqlalchemy>=2.3.2', + 'flask-restless-swagger-2>=0.0.3', + 'simplejson>=3.16.0', +] + setup_requires = [ 'pytest-runner' ] @@ -87,7 +95,8 @@ ] }, extras_require={ - 'dev': development_requires + tests_require, + 'api': api_requires, + 'dev': api_requires + development_requires + tests_require, 'tests': tests_require, }, include_package_data=True, From be91cfea050b22708f03cb417f90b86c20983716 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 11 Apr 2019 15:05:44 +0200 Subject: [PATCH 03/44] Add host and port options --- atm/cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/atm/cli.py b/atm/cli.py index dd560cd..6b33b3c 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -30,7 +30,7 @@ def _work(args): def _serve(args): atm = ATM(**vars(args)) app = create_app(atm) - app.run() + app.run(host=args.host, port=args.port) def _enter_data(args): @@ -93,6 +93,8 @@ def _get_parser(): # Server server = subparsers.add_parser('server', parents=[parent]) server.set_defaults(action=_serve) + server.add_argument('--host', help='IP to listen at') + server.add_argument('--port', help='Port to listen at', type=int) # Make Config make_config = subparsers.add_parser('make_config', parents=[parent]) From 1a69ab59a7f30d88b7f5baad3f0b9f3ba1eae554 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 11 Apr 2019 19:38:10 +0200 Subject: [PATCH 04/44] WIP --- API.md | 389 +++++++++++++++++++++++++++++++++++++++++++++++++++++ atm/api.py | 5 + 2 files changed, 394 insertions(+) create mode 100644 API.md diff --git a/API.md b/API.md new file mode 100644 index 0000000..dc7c12b --- /dev/null +++ b/API.md @@ -0,0 +1,389 @@ +# REST API + +**ATM** comes with the possibility to start a server process that enables interacting with +it via a REST API server that runs over [flask](http://flask.pocoo.org/). + +In this document you will find a briefly explanation how to start it and use it. 
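
The server relies on a few additional dependencies (`flask`, `flask-restless`, `flask-sqlalchemy`,
`flask-restless-swagger-2` and `simplejson`), which are grouped under the `api` extra in `setup.py`.
If they are not installed yet, one way to get them, assuming a local clone of the repository, is:

```bash
pip install -e .[api]
```
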
+ +## Starting the REST API Server + +In order to start a REST API server, after installing ATM open a terminal, activate its +virtualenv, and execute this command: + +```bash +atm server +``` + +If you would like to start the server in another port, which by default it's 5000, you can include +the `--port` option to run it at the port that you would like: + +```bash +atm server --port 1234 +``` + +An output similar to this one should apear in the terminal: + +```bash + * Serving Flask app "api.setup" (lazy loading) + * Environment: production + WARNING: Do not use the development server in a production environment. + Use a production WSGI server instead. + * Debug mode: on + * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit) + * Restarting with stat + * Debugger is active! + * Debugger PIN: 150-127-826 +``` + +If you now point your browser at http://127.0.0.1:5000/, you will see the documentation +website that shows information about all the REST operations allowed by the API. + +You can press Ctrl+c at any moment to stop the process, but for now +you can keep it running and head to the next section. + + +## Usage + +For this example we have run `atm enter_data` with the default dataset and `atm worker` in order +to create the classifiers and to populate our database. + +By accessing the http://127.0.0.1:5000/ you will see the [Swagger](https://swagger.io/) +documentation and be able to run examples and calls to the REST API. + +In the following steps we will explain how to use this **API**: + +**ATM** REST API allows you to navigate arround the database and have access the following tables: + +* Dataset +* Datarun +* Hyperpartition +* Classifier + +### Dataset + +In order to retrieve the information stored for a `Dataset`, the available parameters to create +an API call are as follow: + +* class_column (string, optional) +* d_features (integer, optional) +* dataruns (Array[Datarun], optional) +* description (string, optional) +* id (integer, optional) +* k_classes (integer, optional) +* majority (number, optional) +* n_examples (integer, optional) +* name (string, optional) +* size_kb (integer, optional) +* test_path (string, optional) +* train_path (string, optional) + +If you are using `Unix` and you have [CURL](https://curl.haxx.se/) you can run this commands in +a separate terminal, otherwise you can access and visualize the data recived from your browser. 
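
The responses come back as JSON, so if you prefer nicely indented output in the terminal they can
be piped through any JSON formatter; as a small convenience sketch, `python -m json.tool` (bundled
with Python) is enough:

```bash
curl -s 'http://127.0.0.1:5000/api/dataruns' | python -m json.tool
```

For example, to list all the datasets stored in the database you can run: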
+ +```bash +curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/datasets' +``` + +This should print an output out of the database similar to this one: + +``` +{ + "num_results": 1, + "objects": [ + { + "class_column": "class", + "d_features": 16, + "dataruns": [ + { + "budget": 100, + "budget_type": "classifier", + "dataset_id": 1, + "deadline": null, + "description": "uniform__uniform", + "end_time": "2019-04-11T17:26:28.781095", + "gridding": 0, + "id": 1, + "k_window": 3, + "metric": "f1", + "priority": 1, + "r_minimum": 2, + "score_target": "cv_judgment_metric", + "selector": "uniform", + "start_time": "2019-04-11T17:25:57.192200", + "status": "complete", + "tuner": "uniform" + } + ], + "description": null, + "id": 1, + "k_classes": 2, + "majority": 0.516666667, + "n_examples": 60, + "name": "pollution_1", + "size_kb": 8, + "test_path": null, + "train_path": "/test/pollution_1.csv" + } + ], + "page": 1, + "total_pages": 1 +} + +``` + +If you would like to recover a certain dataset, we can do so by `id`: + +```bash +curl -X GET "http://127.0.0.1:5000/api/datasets/10" -H "accept: application/json" +``` + +Where `10` is the `id` of our dataset. +If you have the database created from our example, containing only one dataset, the output to this +call should be empty: + +```bash +{} +``` + +If you would like to delete a dataset, you need it's `id` and run: + +```bash +curl -X DELETE --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/datasets/16' +``` + +Where `16` is the `id` of the dataset. + +### Datarun + +* budget (integer, optional), +* budget_type (string, optional), +* classifiers (Array[Classifier], optional), +* dataset (Dataset, optional), +* dataset_id (integer, optional), +* deadline (string, optional), +* description (string, optional), +* end_time (string, optional), +* gridding (integer, optional), +* hyperpartitions (Array[Hyperpartition], optional), +* id (integer, optional), +* k_window (integer, optional), +* metric (string, optional), +* priority (integer, optional), +* r_minimum (integer, optional), +* score_target (string, optional), +* selector (string, optional), +* start_time (string, optional), +* status (string, optional), +* tuner (string, optional) + +```bash +curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/dataruns' +``` + +This should print an output out of the database similar to this one: + +``` +{ + "num_results": 1, + "objects": [ + { + "budget": 100, + "budget_type": "classifier", + "classifiers": [ + { + "cv_judgment_metric": 0.7120634921, + "cv_judgment_metric_stdev": 0.1153100042, + "datarun_id": 1, + "end_time": "2019-04-11T17:25:57.412273", + "error_message": null, + "host": "83.56.245.36", + "hyperparameter_values_64": "gAN9cQAoWAsAAABuX25laWdoYm9yc3EBY251bXB5LmNvcmUubXVsdGlhcnJheQpzY2FsYXIKcQJjbnVtcHkKZHR5cGUKcQNYAgAAAGk4cQRLAEsBh3EFUnEGKEsDWAEAAAA8cQdOTk5K/////0r/////SwB0cQhiQwgSAAAAAAAAAHEJhnEKUnELWAcAAAB3ZWlnaHRzcQxYCAAAAGRpc3RhbmNlcQ1YCQAAAGFsZ29yaXRobXEOWAUAAABicnV0ZXEPWAYAAABtZXRyaWNxEFgJAAAAbWFuaGF0dGFucRFYBgAAAF9zY2FsZXESiHUu", + "hyperpartition_id": 31, + "id": 1, + "metrics_location": "metrics/pollution_1-fd916442.metric", + "model_location": "models/pollution_1-fd916442.model", + "start_time": "2019-04-11T17:25:57.273278", + "status": "complete", + "test_judgment_metric": 0.9523809524 + }, +... 
+``` + +If you would like to recover a certain datarun, we can do so by `id`: + +```bash +curl -X GET "http://127.0.0.1:5000/api/dataruns/10" -H "accept: application/json" +``` + +Where `10` is the `id` of our dataset. +If you have the database created from our example, containing only one dataset, the output to this +call should be empty: + +```bash +{} +``` + +If you would like to delete a datarun, you need it's `id` and run: + +```bash +curl -X DELETE --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/dataruns/16' +``` + +Where `16` is the `id` of the datarun. + +### Hyperpartition + +* categorical_hyperparameters_64 (string, optional), +* classifiers (Array[Classifier], optional), +* constant_hyperparameters_64 (string, optional), +* datarun (Datarun, optional), +* datarun_id (integer, optional), +* id (integer, optional), +* method (string, optional), +* status (string, optional), +* tunable_hyperparameters_64 (string, optional) + +```bash +curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/hyperpartitions' +``` + +This should print an output out of the database similar to this one: + +``` +{ + "num_results": 32, + "objects": [ + { + "categorical_hyperparameters_64": "gANdcQAoWAcAAABwZW5hbHR5cQFYAgAAAGwxcQKGcQNYDQAAAGZpdF9pbnRlcmNlcHRxBIiGcQVlLg==", + "classifiers": [ + { + "cv_judgment_metric": 0E-10, + "cv_judgment_metric_stdev": 0E-10, + "datarun_id": 1, + "end_time": "2019-04-11T17:25:58.591654", + "error_message": null, + "host": "83.56.245.36", + "hyperparameter_values_64": "gAN9cQAoWAEAAABDcQFjbnVtcHkuY29yZS5tdWx0aWFycmF5CnNjYWxhcgpxAmNudW1weQpkdHlwZQpxA1gCAAAAZjhxBEsASwGHcQVScQYoSwNYAQAAADxxB05OTkr/////Sv////9LAHRxCGJDCJx3VDODxC8/cQmGcQpScQtYAwAAAHRvbHEMaAJoBkMIFQYn8/JBj0BxDYZxDlJxD1gHAAAAcGVuYWx0eXEQWAIAAABsMXERWA0AAABmaXRfaW50ZXJjZXB0cRKIWAwAAABjbGFzc193ZWlnaHRxE1gIAAAAYmFsYW5jZWRxFFgGAAAAX3NjYWxlcRWIdS4=", + "hyperpartition_id": 1, + "id": 7, + "metrics_location": "metrics/pollution_1-b2ac0bd8.metric", + "model_location": "models/pollution_1-b2ac0bd8.model", + "start_time": "2019-04-11T17:25:58.476363", + "status": "complete", + "test_judgment_metric": 0E-10 + }, +... +``` + +If you would like to recover a certain hyperpartition, we can do so by `id`: + +```bash +curl -X GET "http://127.0.0.1:5000/api/hyperpartition/10" -H "accept: application/json" +``` + +Where `10` is the `id` of our hyperpartition. + +If you have the database created from our example, containing only one dataset, the output to this +call should be empty: + +```bash +{} +``` + +If you would like to delete a hyperpartition, you need it's `id` and run: + +```bash +curl -X DELETE --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/hyperpartitions/16' +``` + +Where `16` is the `id` of the hyperpartition. 
+ +### Classifier + +* cv_judgment_metric (number, optional), +* cv_judgment_metric_stdev (number, optional), +* datarun (Datarun, optional), +* datarun_id (integer, optional), +* end_time (string, optional), +* error_message (string, optional), +* host (string, optional), +* hyperparameter_values_64 (string, optional), +* hyperpartition (Hyperpartition, optional), +* hyperpartition_id (integer, optional), +* id (integer, optional), +* metrics_location (string, optional), +* model_location (string, optional), +* start_time (string, optional), +* status (string, optional), +* test_judgment_metric (number, optional) + +```bash +curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/classifiers' +``` + +This should print an output out of the database similar to this one: + +``` +{ + "num_results": 100, + "objects": [ + { + "cv_judgment_metric": 0.7120634921, + "cv_judgment_metric_stdev": 0.1153100042, + "datarun": { + "budget": 100, + "budget_type": "classifier", + "dataset_id": 1, + "deadline": null, + "description": "uniform__uniform", + "end_time": "2019-04-11T17:26:28.781095", + "gridding": 0, + "id": 1, + "k_window": 3, + "metric": "f1", + "priority": 1, + "r_minimum": 2, + "score_target": "cv_judgment_metric", + "selector": "uniform", + "start_time": "2019-04-11T17:25:57.192200", + "status": "complete", + "tuner": "uniform" + }, + "datarun_id": 1, + "end_time": "2019-04-11T17:25:57.412273", + "error_message": null, + "host": "83.56.245.36", + "hyperparameter_values_64": "gAN9cQAoWAsAAABuX25laWdoYm9yc3EBY251bXB5LmNvcmUubXVsdGlhcnJheQpzY2FsYXIKcQJjbnVtcHkKZHR5cGUKcQNYAgAAAGk4cQRLAEsBh3EFUnEGKEsDWAEAAAA8cQdOTk5K/////0r/////SwB0cQhiQwgSAAAAAAAAAHEJhnEKUnELWAcAAAB3ZWlnaHRzcQxYCAAAAGRpc3RhbmNlcQ1YCQAAAGFsZ29yaXRobXEOWAUAAABicnV0ZXEPWAYAAABtZXRyaWNxEFgJAAAAbWFuaGF0dGFucRFYBgAAAF9zY2FsZXESiHUu", + "hyperpartition": { + "categorical_hyperparameters_64": "gANdcQAoWAcAAAB3ZWlnaHRzcQFYCAAAAGRpc3RhbmNlcQKGcQNYCQAAAGFsZ29yaXRobXEEWAUAAABicnV0ZXEFhnEGWAYAAABtZXRyaWNxB1gJAAAAbWFuaGF0dGFucQiGcQllLg==", + "constant_hyperparameters_64": "gANdcQBYBgAAAF9zY2FsZXEBiIZxAmEu", + "datarun_id": 1, + "id": 31, + "method": "knn", + "status": "incomplete", + "tunable_hyperparameters_64": "gANdcQBYCwAAAG5fbmVpZ2hib3JzcQFjYnRiLmh5cGVyX3BhcmFtZXRlcgpJbnRIeXBlclBhcmFtZXRlcgpxAmNidGIuaHlwZXJfcGFyYW1ldGVyClBhcmFtVHlwZXMKcQNLAYVxBFJxBV1xBihLAUsUZYZxB4FxCH1xCShYDAAAAF9wYXJhbV9yYW5nZXEKaAZYBQAAAHJhbmdlcQtdcQwoSwFLFGV1YoZxDWEu" + }, +``` + +If you would like to recover a certain classifier, we can do so by `id`: + +```bash +curl -X GET "http://127.0.0.1:5000/api/classifiers/10" -H "accept: application/json" +``` + +Where `10` is the `id` of our classifier. + +If you have the database created from our example, containing only one dataset, the output to this +call should be empty: + +```bash +{} +``` + +If you would like to delete a classifiers, you need it's `id` and run: + +```bash +curl -X DELETE --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/classifiers/16' +``` + +Where `16` is the `id` of the classifiers. diff --git a/atm/api.py b/atm/api.py index 3dfea9a..4daaa33 100644 --- a/atm/api.py +++ b/atm/api.py @@ -1,6 +1,7 @@ import os from flask import Flask +from flask import redirect from flask_restless_swagger import SwagAPIManager as APIManager from flask_sqlalchemy import SQLAlchemy @@ -24,6 +25,10 @@ def create_app(atm): # Create API endpoints, which will be available at /api/ by # default. Allowed HTTP methods can be specified as well. 
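+    # Expose the interactive API documentation at the root URL: '/' simply
+    # redirects to the Swagger UI served under /static/swagger/.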
+ @app.route('/') + def swagger(): + return redirect('/static/swagger/swagger-ui/index.html') + manager.create_api(atm.db.Dataset, methods=['GET', 'POST', 'DELETE']) manager.create_api(atm.db.Datarun, methods=['GET', 'POST', 'DELETE']) manager.create_api(atm.db.Hyperpartition, methods=['GET', 'POST', 'DELETE']) From d50916ae262c9b8a295771a6b980b0d61a478164 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 11 Apr 2019 21:11:32 +0200 Subject: [PATCH 05/44] Update the REST operations and its documentation --- API.md | 417 ++++++++++++++++++++--------------------------------- atm/api.py | 8 +- 2 files changed, 158 insertions(+), 267 deletions(-) diff --git a/API.md b/API.md index dc7c12b..b5fe0c1 100644 --- a/API.md +++ b/API.md @@ -14,13 +14,6 @@ virtualenv, and execute this command: atm server ``` -If you would like to start the server in another port, which by default it's 5000, you can include -the `--port` option to run it at the port that you would like: - -```bash -atm server --port 1234 -``` - An output similar to this one should apear in the terminal: ```bash @@ -35,56 +28,71 @@ An output similar to this one should apear in the terminal: * Debugger PIN: 150-127-826 ``` -If you now point your browser at http://127.0.0.1:5000/, you will see the documentation +After this, the REST server will be listening at the port 5000 of you machine, and if you +point your browser at http://127.0.0.1:5000/, you will see the documentation website that shows information about all the REST operations allowed by the API. -You can press Ctrl+c at any moment to stop the process, but for now +Optionally, the `--port ` can be added to modify the port which the server listents at: + +```bash +atm server --port 1234 +``` + +In order to stop the server you can press Ctrl+c, but for now you can keep it running and head to the next section. -## Usage +## Quickstart -For this example we have run `atm enter_data` with the default dataset and `atm worker` in order -to create the classifiers and to populate our database. +In this section we will briefly show the basic usage of the REST API. -By accessing the http://127.0.0.1:5000/ you will see the [Swagger](https://swagger.io/) -documentation and be able to run examples and calls to the REST API. +For more detailed information about all the operations supported by the API, please point your +browser to http://127.0.0.1:5000/ and explore the examples provided by the +[Swagger](https://swagger.io/) interface. -In the following steps we will explain how to use this **API**: +### 1. Generate some data -**ATM** REST API allows you to navigate arround the database and have access the following tables: +Before proceeding any further, please make sure the have already populated your data by triggering +at least one model tuning process. -* Dataset -* Datarun -* Hyperpartition -* Classifier +An easy way to do this is to follow the quickstart from the ATM [README.md](README.md) file, +which means having run these two commands: -### Dataset +``` +atm enter_data +atm worker +``` -In order to retrieve the information stored for a `Dataset`, the available parameters to create -an API call are as follow: +### 2. 
REST Models -* class_column (string, optional) -* d_features (integer, optional) -* dataruns (Array[Datarun], optional) -* description (string, optional) -* id (integer, optional) -* k_classes (integer, optional) -* majority (number, optional) -* n_examples (integer, optional) -* name (string, optional) -* size_kb (integer, optional) -* test_path (string, optional) -* train_path (string, optional) +Once the database is populated, you can use the REST API to explore the following 4 models: -If you are using `Unix` and you have [CURL](https://curl.haxx.se/) you can run this commands in -a separate terminal, otherwise you can access and visualize the data recived from your browser. +* Datasets +* Dataruns +* Hyperpartitions +* Classifiers -```bash -curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/datasets' +And these are the operations that can be performed on them: + +### 3. Get all objects from a model + +In order to get all the objects for a single model, you need to make a `GET` request to +`/api/`. + +The output will be a JSON with 4 entries: + +* `num_results`: The number of results found +* `objects`: A list containing a subdocument for each result +* `page`: The current page +* `total_pages`: The number of pages + +For example, you can get all the datasets using: + +``` +GET /api/datasets HTTP/1.1 ``` -This should print an output out of the database similar to this one: +And the output will be: ``` { @@ -100,7 +108,7 @@ This should print an output out of the database similar to this one: "dataset_id": 1, "deadline": null, "description": "uniform__uniform", - "end_time": "2019-04-11T17:26:28.781095", + "end_time": "2019-04-11T20:58:11.346733", "gridding": 0, "id": 1, "k_window": 3, @@ -109,7 +117,7 @@ This should print an output out of the database similar to this one: "r_minimum": 2, "score_target": "cv_judgment_metric", "selector": "uniform", - "start_time": "2019-04-11T17:25:57.192200", + "start_time": "2019-04-11T20:58:02.514514", "status": "complete", "tuner": "uniform" } @@ -122,268 +130,151 @@ This should print an output out of the database similar to this one: "name": "pollution_1", "size_kb": 8, "test_path": null, - "train_path": "/test/pollution_1.csv" + "train_path": "/home/xals/Projects/Pythia/MIT/ATM-csala/atm/data/test/pollution_1.csv" } ], "page": 1, "total_pages": 1 } - ``` -If you would like to recover a certain dataset, we can do so by `id`: +### 4. Get a single object by id -```bash -curl -X GET "http://127.0.0.1:5000/api/datasets/10" -H "accept: application/json" -``` +In order to get one particular objects for a model, you need to make a `GET` request to +`/api//`. -Where `10` is the `id` of our dataset. -If you have the database created from our example, containing only one dataset, the output to this -call should be empty: +The output will be the document representing the corresponding object. -```bash -{} -``` +For example, you can get the dataset with id 1 using: -If you would like to delete a dataset, you need it's `id` and run: - -```bash -curl -X DELETE --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/datasets/16' ``` - -Where `16` is the `id` of the dataset. 
- -### Datarun - -* budget (integer, optional), -* budget_type (string, optional), -* classifiers (Array[Classifier], optional), -* dataset (Dataset, optional), -* dataset_id (integer, optional), -* deadline (string, optional), -* description (string, optional), -* end_time (string, optional), -* gridding (integer, optional), -* hyperpartitions (Array[Hyperpartition], optional), -* id (integer, optional), -* k_window (integer, optional), -* metric (string, optional), -* priority (integer, optional), -* r_minimum (integer, optional), -* score_target (string, optional), -* selector (string, optional), -* start_time (string, optional), -* status (string, optional), -* tuner (string, optional) - -```bash -curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/dataruns' +GET /api/datasets/1 HTTP/1.1 ``` -This should print an output out of the database similar to this one: +And the output will be: ``` { - "num_results": 1, - "objects": [ + "class_column": "class", + "d_features": 16, + "dataruns": [ { "budget": 100, "budget_type": "classifier", - "classifiers": [ - { - "cv_judgment_metric": 0.7120634921, - "cv_judgment_metric_stdev": 0.1153100042, - "datarun_id": 1, - "end_time": "2019-04-11T17:25:57.412273", - "error_message": null, - "host": "83.56.245.36", - "hyperparameter_values_64": "gAN9cQAoWAsAAABuX25laWdoYm9yc3EBY251bXB5LmNvcmUubXVsdGlhcnJheQpzY2FsYXIKcQJjbnVtcHkKZHR5cGUKcQNYAgAAAGk4cQRLAEsBh3EFUnEGKEsDWAEAAAA8cQdOTk5K/////0r/////SwB0cQhiQwgSAAAAAAAAAHEJhnEKUnELWAcAAAB3ZWlnaHRzcQxYCAAAAGRpc3RhbmNlcQ1YCQAAAGFsZ29yaXRobXEOWAUAAABicnV0ZXEPWAYAAABtZXRyaWNxEFgJAAAAbWFuaGF0dGFucRFYBgAAAF9zY2FsZXESiHUu", - "hyperpartition_id": 31, - "id": 1, - "metrics_location": "metrics/pollution_1-fd916442.metric", - "model_location": "models/pollution_1-fd916442.model", - "start_time": "2019-04-11T17:25:57.273278", - "status": "complete", - "test_judgment_metric": 0.9523809524 - }, -... + "dataset_id": 1, + "deadline": null, + "description": "uniform__uniform", + "end_time": "2019-04-11T20:58:11.346733", + "gridding": 0, + "id": 1, + "k_window": 3, + "metric": "f1", + "priority": 1, + "r_minimum": 2, + "score_target": "cv_judgment_metric", + "selector": "uniform", + "start_time": "2019-04-11T20:58:02.514514", + "status": "complete", + "tuner": "uniform" + } + ], + "description": null, + "id": 1, + "k_classes": 2, + "majority": 0.516666667, + "n_examples": 60, + "name": "pollution_1", + "size_kb": 8, + "test_path": null, + "train_path": "/home/xals/Projects/Pythia/MIT/ATM-csala/atm/data/test/pollution_1.csv" +} ``` -If you would like to recover a certain datarun, we can do so by `id`: +### 5. Get all the children objects -```bash -curl -X GET "http://127.0.0.1:5000/api/dataruns/10" -H "accept: application/json" -``` +In order to get all the childre objects from one parent object, you need to make a +`GET` request to `/api///`. -Where `10` is the `id` of our dataset. -If you have the database created from our example, containing only one dataset, the output to this -call should be empty: - -```bash -{} -``` +The output will be in the same format as if you had requested all the elements from the +children model, but with the results filtered by the parent one. 
-If you would like to delete a datarun, you need it's `id` and run: +So, for example, in order to get all the dataruns that use the dataset with id 1, you can use: -```bash -curl -X DELETE --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/dataruns/16' ``` - -Where `16` is the `id` of the datarun. - -### Hyperpartition - -* categorical_hyperparameters_64 (string, optional), -* classifiers (Array[Classifier], optional), -* constant_hyperparameters_64 (string, optional), -* datarun (Datarun, optional), -* datarun_id (integer, optional), -* id (integer, optional), -* method (string, optional), -* status (string, optional), -* tunable_hyperparameters_64 (string, optional) - -```bash -curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/hyperpartitions' +GET /api/datasets/1/dataruns HTTP/1.1 ``` -This should print an output out of the database similar to this one: +And the output will be (note that some parts have been cut): ``` { - "num_results": 32, + "num_results": 1, "objects": [ { - "categorical_hyperparameters_64": "gANdcQAoWAcAAABwZW5hbHR5cQFYAgAAAGwxcQKGcQNYDQAAAGZpdF9pbnRlcmNlcHRxBIiGcQVlLg==", + "budget": 100, + "budget_type": "classifier", "classifiers": [ { - "cv_judgment_metric": 0E-10, - "cv_judgment_metric_stdev": 0E-10, + "cv_judgment_metric": 0.8444444444, + "cv_judgment_metric_stdev": 0.1507184441, "datarun_id": 1, - "end_time": "2019-04-11T17:25:58.591654", + "end_time": "2019-04-11T20:58:02.600185", "error_message": null, "host": "83.56.245.36", - "hyperparameter_values_64": "gAN9cQAoWAEAAABDcQFjbnVtcHkuY29yZS5tdWx0aWFycmF5CnNjYWxhcgpxAmNudW1weQpkdHlwZQpxA1gCAAAAZjhxBEsASwGHcQVScQYoSwNYAQAAADxxB05OTkr/////Sv////9LAHRxCGJDCJx3VDODxC8/cQmGcQpScQtYAwAAAHRvbHEMaAJoBkMIFQYn8/JBj0BxDYZxDlJxD1gHAAAAcGVuYWx0eXEQWAIAAABsMXERWA0AAABmaXRfaW50ZXJjZXB0cRKIWAwAAABjbGFzc193ZWlnaHRxE1gIAAAAYmFsYW5jZWRxFFgGAAAAX3NjYWxlcRWIdS4=", - "hyperpartition_id": 1, - "id": 7, - "metrics_location": "metrics/pollution_1-b2ac0bd8.metric", - "model_location": "models/pollution_1-b2ac0bd8.model", - "start_time": "2019-04-11T17:25:58.476363", + "hyperparameter_values_64": "gAN9cQAoWAsAAABuX25laWdoYm9yc3EBY251bXB5LmNvcmUubXVsdGlhcnJheQpzY2FsYXIKcQJjbnVtcHkKZHR5cGUKcQNYAgAAAGk4cQRLAEsBh3EFUnEGKEsDWAEAAAA8cQdOTk5K/////0r/////SwB0cQhiQwgPAAAAAAAAAHEJhnEKUnELWAkAAABsZWFmX3NpemVxDGgCaAZDCCsAAAAAAAAAcQ2GcQ5ScQ9YBwAAAHdlaWdodHNxEFgIAAAAZGlzdGFuY2VxEVgJAAAAYWxnb3JpdGhtcRJYCQAAAGJhbGxfdHJlZXETWAYAAABtZXRyaWNxFFgJAAAAbWFuaGF0dGFucRVYBgAAAF9zY2FsZXEWiHUu", + "hyperpartition_id": 23, + "id": 1, + "metrics_location": "metrics/pollution_1-4bc39b14.metric", + "model_location": "models/pollution_1-4bc39b14.model", + "start_time": "2019-04-11T20:58:02.539046", "status": "complete", - "test_judgment_metric": 0E-10 + "test_judgment_metric": 0.6250000000 }, -... -``` - -If you would like to recover a certain hyperpartition, we can do so by `id`: - -```bash -curl -X GET "http://127.0.0.1:5000/api/hyperpartition/10" -H "accept: application/json" -``` - -Where `10` is the `id` of our hyperpartition. - -If you have the database created from our example, containing only one dataset, the output to this -call should be empty: - -```bash -{} -``` - -If you would like to delete a hyperpartition, you need it's `id` and run: - -```bash -curl -X DELETE --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/hyperpartitions/16' -``` - -Where `16` is the `id` of the hyperpartition. 
- -### Classifier - -* cv_judgment_metric (number, optional), -* cv_judgment_metric_stdev (number, optional), -* datarun (Datarun, optional), -* datarun_id (integer, optional), -* end_time (string, optional), -* error_message (string, optional), -* host (string, optional), -* hyperparameter_values_64 (string, optional), -* hyperpartition (Hyperpartition, optional), -* hyperpartition_id (integer, optional), -* id (integer, optional), -* metrics_location (string, optional), -* model_location (string, optional), -* start_time (string, optional), -* status (string, optional), -* test_judgment_metric (number, optional) - -```bash -curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/classifiers' -``` - -This should print an output out of the database similar to this one: - -``` -{ - "num_results": 100, - "objects": [ - { - "cv_judgment_metric": 0.7120634921, - "cv_judgment_metric_stdev": 0.1153100042, - "datarun": { - "budget": 100, - "budget_type": "classifier", - "dataset_id": 1, - "deadline": null, - "description": "uniform__uniform", - "end_time": "2019-04-11T17:26:28.781095", - "gridding": 0, + ... + ], + "dataset": { + "class_column": "class", + "d_features": 16, + "description": null, "id": 1, - "k_window": 3, - "metric": "f1", - "priority": 1, - "r_minimum": 2, - "score_target": "cv_judgment_metric", - "selector": "uniform", - "start_time": "2019-04-11T17:25:57.192200", - "status": "complete", - "tuner": "uniform" - }, - "datarun_id": 1, - "end_time": "2019-04-11T17:25:57.412273", - "error_message": null, - "host": "83.56.245.36", - "hyperparameter_values_64": "gAN9cQAoWAsAAABuX25laWdoYm9yc3EBY251bXB5LmNvcmUubXVsdGlhcnJheQpzY2FsYXIKcQJjbnVtcHkKZHR5cGUKcQNYAgAAAGk4cQRLAEsBh3EFUnEGKEsDWAEAAAA8cQdOTk5K/////0r/////SwB0cQhiQwgSAAAAAAAAAHEJhnEKUnELWAcAAAB3ZWlnaHRzcQxYCAAAAGRpc3RhbmNlcQ1YCQAAAGFsZ29yaXRobXEOWAUAAABicnV0ZXEPWAYAAABtZXRyaWNxEFgJAAAAbWFuaGF0dGFucRFYBgAAAF9zY2FsZXESiHUu", - "hyperpartition": { - "categorical_hyperparameters_64": "gANdcQAoWAcAAAB3ZWlnaHRzcQFYCAAAAGRpc3RhbmNlcQKGcQNYCQAAAGFsZ29yaXRobXEEWAUAAABicnV0ZXEFhnEGWAYAAABtZXRyaWNxB1gJAAAAbWFuaGF0dGFucQiGcQllLg==", - "constant_hyperparameters_64": "gANdcQBYBgAAAF9zY2FsZXEBiIZxAmEu", - "datarun_id": 1, - "id": 31, - "method": "knn", - "status": "incomplete", - "tunable_hyperparameters_64": "gANdcQBYCwAAAG5fbmVpZ2hib3JzcQFjYnRiLmh5cGVyX3BhcmFtZXRlcgpJbnRIeXBlclBhcmFtZXRlcgpxAmNidGIuaHlwZXJfcGFyYW1ldGVyClBhcmFtVHlwZXMKcQNLAYVxBFJxBV1xBihLAUsUZYZxB4FxCH1xCShYDAAAAF9wYXJhbV9yYW5nZXEKaAZYBQAAAHJhbmdlcQtdcQwoSwFLFGV1YoZxDWEu" + "k_classes": 2, + "majority": 0.516666667, + "n_examples": 60, + "name": "pollution_1", + "size_kb": 8, + "test_path": null, + "train_path": "/home/xals/Projects/Pythia/MIT/ATM-csala/atm/data/test/pollution_1.csv" }, + "dataset_id": 1, + "deadline": null, + "description": "uniform__uniform", + "end_time": "2019-04-11T20:58:11.346733", + "gridding": 0, + "hyperpartitions": [ + { + "categorical_hyperparameters_64": "gANdcQAoWAcAAABwZW5hbHR5cQFYAgAAAGwxcQKGcQNYDQAAAGZpdF9pbnRlcmNlcHRxBIiGcQVlLg==", + "constant_hyperparameters_64": "gANdcQAoWAwAAABjbGFzc193ZWlnaHRxAVgIAAAAYmFsYW5jZWRxAoZxA1gGAAAAX3NjYWxlcQSIhnEFZS4=", + "datarun_id": 1, + "id": 1, + "method": "logreg", + "status": "incomplete", + "tunable_hyperparameters_64": 
"gANdcQAoWAEAAABDcQFjYnRiLmh5cGVyX3BhcmFtZXRlcgpGbG9hdEV4cEh5cGVyUGFyYW1ldGVyCnECY2J0Yi5oeXBlcl9wYXJhbWV0ZXIKUGFyYW1UeXBlcwpxA0sFhXEEUnEFXXEGKEc+5Pi1iONo8UdA+GoAAAAAAGWGcQeBcQh9cQkoWAwAAABfcGFyYW1fcmFuZ2VxCmgGWAUAAAByYW5nZXELXXEMKEfAFAAAAAAAAEdAFAAAAAAAAGV1YoZxDVgDAAAAdG9scQ5oAmgFXXEPKEc+5Pi1iONo8UdA+GoAAAAAAGWGcRCBcRF9cRIoaApoD2gLXXETKEfAFAAAAAAAAEdAFAAAAAAAAGV1YoZxFGUu" + }, + ... + ], + "id": 1, + "k_window": 3, + "metric": "f1", + "priority": 1, + "r_minimum": 2, + "score_target": "cv_judgment_metric", + "selector": "uniform", + "start_time": "2019-04-11T20:58:02.514514", + "status": "complete", + "tuner": "uniform" + } + ], + "page": 1, + "total_pages": 1 +} ``` - -If you would like to recover a certain classifier, we can do so by `id`: - -```bash -curl -X GET "http://127.0.0.1:5000/api/classifiers/10" -H "accept: application/json" -``` - -Where `10` is the `id` of our classifier. - -If you have the database created from our example, containing only one dataset, the output to this -call should be empty: - -```bash -{} -``` - -If you would like to delete a classifiers, you need it's `id` and run: - -```bash -curl -X DELETE --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/classifiers/16' -``` - -Where `16` is the `id` of the classifiers. diff --git a/atm/api.py b/atm/api.py index 4daaa33..3f91ed8 100644 --- a/atm/api.py +++ b/atm/api.py @@ -29,9 +29,9 @@ def create_app(atm): def swagger(): return redirect('/static/swagger/swagger-ui/index.html') - manager.create_api(atm.db.Dataset, methods=['GET', 'POST', 'DELETE']) - manager.create_api(atm.db.Datarun, methods=['GET', 'POST', 'DELETE']) - manager.create_api(atm.db.Hyperpartition, methods=['GET', 'POST', 'DELETE']) - manager.create_api(atm.db.Classifier, methods=['GET', 'POST', 'DELETE']) + manager.create_api(atm.db.Dataset, methods=['GET']) + manager.create_api(atm.db.Datarun, methods=['GET']) + manager.create_api(atm.db.Hyperpartition, methods=['GET']) + manager.create_api(atm.db.Classifier, methods=['GET']) return app From 2e1de81dc9314f23c71f95a9e4fed45a6262af34 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 11 Apr 2019 21:17:40 +0200 Subject: [PATCH 06/44] Remove personal paths --- API.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/API.md b/API.md index b5fe0c1..c679dc9 100644 --- a/API.md +++ b/API.md @@ -130,7 +130,7 @@ And the output will be: "name": "pollution_1", "size_kb": 8, "test_path": null, - "train_path": "/home/xals/Projects/Pythia/MIT/ATM-csala/atm/data/test/pollution_1.csv" + "train_path": "/path/to/atm/data/test/pollution_1.csv" } ], "page": 1, @@ -186,7 +186,7 @@ And the output will be: "name": "pollution_1", "size_kb": 8, "test_path": null, - "train_path": "/home/xals/Projects/Pythia/MIT/ATM-csala/atm/data/test/pollution_1.csv" + "train_path": "/path/to/atm/data/test/pollution_1.csv" } ``` @@ -243,7 +243,7 @@ And the output will be (note that some parts have been cut): "name": "pollution_1", "size_kb": 8, "test_path": null, - "train_path": "/home/xals/Projects/Pythia/MIT/ATM-csala/atm/data/test/pollution_1.csv" + "train_path": "/path/to/atm/data/test/pollution_1.csv" }, "dataset_id": 1, "deadline": null, From 676a8cffb70599e0c395f5987520958b4fcc18cd Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 11 Apr 2019 21:31:55 +0200 Subject: [PATCH 07/44] Fix import order --- atm/api.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/atm/api.py b/atm/api.py index 3f91ed8..9d8d8b3 100644 --- a/atm/api.py +++ b/atm/api.py @@ -1,7 +1,6 
@@ import os -from flask import Flask -from flask import redirect +from flask import Flask, redirect from flask_restless_swagger import SwagAPIManager as APIManager from flask_sqlalchemy import SQLAlchemy From a0d09f02a8f6af83f52231cfe4f02e545709f1b0 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 11 Apr 2019 22:01:22 +0200 Subject: [PATCH 08/44] Add link to API.md on the README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 18464c1..7b9d2ea 100644 --- a/README.md +++ b/README.md @@ -311,6 +311,15 @@ database for its datarun! --aws-config config/aws.yaml \ ``` + +## REST API Server + +**ATM** comes with the possibility to start a server process that enables interacting with +the ModelHub Database via a REST API server that runs over [flask](http://flask.pocoo.org/). + +For more details about how to start and use this REST API please check the [API.md](API.md) document. + + From 0121da3e338e118816e90c80fd35773401d46230 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 25 Apr 2019 16:35:46 +0200 Subject: [PATCH 09/44] Add travis.yaml, change docs structure. --- .travis.yml | 27 ++ docs/Makefile | 229 +-------------- docs/{source => }/add_method.rst | 0 docs/{source => }/add_to_btb.rst | 0 docs/conf.py | 171 +++++++++++ docs/{source => }/contributing.rst | 0 docs/{source => }/database.rst | 0 docs/{source => }/index.rst | 0 docs/{source => }/introduction.rst | 0 docs/{source => }/quickstart.rst | 0 docs/{source => }/setup.rst | 0 docs/source/conf.py | 437 ----------------------------- docs/{source => }/tutorial.rst | 0 setup.cfg | 6 +- tox.ini | 9 +- 15 files changed, 218 insertions(+), 661 deletions(-) create mode 100644 .travis.yml rename docs/{source => }/add_method.rst (100%) rename docs/{source => }/add_to_btb.rst (100%) create mode 100644 docs/conf.py rename docs/{source => }/contributing.rst (100%) rename docs/{source => }/database.rst (100%) rename docs/{source => }/index.rst (100%) rename docs/{source => }/introduction.rst (100%) rename docs/{source => }/quickstart.rst (100%) rename docs/{source => }/setup.rst (100%) delete mode 100644 docs/source/conf.py rename docs/{source => }/tutorial.rst (100%) diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..f3df730 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,27 @@ +# Config file for automatic testing at travis-ci.org +language: python +dist: xenial +python: + - 2.7 + - 3.5 + - 3.6 + +# Command to install dependencies +install: pip install -U tox-travis codecov + +# Command to run tests +script: tox + +after_success: codecov + +deploy: + + - provider: pages + skip-cleanup: true + github-token: "$GITHUB_TOKEN" + keep-history: true + local-dir: docs/_build/html + target-branch: gh-pages + on: + branch: master + python: 3.6 diff --git a/docs/Makefile b/docs/Makefile index 330f546..4e63b04 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,225 +1,20 @@ -# Makefile for Sphinx documentation +# Minimal makefile for Sphinx documentation # # You can set these variables from the command line. SPHINXOPTS = -SPHINXBUILD = sphinx-build -PAPER = -BUILDDIR = ./build +SPHINXBUILD = python -msphinx +SPHINXPROJ = stegdetect +SOURCEDIR = . +BUILDDIR = _build -# Internal variables. 
-PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source -# the i18n builder cannot share the environment and doctrees with the others -I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help +# Put it first so that "make" without argument is like "make help". help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " dirhtml to make HTML files named index.html in directories" - @echo " singlehtml to make a single large HTML file" - @echo " pickle to make pickle files" - @echo " json to make JSON files" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " qthelp to make HTML files and a qthelp project" - @echo " applehelp to make an Apple Help Book" - @echo " devhelp to make HTML files and a Devhelp project" - @echo " epub to make an epub" - @echo " epub3 to make an epub3" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " latexpdf to make LaTeX files and run them through pdflatex" - @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" - @echo " text to make text files" - @echo " man to make manual pages" - @echo " texinfo to make Texinfo files" - @echo " info to make Texinfo files and run them through makeinfo" - @echo " gettext to make PO message catalogs" - @echo " changes to make an overview of all changed/added/deprecated items" - @echo " xml to make Docutils-native XML files" - @echo " pseudoxml to make pseudoxml-XML files for display purposes" - @echo " linkcheck to check all external links for integrity" - @echo " doctest to run all doctests embedded in the documentation (if enabled)" - @echo " coverage to run coverage check of the documentation (if enabled)" - @echo " dummy to check syntax errors of document sources" - -.PHONY: clean -clean: - rm -rf $(BUILDDIR) - -.PHONY: html -html: - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR) - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)" - -.PHONY: dirhtml -dirhtml: - $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml - @echo - @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." - -.PHONY: singlehtml -singlehtml: - $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml - @echo - @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." - -.PHONY: pickle -pickle: - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle - @echo - @echo "Build finished; now you can process the pickle files." - -.PHONY: json -json: - $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json - @echo - @echo "Build finished; now you can process the JSON files." - -.PHONY: htmlhelp -htmlhelp: - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in $(BUILDDIR)/htmlhelp." - -.PHONY: qthelp -qthelp: - $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp - @echo - @echo "Build finished; now you can run "qcollectiongenerator" with the" \ - ".qhcp project file in $(BUILDDIR)/qthelp, like this:" - @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/atm.qhcp" - @echo "To view the help file:" - @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/atm.qhc" - -.PHONY: applehelp -applehelp: - $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp - @echo - @echo "Build finished. 
The help book is in $(BUILDDIR)/applehelp." - @echo "N.B. You won't be able to view it unless you put it in" \ - "~/Library/Documentation/Help or install it in your application" \ - "bundle." - -.PHONY: devhelp -devhelp: - $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp - @echo - @echo "Build finished." - @echo "To view the help file:" - @echo "# mkdir -p $$HOME/.local/share/devhelp/atm" - @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/atm" - @echo "# devhelp" - -.PHONY: epub -epub: - $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub - @echo - @echo "Build finished. The epub file is in $(BUILDDIR)/epub." - -.PHONY: epub3 -epub3: - $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 - @echo - @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." - -.PHONY: latex -latex: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo - @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." - @echo "Run \`make' in that directory to run these through (pdf)latex" \ - "(use \`make latexpdf' here to do that automatically)." - -.PHONY: latexpdf -latexpdf: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through pdflatex..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: latexpdfja -latexpdfja: - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex - @echo "Running LaTeX files through platex and dvipdfmx..." - $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja - @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." - -.PHONY: text -text: - $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text - @echo - @echo "Build finished. The text files are in $(BUILDDIR)/text." - -.PHONY: man -man: - $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man - @echo - @echo "Build finished. The manual pages are in $(BUILDDIR)/man." - -.PHONY: texinfo -texinfo: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo - @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." - @echo "Run \`make' in that directory to run these through makeinfo" \ - "(use \`make info' here to do that automatically)." - -.PHONY: info -info: - $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo - @echo "Running Texinfo files through makeinfo..." - make -C $(BUILDDIR)/texinfo info - @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." - -.PHONY: gettext -gettext: - $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale - @echo - @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." - -.PHONY: changes -changes: - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes - @echo - @echo "The overview file is in $(BUILDDIR)/changes." - -.PHONY: linkcheck -linkcheck: - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in $(BUILDDIR)/linkcheck/output.txt." - -.PHONY: doctest -doctest: - $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest - @echo "Testing of doctests in the sources finished, look at the " \ - "results in $(BUILDDIR)/doctest/output.txt." - -.PHONY: coverage -coverage: - $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage - @echo "Testing of coverage in the sources finished, look at the " \ - "results in $(BUILDDIR)/coverage/python.txt." 
- -.PHONY: xml -xml: - $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml - @echo - @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -.PHONY: pseudoxml -pseudoxml: - $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml - @echo - @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." +.PHONY: help Makefile -.PHONY: dummy -dummy: - $(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy - @echo - @echo "Build finished. Dummy builder generates no files." +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/add_method.rst b/docs/add_method.rst similarity index 100% rename from docs/source/add_method.rst rename to docs/add_method.rst diff --git a/docs/source/add_to_btb.rst b/docs/add_to_btb.rst similarity index 100% rename from docs/source/add_to_btb.rst rename to docs/add_to_btb.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..a69888c --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,171 @@ +# -*- coding: utf-8 -*- +# +# atm documentation build configuration file, created by +# sphinx-quickstart on Fri Jan 6 13:06:48 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. + +import sphinx_rtd_theme # For read the docs theme + +import atm + +# -- General configuration --------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. +extensions = [ + 'm2r', + 'sphinx.ext.autodoc', + 'sphinx.ext.githubpages', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + + + +# General information about the project. +project = 'ATM' +slug = 'atm' +title = project + ' Documentation' +copyright = '2019, MIT Data to AI Lab' +author = 'Thomas Swearingen, Kalyan Veeramachaneni, Bennett Cyphers' +description = 'ATM: Auto Tune Models' +user = 'HDI-project' + +# The version info for the project you're documenting, acts as replacement +# for |version| and |release|, also used in various other places throughout +# the built documents. +# +# The short X.Y version. +version = atm.__version__ +# The full version, including alpha/beta/rc tags. +release = atm.__version__ + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. 
+# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['.py', '_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Readthedocs additions +html_context = { + 'display_github': True, + 'github_user': user, + 'github_repo': project, + 'github_version': 'master', + 'conf_py_path': '/docs/', +} + +# Theme options are theme-specific and customize the look and feel of a +# theme further. For a list of options available for each theme, see the +# documentation. +html_theme_options = { + 'collapse_navigation': False, + 'display_version': False, +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +html_favicon = 'images/favicon.ico' + +# -- Options for HTMLHelp output --------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = slug + 'doc' + + +# -- Options for LaTeX output ------------------------------------------ + +latex_elements = { +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, author, documentclass +# [howto, manual, or own class]). +latex_documents = [( + master_doc, + slug + '.tex', + title, + author, + 'manual' +)] + + +# -- Options for manual page output ------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [( + master_doc, + slug, + title, + [author], + 1 +)] + + +# -- Options for Texinfo output ---------------------------------------- + +# Grouping the document tree into Texinfo files. 
List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [( + master_doc, + slug, + title, + author, + slug, + description, + 'Miscellaneous' +)] diff --git a/docs/source/contributing.rst b/docs/contributing.rst similarity index 100% rename from docs/source/contributing.rst rename to docs/contributing.rst diff --git a/docs/source/database.rst b/docs/database.rst similarity index 100% rename from docs/source/database.rst rename to docs/database.rst diff --git a/docs/source/index.rst b/docs/index.rst similarity index 100% rename from docs/source/index.rst rename to docs/index.rst diff --git a/docs/source/introduction.rst b/docs/introduction.rst similarity index 100% rename from docs/source/introduction.rst rename to docs/introduction.rst diff --git a/docs/source/quickstart.rst b/docs/quickstart.rst similarity index 100% rename from docs/source/quickstart.rst rename to docs/quickstart.rst diff --git a/docs/source/setup.rst b/docs/setup.rst similarity index 100% rename from docs/source/setup.rst rename to docs/setup.rst diff --git a/docs/source/conf.py b/docs/source/conf.py deleted file mode 100644 index f81232f..0000000 --- a/docs/source/conf.py +++ /dev/null @@ -1,437 +0,0 @@ -# -*- coding: utf-8 -*- -# -# atm documentation build configuration file, created by -# sphinx-quickstart on Fri Jan 6 13:06:48 2017. -# -# This file is execfile()d with the current directory set to its -# containing dir. -# -# Note that not all possible configuration values are present in this -# autogenerated file. -# -# All configuration values have a default; values that are commented out -# serve to show the default. - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. - -import os -import sys -import sphinx_rtd_theme # For read the docs theme -sys.path.insert(0, os.path.abspath('.')) -sys.path.insert(0, os.path.abspath('..')) -sys.path.insert(0, os.path.abspath('../..')) - - -# -- General configuration ------------------------------------------------ - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.mathjax', - 'sphinx.ext.githubpages', -] - -autosummary_generate = True - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The encoding of source files. -# -# source_encoding = 'utf-8-sig' - -# The master toctree document. -master_doc = 'index' - -# General information about the project. -project = u'atm' -copyright = u'MIT Data to AI Lab' -author = u'Thomas Swearingen, Kalyan Veeramachaneni, Bennett Cyphers' - -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# -# The short X.Y version. -version = u'0.0.1' -# The full version, including alpha/beta/rc tags. -release = u'0.0.1' - -# The language for content autogenerated by Sphinx. 
Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# -# today = '' -# -# Else, today_fmt is used as the format for a strftime call. -# -# today_fmt = '%B %d, %Y' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This patterns also effect to html_static_path and html_extra_path -exclude_patterns = [] - -# The reST default role (used for this markup: `text`) to use for all -# documents. -# -# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -# -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# -# show_authors = False - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = 'sphinx' - -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = False - - -# -- Options for HTML output ---------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = 'nature' - -#### FOR READ THE DOCS THEME (optional) -html_theme = "sphinx_rtd_theme" -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] - -# The name for this set of Sphinx documents. -# " v documentation" by default. -# -# html_title = u'atm v0.9' - -# A shorter title for the navigation bar. Default is the same as html_title. -# -# html_short_title = None - -# The name of an image file (relative to this directory) to place at the top -# of the sidebar. -# -# html_logo = None - -# The name of an image file (relative to this directory) to use as a favicon of -# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 -# pixels large. -# -# html_favicon = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] - -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -# -# html_extra_path = [] - -# If not None, a 'Last updated on:' timestamp is inserted at every page -# bottom, using the given strftime format. -# The empty string is equivalent to '%b %d, %Y'. 
-# -# html_last_updated_fmt = None - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# -# html_additional_pages = {} - -# If false, no module index is generated. -# -# html_domain_indices = True - -# If false, no index is generated. -# -# html_use_index = True - -# If true, the index is split into individual pages for each letter. -# -# html_split_index = False - -# If true, links to the reST sources are added to the pages. -# -# html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# -# html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# -# html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# -# html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = None - -# Language to be used for generating the HTML full-text search index. -# Sphinx supports the following languages: -# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' -# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr', 'zh' -# -# html_search_language = 'en' - -# A dictionary with options for the search language support, empty by default. -# 'ja' uses this config value. -# 'zh' user can custom change `jieba` dictionary path. -# -# html_search_options = {'type': 'default'} - -# The name of a javascript file (relative to the configuration directory) that -# implements a search results scorer. If empty, the default will be used. -# -# html_search_scorer = 'scorer.js' - -# Output file base name for HTML help builder. -htmlhelp_basename = 'atmdoc' - -# -- Options for LaTeX output --------------------------------------------- - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'atm.tex', u'ATM Documentation', - author, 'manual'), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# -# latex_use_parts = False - -# If true, show page references after internal links. -# -# latex_show_pagerefs = False - -# If true, show URL addresses after external links. -# -# latex_show_urls = False - -# Documents to append as an appendix to all manuals. -# -# latex_appendices = [] - -# It false, will not define \strong, \code, itleref, \crossref ... but only -# \sphinxstrong, ..., \sphinxtitleref, ... To help avoid clash with user added -# packages. -# -# latex_keep_old_macro_names = True - -# If false, no module index is generated. 
-# -# latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'atm', u'ATM Documentation', - author.split(','), 1) -] - -# If true, show URL addresses after external links. -# -# man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'atm', u'ATM Documentation', - author, 'atm', 'One line description of project.', - 'Miscellaneous'), -] - -# Documents to append as an appendix to all manuals. -# -# texinfo_appendices = [] - -# If false, no module index is generated. -# -# texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# -# texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -# -# texinfo_no_detailmenu = False - - -# -- Options for Epub output ---------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = project -epub_author = author -epub_publisher = author -epub_copyright = copyright - -# The basename for the epub file. It defaults to the project name. -# epub_basename = project - -# The HTML theme for the epub output. Since the default themes are not -# optimized for small screen space, using the same theme for HTML and epub -# output is usually not wise. This defaults to 'epub', a theme designed to save -# visual space. -# -# epub_theme = 'epub' - -# The language of the text. It defaults to the language option -# or 'en' if the language is not set. -# -# epub_language = '' - -# The scheme of the identifier. Typical schemes are ISBN or URL. -# epub_scheme = '' - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. -# -# epub_uid = '' - -# A tuple containing the cover image and cover page html template filenames. -# -# epub_cover = () - -# A sequence of (type, uri, title) tuples for the guide element of content.opf. -# -# epub_guide = () - -# HTML files that should be inserted before the pages created by sphinx. -# The format is a list of tuples containing the path and title. -# -# epub_pre_files = [] - -# HTML files that should be inserted after the pages created by sphinx. -# The format is a list of tuples containing the path and title. -# -# epub_post_files = [] - -# A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] - -# The depth of the table of contents in toc.ncx. -# -# epub_tocdepth = 3 - -# Allow duplicate toc entries. -# -# epub_tocdup = True - -# Choose between 'default' and 'includehidden'. -# -# epub_tocscope = 'default' - -# Fix unsupported image types using the Pillow. -# -# epub_fix_images = False - -# Scale large images. -# -# epub_max_image_width = 0 - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# -# epub_show_urls = 'inline' - -# If false, no index is generated. 
-# -# epub_use_index = True diff --git a/docs/source/tutorial.rst b/docs/tutorial.rst similarity index 100% rename from docs/source/tutorial.rst rename to docs/tutorial.rst diff --git a/setup.cfg b/setup.cfg index ab06c92..f47bc4c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,13 +3,13 @@ current_version = 0.1.2-dev commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\-(?P[a-z]+))? -serialize = +serialize = {major}.{minor}.{patch}-{release} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = release -values = +values = dev release @@ -26,7 +26,7 @@ universal = 1 [flake8] max-line-length = 99 -exclude = docs, .git, __pycache__, .ipynb_checkpoints +exclude = docs, .tox, .git, __pycache__, .ipynb_checkpoints ignore = # Keep empty to prevent default ignores [isort] diff --git a/tox.ini b/tox.ini index dca9da6..529610a 100644 --- a/tox.ini +++ b/tox.ini @@ -5,17 +5,17 @@ envlist = py27, py35, py36, docs, lint [travis] python = 3.6: py36, docs, lint - 3.5: py35 + 3.5: py35, 2.7: py27 [testenv] +passenv = CI TRAVIS TRAVIS_* setenv = PYTHONPATH = {toxinidir} -deps = - .[dev] +extras = tests commands = - /usr/bin/env python setup.py test + /usr/bin/env make test [testenv:lint] @@ -26,5 +26,6 @@ commands = [testenv:docs] skipsdist = true +extras = dev commands = /usr/bin/env make docs From c055a05ba8f689f7d343b3df294f2483d5a0b744 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 25 Apr 2019 19:27:35 +0200 Subject: [PATCH 10/44] No longer use of ATM instance, db instance instead. --- atm/api.py | 15 +++-- atm/cli.py | 168 ++++++++++++++++++++++++++++++++++++-------------- atm/config.py | 28 +++------ atm/models.py | 57 ++++++++--------- 4 files changed, 162 insertions(+), 106 deletions(-) diff --git a/atm/api.py b/atm/api.py index 9d8d8b3..e4e8f42 100644 --- a/atm/api.py +++ b/atm/api.py @@ -12,14 +12,13 @@ def make_absolute(url): return url -def create_app(atm): +def create_app(db): app = Flask(__name__) app.config['DEBUG'] = True - app.config['SQLALCHEMY_DATABASE_URI'] = make_absolute(atm.db.engine.url) - db = SQLAlchemy(app) + app.config['SQLALCHEMY_DATABASE_URI'] = make_absolute(db.engine.url) # Create the Flask-Restless API manager. - manager = APIManager(app, flask_sqlalchemy_db=db) + manager = APIManager(app, flask_sqlalchemy_db=SQLAlchemy(app)) # Create API endpoints, which will be available at /api/ by # default. Allowed HTTP methods can be specified as well. 
@@ -28,9 +27,9 @@ def create_app(atm): def swagger(): return redirect('/static/swagger/swagger-ui/index.html') - manager.create_api(atm.db.Dataset, methods=['GET']) - manager.create_api(atm.db.Datarun, methods=['GET']) - manager.create_api(atm.db.Hyperpartition, methods=['GET']) - manager.create_api(atm.db.Classifier, methods=['GET']) + manager.create_api(db.Dataset, methods=['GET']) + manager.create_api(db.Datarun, methods=['GET']) + manager.create_api(db.Hyperpartition, methods=['GET']) + manager.create_api(db.Classifier, methods=['GET']) return app diff --git a/atm/cli.py b/atm/cli.py index 6b33b3c..c67fa7f 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -2,39 +2,107 @@ import argparse import glob +import logging import os import shutil +import socket +import time +from multiprocessing import Pool, Process, Queue from atm.api import create_app from atm.config import ( - add_arguments_aws_s3, add_arguments_datarun, add_arguments_logging, add_arguments_sql) + add_arguments_aws_s3, add_arguments_datarun, add_arguments_logging, add_arguments_sql, + load_config) +from atm.database import Database from atm.models import ATM +from atm.worker import ClassifierError, Worker +LOGGER = logging.getLogger(__name__) -def _end_to_end_test(args): - """End to end test""" +def _get_db(args): + db_args = { + k[4:]: v + for k, v in vars(args).items() + if k.startswith('sql_') and v is not None + } + return Database(**db_args) -def _work(args): - atm = ATM(**vars(args)) - atm.work( - datarun_ids=args.dataruns, - choose_randomly=args.choose_randomly, - save_files=args.save_files, - cloud_mode=args.cloud_mode, - total_time=args.time, - wait=False - ) def _serve(args): - atm = ATM(**vars(args)) - app = create_app(atm) + db = _get_db(args) + app = create_app(db) app.run(host=args.host, port=args.port) +def _get_next_datarun(db): + dataruns = db.get_dataruns(ignore_complete=True) + if dataruns: + max_priority = max([datarun.priority for datarun in dataruns]) + priority_runs = [r for r in dataruns if r.priority == max_priority] + return priority_runs[0] + + +def _process_datarun(args, queue): + run_conf, aws_conf, log_conf = load_config(**vars(args)) + db = _get_db(args) + + while True: + datarun_id = queue.get(True) + + dataruns = db.get_dataruns(include_ids=[datarun_id]) + if dataruns: + datarun = dataruns[0] + + worker = Worker(db, datarun, save_files=args.save_files, + cloud_mode=args.cloud_mode, aws_config=aws_conf, + log_config=log_conf, public_ip=socket.gethostname()) + + try: + worker.run_classifier() + + except ClassifierError: + # the exception has already been handled; just wait a sec so we + # don't go out of control reporting errors + LOGGER.warning('Something went wrong. 
Sleeping %d seconds.', 1) + time.sleep(1) + + +def _worker_loop(args): + db = _get_db(args) + + queue = Queue(1) + LOGGER.info('Starting %s worker processes', args.workers) + with Pool(args.workers, _process_datarun, (args, queue, )): + while True: + datarun = _get_next_datarun(db) + + if not datarun: + time.sleep(1) + continue + + LOGGER.warning('Processing datarun %d', datarun.id) + db.mark_datarun_running(datarun.id) + + queue.put(datarun.id) + + +def _start(args): + if args.server: + LOGGER.info('Starting the REST API server') + process = Process(target=_serve, args=(args, )) + process.daemon = True + process.start() + + _worker_loop(args) + + def _enter_data(args): - atm = ATM(**vars(args)) + db = _get_db(args) + run_conf, aws_conf, log_conf = load_config(**vars(args)) + atm = ATM(db, run_conf, aws_conf, log_conf) + atm.enter_data() @@ -60,6 +128,8 @@ def _add_common_arguments(parser): def _get_parser(): parent = argparse.ArgumentParser(add_help=False) + parent.add_argument('-v', '--verbose', action='count', default=0) + parent.add_argument('-l', '--logfile') parser = argparse.ArgumentParser(description='ATM Command Line Interface') @@ -74,48 +144,56 @@ def _get_parser(): enter_data.add_argument('--run-per-partition', default=False, action='store_true', help='if set, generate a new datarun for each hyperpartition') - # Worker - worker = subparsers.add_parser('worker', parents=[parent]) - worker.set_defaults(action=_work) - _add_common_arguments(worker) - worker.add_argument('--cloud-mode', action='store_true', default=False, - help='Whether to run this worker in cloud mode') - - worker.add_argument('--dataruns', help='Only train on dataruns with these ids', nargs='+') - worker.add_argument('--time', help='Number of seconds to run worker', type=int) - worker.add_argument('--choose-randomly', action='store_true', - help='Choose dataruns to work on randomly (default = sequential order)') - - worker.add_argument('--no-save', dest='save_files', default=True, - action='store_const', const=False, - help="don't save models and metrics at all") - - # Server - server = subparsers.add_parser('server', parents=[parent]) - server.set_defaults(action=_serve) - server.add_argument('--host', help='IP to listen at') - server.add_argument('--port', help='Port to listen at', type=int) + # Start + start = subparsers.add_parser('start', parents=[parent]) + start.set_defaults(action=_start) + _add_common_arguments(start) + start.add_argument('--cloud-mode', action='store_true', default=False, + help='Whether to run this worker in cloud mode') + start.add_argument('--no-save', dest='save_files', default=True, + action='store_const', const=False, + help="don't save models and metrics at all") + start.add_argument('-w', '--workers', default=1, type=int, help='Number of workers') + + start.add_argument('--server', action='store_true', + help='Also start the REST server') + start.add_argument('--host', help='IP to listen at') + start.add_argument('--port', help='Port to listen at', type=int) # Make Config make_config = subparsers.add_parser('make_config', parents=[parent]) make_config.set_defaults(action=_make_config) - # End to end test - end_to_end = subparsers.add_parser('end_to_end', parents=[parent]) - end_to_end.set_defaults(action=_end_to_end_test) - end_to_end.add_argument('--processes', help='number of processes to run concurrently', - type=int, default=4) + return parser - end_to_end.add_argument('--total-time', help='Total time for each worker to work in seconds.', - type=int, default=None) - return 
parser +def _logging_setup(verbosity=1, logfile=None): + logger = logging.getLogger() + log_level = (3 - verbosity) * 10 + fmt = '%(asctime)s - %(process)d - %(levelname)s - %(module)s - %(message)s' + formatter = logging.Formatter(fmt) + logger.setLevel(log_level) + logger.propagate = False + + if logfile: + file_handler = logging.FileHandler(logfile) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + else: + console_handler = logging.StreamHandler() + console_handler.setLevel(log_level) + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) def main(): parser = _get_parser() args = parser.parse_args() + _logging_setup(args.verbose, args.logfile) + if not args.action: parser.print_help() parser.exit() diff --git a/atm/config.py b/atm/config.py index 7a39961..dc570d4 100644 --- a/atm/config.py +++ b/atm/config.py @@ -319,9 +319,9 @@ def add_arguments_sql(parser): # All of these arguments must start with --sql-, and must correspond to # keys present in the SQL config example file. - parser.add_argument('--sql-dialect', choices=SQL_DIALECTS, + parser.add_argument('--sql-dialect', choices=SQL_DIALECTS, default='sqlite', help='Dialect of SQL to use') - parser.add_argument('--sql-database', + parser.add_argument('--sql-database', default='atm.db', help='Name of, or path to, SQL database') parser.add_argument('--sql-username', help='Username for SQL database') parser.add_argument('--sql-password', help='Password for SQL database') @@ -494,23 +494,18 @@ def add_arguments_datarun(parser): return parser -def load_config(sql_path=None, run_path=None, aws_path=None, log_path=None, **kwargs): +def load_config(**kwargs): """ Load config objects from yaml files and command line arguments. Command line args override yaml files where applicable. Args: - sql_path: path to .yaml file with SQL configuration - run_path: path to .yaml file with Dataset and Datarun configuration - aws_path: path to .yaml file with AWS configuration - log_path: path to .yaml file with logging configuration **kwargs: miscellaneous arguments specifying individual configuration parameters. Any kwargs beginning with sql_ are SQL config arguments, any beginning with aws_ are AWS config. 
- Returns: sql_conf, run_conf, aws_conf, log_conf + Returns: run_conf, aws_conf, log_conf """ - sql_args = {} run_args = {} aws_args = {} log_args = {} @@ -521,15 +516,11 @@ def load_config(sql_path=None, run_path=None, aws_path=None, log_path=None, **kw kwargs = {k: v for k, v in list(kwargs.items()) if v is not None} # check the keyword args for config paths - sql_path = sql_path or kwargs.get('sql_config') - run_path = run_path or kwargs.get('run_config') - aws_path = aws_path or kwargs.get('aws_config') - log_path = log_path or kwargs.get('log_config') + run_path = kwargs.get('run_config') + aws_path = kwargs.get('aws_config') + log_path = kwargs.get('log_config') # load any yaml config files for which paths were provided - if sql_path: - with open(sql_path) as f: - sql_args = yaml.load(f) if run_path: with open(run_path) as f: @@ -544,8 +535,6 @@ def load_config(sql_path=None, run_path=None, aws_path=None, log_path=None, **kw log_args = yaml.load(f) # Use keyword args to override yaml config values - sql_args.update({k.replace('sql_', ''): v for k, v in list(kwargs.items()) - if 'sql_' in k}) aws_args.update({k.replace('aws_', ''): v for k, v in list(kwargs.items()) if 'aws_' in k}) run_args.update({k: v for k, v in list(kwargs.items()) if k in @@ -556,9 +545,8 @@ def load_config(sql_path=None, run_path=None, aws_path=None, log_path=None, **kw # It's ok if there are some extra arguments that get passed in here; only # kwargs that correspond to real config values will be stored on the config # objects. - sql_conf = SQLConfig(**sql_args) aws_conf = AWSConfig(**aws_args) run_conf = RunConfig(**run_args) log_conf = LogConfig(**log_args) - return sql_conf, run_conf, aws_conf, log_conf + return run_conf, aws_conf, log_conf diff --git a/atm/models.py b/atm/models.py index 399919c..69f792d 100644 --- a/atm/models.py +++ b/atm/models.py @@ -10,16 +10,13 @@ from past.utils import old_div -from atm.config import initialize_logging, load_config -from atm.constants import PROJECT_ROOT, TIME_FMT, PartitionStatus -from atm.database import Database +from atm.constants import TIME_FMT, PartitionStatus from atm.encoder import MetaData from atm.method import Method from atm.utilities import download_data, get_public_ip from atm.worker import ClassifierError, Worker -# load the library-wide logger -logger = logging.getLogger('atm') +LOGGER = logging.getLogger(__name__) class ATM(object): @@ -29,17 +26,11 @@ class ATM(object): LOOP_WAIT = 1 - def __init__(self, **kwargs): - - if kwargs.get('log_config') is None: - kwargs['log_config'] = os.path.join(PROJECT_ROOT, - 'config/templates/log-script.yaml') - - self.sql_conf, self.run_conf, self.aws_conf, self.log_conf = load_config(**kwargs) - - self.db = Database(**vars(self.sql_conf)) - - initialize_logging(self.log_conf) + def __init__(self, db, run_conf, aws_conf, log_conf): + self.db = db + self.run_conf = run_conf + self.aws_conf = aws_conf + self.log_conf = log_conf def work(self, datarun_ids=None, save_files=False, choose_randomly=True, cloud_mode=False, total_time=None, wait=True): @@ -71,13 +62,13 @@ def work(self, datarun_ids=None, save_files=False, choose_randomly=True, dataruns = self.db.get_dataruns(include_ids=datarun_ids, ignore_complete=True) if not dataruns: if wait: - logger.warning('No dataruns found. Sleeping %d seconds and trying again.', + LOGGER.warning('No dataruns found. Sleeping %d seconds and trying again.', ATM.LOOP_WAIT) time.sleep(ATM.LOOP_WAIT) continue else: - logger.warning('No dataruns found. 
Exiting.') + LOGGER.warning('No dataruns found. Exiting.') break max_priority = max([datarun.priority for datarun in dataruns]) @@ -92,7 +83,7 @@ def work(self, datarun_ids=None, save_files=False, choose_randomly=True, # say we've started working on this datarun, if we haven't already self.db.mark_datarun_running(run.id) - logger.info('Computing on datarun %d' % run.id) + LOGGER.info('Computing on datarun %d' % run.id) # actual work happens here worker = Worker(self.db, run, save_files=save_files, cloud_mode=cloud_mode, aws_config=self.aws_conf, @@ -103,12 +94,12 @@ def work(self, datarun_ids=None, save_files=False, choose_randomly=True, except ClassifierError: # the exception has already been handled; just wait a sec so we # don't go out of control reporting errors - logger.warning('Something went wrong. Sleeping %d seconds.', ATM.LOOP_WAIT) + LOGGER.warning('Something went wrong. Sleeping %d seconds.', ATM.LOOP_WAIT) time.sleep(ATM.LOOP_WAIT) elapsed_time = (datetime.now() - start_time).total_seconds() if total_time is not None and elapsed_time >= total_time: - logger.warning('Total run time for worker exceeded; exiting.') + LOGGER.warning('Total run time for worker exceeded; exiting.') break def create_dataset(self): @@ -197,16 +188,16 @@ def enter_data(self, run_per_partition=False): # enumerate all combinations of categorical variables for this method method = Method(m) method_parts[m] = method.get_hyperpartitions() - logger.info('method %s has %d hyperpartitions' % + LOGGER.info('method %s has %d hyperpartitions' % (m, len(method_parts[m]))) # create hyperpartitions and datarun(s) run_ids = [] if not run_per_partition: - logger.debug('saving datarun...') + LOGGER.debug('saving datarun...') datarun = self.create_datarun(dataset) - logger.debug('saving hyperpartions...') + LOGGER.debug('saving hyperpartions...') for method, parts in list(method_parts.items()): for part in parts: # if necessary, create a new datarun for each hyperpartition. @@ -223,19 +214,19 @@ def enter_data(self, run_per_partition=False): categoricals=part.categoricals, status=PartitionStatus.INCOMPLETE) - logger.info('Data entry complete. Summary:') - logger.info('\tDataset ID: %d', dataset.id) - logger.info('\tTraining data: %s', dataset.train_path) - logger.info('\tTest data: %s', (dataset.test_path or 'None')) + LOGGER.info('Data entry complete. Summary:') + LOGGER.info('\tDataset ID: %d', dataset.id) + LOGGER.info('\tTraining data: %s', dataset.train_path) + LOGGER.info('\tTest data: %s', (dataset.test_path or 'None')) if run_per_partition: - logger.info('\tDatarun IDs: %s', ', '.join(map(str, run_ids))) + LOGGER.info('\tDatarun IDs: %s', ', '.join(map(str, run_ids))) else: - logger.info('\tDatarun ID: %d', datarun.id) + LOGGER.info('\tDatarun ID: %d', datarun.id) - logger.info('\tHyperpartition selection strategy: %s', datarun.selector) - logger.info('\tParameter tuning strategy: %s', datarun.tuner) - logger.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type) + LOGGER.info('\tHyperpartition selection strategy: %s', datarun.selector) + LOGGER.info('\tParameter tuning strategy: %s', datarun.tuner) + LOGGER.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type) return run_ids or datarun.id From 2a923d9d668892a39f59aea87448e9d9a9c044cc Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 25 Apr 2019 19:30:31 +0200 Subject: [PATCH 11/44] Added extras on lint. 
--- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 529610a..a820509 100644 --- a/tox.ini +++ b/tox.ini @@ -20,6 +20,7 @@ commands = [testenv:lint] skipsdist = true +extras = dev commands = /usr/bin/env make lint From 28af5f4c8e6e726967ee70e19136c3ec32fd6582 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 26 Apr 2019 11:06:20 +0200 Subject: [PATCH 12/44] Added cov=atm --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 94a2957..e53722c 100644 --- a/Makefile +++ b/Makefile @@ -106,7 +106,7 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort .PHONY: test test: ## run tests quickly with the default Python - python -m pytest tests + python -m pytest --cov=atm .PHONY: test-all test-all: ## run tests on every Python version with tox From 02ab6450ecd469e0cf3a502ffc4cb034d3e740c7 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 26 Apr 2019 12:26:14 +0200 Subject: [PATCH 13/44] Added travis badge. --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7b9d2ea..42eff1f 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ [![CircleCI][circleci-img]][circleci-url] +[![Travis](https://travis-ci.org/HDI-Project/ATM.svg?branch=master)](https://travis-ci.org/HDI-Project/ATM) [![Coverage status][codecov-img]][codecov-url] [![Documentation][rtd-img]][rtd-url] From 4c6ec6a61a31a5c4f733fc6c03f5788e129a3e90 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 26 Apr 2019 15:27:28 +0200 Subject: [PATCH 14/44] Working on starting multiple clusters and atm as background service --- atm/api.py | 4 +-- atm/cli.py | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 86 insertions(+), 6 deletions(-) diff --git a/atm/api.py b/atm/api.py index e4e8f42..e09685d 100644 --- a/atm/api.py +++ b/atm/api.py @@ -12,9 +12,9 @@ def make_absolute(url): return url -def create_app(db): +def create_app(db, debug=False): app = Flask(__name__) - app.config['DEBUG'] = True + app.config['DEBUG'] = debug app.config['SQLALCHEMY_DATABASE_URI'] = make_absolute(db.engine.url) # Create the Flask-Restless API manager. 
diff --git a/atm/cli.py b/atm/cli.py index c67fa7f..fa61a7d 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -9,6 +9,10 @@ import time from multiprocessing import Pool, Process, Queue +import daemon +import psutil +from daemon.pidfile import PIDLockFile + from atm.api import create_app from atm.config import ( add_arguments_aws_s3, add_arguments_datarun, add_arguments_logging, add_arguments_sql, @@ -29,10 +33,25 @@ def _get_db(args): return Database(**db_args) +def _work(args): + db = _get_db(args) + run_conf, aws_conf, log_conf = load_config(**vars(args)) + + atm = ATM(db, run_conf, aws_conf, log_conf) + + atm.work( + datarun_ids=args.dataruns, + choose_randomly=False, + save_files=args.save_files, + cloud_mode=args.cloud_mode, + total_time=args.time, + wait=False + ) + def _serve(args): db = _get_db(args) - app = create_app(db) + app = create_app(db, False) app.run(host=args.host, port=args.port) @@ -88,16 +107,51 @@ def _worker_loop(args): queue.put(datarun.id) -def _start(args): +def _stop(args): + """Stop the current running process of ATM.""" + pid_path = args.pid + if not os.path.isabs(pid_path): + pid_path = os.path.join(os.getcwd(), args.pid) + + try: + with open(pid_path, 'r') as f: + pid = int(f.read()) + + process = psutil.Process(pid) + command = process.as_dict().get('cmdline') + if 'atm' in command and 'start' in command: + process.kill() + + print('ATM stopped successfully') + + except (FileNotFoundError, psutil.NoSuchProcess) as e: + print('ATM process not found, try different pid file?') + + +def _start_background(args): if args.server: LOGGER.info('Starting the REST API server') + process = Process(target=_serve, args=(args, )) process.daemon = True + process.start() _worker_loop(args) +def _start(args): + + pid_path = args.pid + if not os.path.isabs(pid_path): + pid_path = os.path.join(os.getcwd(), args.pid) + + pid_file = PIDLockFile(pid_path) + + with daemon.DaemonContext(pidfile=pid_file, working_directory=os.getcwd()): + _start_background(args) + + def _enter_data(args): db = _get_db(args) run_conf, aws_conf, log_conf = load_config(**vars(args)) @@ -144,6 +198,27 @@ def _get_parser(): enter_data.add_argument('--run-per-partition', default=False, action='store_true', help='if set, generate a new datarun for each hyperpartition') + # Worker + worker = subparsers.add_parser('worker', parents=[parent]) + worker.set_defaults(action=_work) + _add_common_arguments(worker) + worker.add_argument('--cloud-mode', action='store_true', default=False, + help='Whether to run this worker in cloud mode') + + worker.add_argument('--dataruns', help='Only train on dataruns with these ids', nargs='+') + worker.add_argument('--time', help='Number of seconds to run worker', type=int) + + worker.add_argument('--no-save', dest='save_files', action='store_false', + help="don't save models and metrics at all") + + # Server + server = subparsers.add_parser('server', parents=[parent]) + server.set_defaults(action=_serve) + _add_common_arguments(server) + server.add_argument('--host', help='IP to listen at') + server.add_argument('--port', help='Port to listen at', type=int) + server.add_argument('--debug', action='store_true', help='Start the server in debug mode.') + # Start start = subparsers.add_parser('start', parents=[parent]) start.set_defaults(action=_start) @@ -155,10 +230,15 @@ def _get_parser(): help="don't save models and metrics at all") start.add_argument('-w', '--workers', default=1, type=int, help='Number of workers') - start.add_argument('--server', action='store_true', - help='Also 
start the REST server') + start.add_argument('--server', action='store_true', help='Also start the REST server') start.add_argument('--host', help='IP to listen at') start.add_argument('--port', help='Port to listen at', type=int) + start.add_argument('--pid', help='PID file to use.', default='atm.pid') + + # Stop + stop = subparsers.add_parser('stop', parents=[parent]) + stop.set_defaults(action=_stop) + stop.add_argument('--pid', help='PID file to use.', default='atm.pid') # Make Config make_config = subparsers.add_parser('make_config', parents=[parent]) From 42ae066a61f42598d2263e211ebdf4991282be0c Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Fri, 26 Apr 2019 20:10:40 +0200 Subject: [PATCH 15/44] Update tox.ini --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index a820509..234d290 100644 --- a/tox.ini +++ b/tox.ini @@ -15,7 +15,7 @@ setenv = PYTHONPATH = {toxinidir} extras = tests commands = - /usr/bin/env make test + /usr/bin/env python setup.py test [testenv:lint] From 6b033798898774417721859aad602a01a0e6e56b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 28 Apr 2019 18:02:10 +0200 Subject: [PATCH 16/44] Fix travis build failure --- .travis.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.travis.yml b/.travis.yml index f3df730..039d7e5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,6 +6,11 @@ python: - 3.5 - 3.6 +# Fix travis failure when using boto3 on travis +# see https://github.com/travis-ci/travis-ci/issues/7940 +before_install: + - sudo rm -f /etc/boto.cfg + # Command to install dependencies install: pip install -U tox-travis codecov From 044acadae6ef6b6c110085d007360d52b1e7a5ee Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 28 Apr 2019 18:47:35 +0200 Subject: [PATCH 17/44] Ignore docs build folder --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index e1da78f..6a88783 100755 --- a/.gitignore +++ b/.gitignore @@ -92,7 +92,7 @@ instance/ .scrapy # Sphinx documentation -docs/build/ +docs/_build/ docs/modules.rst docs/atm.rst docs/atm.*.rst From f25d6364378ce4b73b6c90ecca07cbcf9b2b020a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 28 Apr 2019 22:05:37 +0200 Subject: [PATCH 18/44] Fix docs deploy from travis --- .travis.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index 039d7e5..3d2119e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,18 +1,13 @@ # Config file for automatic testing at travis-ci.org language: python -dist: xenial python: - 2.7 - 3.5 - 3.6 -# Fix travis failure when using boto3 on travis -# see https://github.com/travis-ci/travis-ci/issues/7940 -before_install: - - sudo rm -f /etc/boto.cfg - # Command to install dependencies -install: pip install -U tox-travis codecov +install: + - pip install -U tox-travis codecov google-compute-engine # Command to run tests script: tox From 946c1184655769ee97e2def5e7de3aef9b22845b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 28 Apr 2019 22:10:52 +0200 Subject: [PATCH 19/44] Fix travis build --- .travis.yml | 2 +- setup.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 3d2119e..c0cc0b3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -7,7 +7,7 @@ python: # Command to install dependencies install: - - pip install -U tox-travis codecov google-compute-engine + - pip install -U tox-travis codecov # Command to run tests 
script: tox diff --git a/setup.py b/setup.py index ef7929d..4f44958 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ 'pytest-runner>=3.0', 'pytest-xdist>=1.20.1', 'pytest>=3.2.3', + 'google-compute-engine==2.8.12', # required by travis ] development_requires = [ From 288927d954cb5c4761fafcc9ad1cbf1e89f50130 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Sun, 28 Apr 2019 23:01:57 +0200 Subject: [PATCH 20/44] Update links to docs --- README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 42eff1f..b14c472 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,8 @@ [![CircleCI][circleci-img]][circleci-url] -[![Travis](https://travis-ci.org/HDI-Project/ATM.svg?branch=master)](https://travis-ci.org/HDI-Project/ATM) +[![Travis][travis-img]][travis-url] [![Coverage status][codecov-img]][codecov-url] -[![Documentation][rtd-img]][rtd-url] [circleci-img]: https://circleci.com/gh/HDI-Project/ATM.svg?style=shield [circleci-url]: https://circleci.com/gh/HDI-Project/ATM @@ -18,14 +17,12 @@ [pypi-url]: https://pypi.python.org/pypi/atm [codecov-img]: https://codecov.io/gh/HDI-project/ATM/branch/master/graph/badge.svg [codecov-url]: https://codecov.io/gh/HDI-project/ATM -[rtd-img]: https://readthedocs.org/projects/atm/badge/?version=latest -[rtd-url]: http://atm.readthedocs.io/en/latest/ # ATM - Auto Tune Models - Free software: MIT license -- Documentation: http://atm.readthedocs.io/en/latest/ +- Documentation: https://hdi-project.github.io/ATM/ ATM is an open source software library under the From af7797888f58b47152706a4440f37a101e17cccb Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Mon, 29 Apr 2019 16:29:35 +0200 Subject: [PATCH 21/44] Updated requriements, working on processing. --- atm/cli.py | 128 ++++++++++++++++++++++++++++++++++++++++++----------- setup.py | 2 + 2 files changed, 103 insertions(+), 27 deletions(-) diff --git a/atm/cli.py b/atm/cli.py index fa61a7d..1e71120 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -1,17 +1,19 @@ # -*- coding: utf-8 -*- import argparse +import errno import glob import logging +import multiprocessing import os import shutil +import signal import socket import time -from multiprocessing import Pool, Process, Queue -import daemon import psutil -from daemon.pidfile import PIDLockFile +from daemon import DaemonContext +from lockfile.pidlockfile import PIDLockFile from atm.api import create_app from atm.config import ( @@ -91,9 +93,9 @@ def _process_datarun(args, queue): def _worker_loop(args): db = _get_db(args) - queue = Queue(1) + queue = multiprocessing.Queue(1) LOGGER.info('Starting %s worker processes', args.workers) - with Pool(args.workers, _process_datarun, (args, queue, )): + with multiprocessing.Pool(args.workers, _process_datarun, (args, queue, )): while True: datarun = _get_next_datarun(db) @@ -107,32 +109,98 @@ def _worker_loop(args): queue.put(datarun.id) -def _stop(args): - """Stop the current running process of ATM.""" - pid_path = args.pid +def _get_pid_path(pid): + """Returns abspath of the pid file which is stored on the cwd.""" + pid_path = pid + if not os.path.isabs(pid_path): - pid_path = os.path.join(os.getcwd(), args.pid) + pid_path = os.path.join(os.getcwd(), pid_path) - try: - with open(pid_path, 'r') as f: - pid = int(f.read()) + return pid_path - process = psutil.Process(pid) - command = process.as_dict().get('cmdline') - if 'atm' in command and 'start' in command: - process.kill() - print('ATM stopped successfully') +def _get_next(cmd_line, _position): + next_position = 
_position + 1 + + if len(cmd_line) >= next_position: + return cmd_line[next_position] + + +def _status_check(args, return_process=False): + """Check the status if there is an ATM server running.""" + pid_path = _get_pid_path(args.pid) + + pid_file = PIDLockFile(pid_path) + + if pid_file.is_locked(): + try: + pid = pid_file.read_pid() + process = psutil.Process(pid) + cmd_line = process.cmdline() + + if return_process: + return process + + workers = 1 # Default + server = False + host = '127.0.0.0' + port = '8000' + + for _position in range(len(cmd_line)): + if cmd_line[_position] == '-w' or cmd_line[_position] == ['--workers']: + workers = _get_next(cmd_line, _position) or 1 + + if cmd_line[_position] == '--server': + server = True + + if server: + if cmd_line[_position] == '--host': + host = _get_next(cmd_line, _position) + + if cmd_line[_position] == '--port': + port = _get_next(cmd_line, _position) + + if workers != 1: + print('ATM is currently runing with {} workers.'.format(workers)) - except (FileNotFoundError, psutil.NoSuchProcess) as e: - print('ATM process not found, try different pid file?') + else: + print('ATM is currently runing with 1 worker.') + + if server: + print('ATM Server is running at http://{}:{}'.format(host, port)) + + return True + + except psutil.NoSuchProcess: + print('ATM process not running for the indicated PID file.') + return False + + +def _status(args): + """Check if the current ATM process is runing.""" + if not _status_check(args): + print('ATM is not runing at the moment.') + + +def _stop(args): + """Stop the current running process of ATM.""" + process = _status_check(args, return_process=True) + + if process: + try: + process.terminate() + print('ATM process stopped correctly.') + + except psutil.: + time.sleep(3) + process.kill() def _start_background(args): if args.server: LOGGER.info('Starting the REST API server') - process = Process(target=_serve, args=(args, )) + process = multiprocessing.Process(target=_serve, args=(args, )) process.daemon = True process.start() @@ -141,15 +209,16 @@ def _start_background(args): def _start(args): + if not _status_check(args): + pid_path = _get_pid_path(args.pid) + pid_file = PIDLockFile(pid_path) - pid_path = args.pid - if not os.path.isabs(pid_path): - pid_path = os.path.join(os.getcwd(), args.pid) - - pid_file = PIDLockFile(pid_path) + context = DaemonContext() + context.pidfile = pid_file + context.working_directory = os.getcwd() - with daemon.DaemonContext(pidfile=pid_file, working_directory=os.getcwd()): - _start_background(args) + with context: + _start_background(args) def _enter_data(args): @@ -235,6 +304,11 @@ def _get_parser(): start.add_argument('--port', help='Port to listen at', type=int) start.add_argument('--pid', help='PID file to use.', default='atm.pid') + # Status + status = subparsers.add_parser('status', parents=[parent]) + status.set_defaults(action=_status) + status.add_argument('--pid', help='PID file to use.', default='atm.pid') + # Stop stop = subparsers.add_parser('stop', parents=[parent]) stop.set_defaults(action=_stop) diff --git a/setup.py b/setup.py index ef7929d..9ae2f65 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,8 @@ 'mysqlclient>=1.2', 'numpy>=1.13.1', 'pandas>=0.22.0', + 'psutil>=5.6.1', + 'python-daemon>=2.2.3', 'pyyaml>=3.12', 'requests>=2.18.4', 'scikit-learn>=0.18.2', From aa50db6dfcd710e58ca7273f1237f91aa44f7077 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 29 Apr 2019 18:02:49 +0200 Subject: [PATCH 22/44] Fix start/stop calls --- .gitignore | 3 ++ 
atm/cli.py | 137 +++++++++++++++++++++++++++++------------------------ 2 files changed, 78 insertions(+), 62 deletions(-) diff --git a/.gitignore b/.gitignore index e1da78f..fc0707b 100755 --- a/.gitignore +++ b/.gitignore @@ -136,3 +136,6 @@ ENV/ # mypy .mypy_cache/ + +# pid +*.pid diff --git a/atm/cli.py b/atm/cli.py index 1e71120..097bbb5 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -119,81 +119,49 @@ def _get_pid_path(pid): return pid_path -def _get_next(cmd_line, _position): - next_position = _position + 1 - - if len(cmd_line) >= next_position: - return cmd_line[next_position] - - -def _status_check(args, return_process=False): - """Check the status if there is an ATM server running.""" - pid_path = _get_pid_path(args.pid) - +def _get_atm_process(pid_path): pid_file = PIDLockFile(pid_path) if pid_file.is_locked(): + pid = pid_file.read_pid() + try: - pid = pid_file.read_pid() process = psutil.Process(pid) - cmd_line = process.cmdline() - - if return_process: + if process.name() == 'atm': return process - - workers = 1 # Default - server = False - host = '127.0.0.0' - port = '8000' - - for _position in range(len(cmd_line)): - if cmd_line[_position] == '-w' or cmd_line[_position] == ['--workers']: - workers = _get_next(cmd_line, _position) or 1 - - if cmd_line[_position] == '--server': - server = True - - if server: - if cmd_line[_position] == '--host': - host = _get_next(cmd_line, _position) - - if cmd_line[_position] == '--port': - port = _get_next(cmd_line, _position) - - if workers != 1: - print('ATM is currently runing with {} workers.'.format(workers)) - else: - print('ATM is currently runing with 1 worker.') - - if server: - print('ATM Server is running at http://{}:{}'.format(host, port)) - - return True + pid_file.break_lock() except psutil.NoSuchProcess: - print('ATM process not running for the indicated PID file.') - return False + pid_file.break_lock() def _status(args): """Check if the current ATM process is runing.""" - if not _status_check(args): - print('ATM is not runing at the moment.') - -def _stop(args): - """Stop the current running process of ATM.""" - process = _status_check(args, return_process=True) + pid_path = _get_pid_path(args.pid) + process = _get_atm_process(pid_path) if process: - try: - process.terminate() - print('ATM process stopped correctly.') + workers = 0 + addr = None + for child in process.children(): + connections = child.connections() + if connections: + connection = connections[0] + addr = connection.laddr + + else: + workers += 1 + + s = 's' if workers > 1 else '' + print('ATM is running with {} worker{}'.format(workers, s)) - except psutil.: - time.sleep(3) - process.kill() + if addr: + print('ATM REST server is listening on http://{}:{}'.format(addr.ip, addr.port)) + + else: + print('ATM is not runing.') def _start_background(args): @@ -205,12 +173,18 @@ def _start_background(args): process.start() - _worker_loop(args) + if args.workers: + _worker_loop(args) def _start(args): - if not _status_check(args): - pid_path = _get_pid_path(args.pid) + pid_path = _get_pid_path(args.pid) + process = _get_atm_process(pid_path) + + if process: + print('ATM is already running!') + + else: pid_file = PIDLockFile(pid_path) context = DaemonContext() @@ -218,9 +192,44 @@ def _start(args): context.working_directory = os.getcwd() with context: + # Set up default logs + if not args.logfile: + _logging_setup(args.verbose, 'atm.log') + + print('Starting ATM') _start_background(args) +def _stop(args): + """Stop the current running process of ATM.""" + pid_path = 
_get_pid_path(args.pid) + process = _get_atm_process(pid_path) + + if process: + process.terminate() + + for _ in range(args.timeout): + if process.is_running(): + time.sleep(1) + else: + break + + if process.is_running(): + print('ATM was not able to stop after {} seconds.'.format(args.timeout)) + if args.force: + print('Killing it.') + process.kill() + else: + print('Use --force to kill it.') + + else: + print('ATM stopped correctly.') + + else: + print('ATM is not running.') + + + def _enter_data(args): db = _get_db(args) run_conf, aws_conf, log_conf = load_config(**vars(args)) @@ -313,6 +322,10 @@ def _get_parser(): stop = subparsers.add_parser('stop', parents=[parent]) stop.set_defaults(action=_stop) stop.add_argument('--pid', help='PID file to use.', default='atm.pid') + stop.add_argument('-t', '--timeout', default=5, type=int, + help='Seconds to wait before killing the process.') + stop.add_argument('-f', '--force', action='store_true', + help='Kill the process if it does not terminate gracefully.') # Make Config make_config = subparsers.add_parser('make_config', parents=[parent]) @@ -323,7 +336,7 @@ def _get_parser(): def _logging_setup(verbosity=1, logfile=None): logger = logging.getLogger() - log_level = (3 - verbosity) * 10 + log_level = (2 - verbosity) * 10 fmt = '%(asctime)s - %(process)d - %(levelname)s - %(module)s - %(message)s' formatter = logging.Formatter(fmt) logger.setLevel(log_level) From 13fd6ec34904cc6c6748a8e30779b9e411e32aaf Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Tue, 30 Apr 2019 11:19:53 +0200 Subject: [PATCH 23/44] Fix lint / modified tests to adapt to new ATM class. --- atm/cli.py | 3 - atm/enter_data.py | 161 ------------------- tests/{test_enter_data.py => test_models.py} | 20 ++- tests/test_worker.py | 5 +- 4 files changed, 18 insertions(+), 171 deletions(-) delete mode 100644 atm/enter_data.py rename tests/{test_enter_data.py => test_models.py} (91%) diff --git a/atm/cli.py b/atm/cli.py index 097bbb5..8d44521 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -1,13 +1,11 @@ # -*- coding: utf-8 -*- import argparse -import errno import glob import logging import multiprocessing import os import shutil -import signal import socket import time @@ -229,7 +227,6 @@ def _stop(args): print('ATM is not running.') - def _enter_data(args): db = _get_db(args) run_conf, aws_conf, log_conf = load_config(**vars(args)) diff --git a/atm/enter_data.py b/atm/enter_data.py deleted file mode 100644 index af5bb10..0000000 --- a/atm/enter_data.py +++ /dev/null @@ -1,161 +0,0 @@ -from __future__ import absolute_import, division, unicode_literals - -import logging -import os -from builtins import map -from datetime import datetime, timedelta - -from past.utils import old_div - -from atm.constants import TIME_FMT, PartitionStatus -from atm.database import Database -from atm.encoder import MetaData -from atm.method import Method -from atm.utilities import download_data - -# load the library-wide logger -logger = logging.getLogger('atm') - - -def create_dataset(db, run_config, aws_config=None): - """ - Create a dataset and add it to the ModelHub database. - - db: initialized Database object - run_config: RunConfig object describing the dataset to create - aws_config: optional. AWS credentials for downloading data from S3. 
- """ - # download data to the local filesystem to extract metadata - train_local, test_local = download_data(run_config.train_path, - run_config.test_path, - aws_config) - - # create the name of the dataset from the path to the data - name = os.path.basename(train_local) - name = name.replace("_train.csv", "").replace(".csv", "") - - # process the data into the form ATM needs and save it to disk - meta = MetaData(run_config.class_column, train_local, test_local) - - # enter dataset into database - dataset = db.create_dataset(name=name, - description=run_config.data_description, - train_path=run_config.train_path, - test_path=run_config.test_path, - class_column=run_config.class_column, - n_examples=meta.n_examples, - k_classes=meta.k_classes, - d_features=meta.d_features, - majority=meta.majority, - size_kb=old_div(meta.size, 1000)) - return dataset - - -def create_datarun(db, dataset, run_config): - """ - Given a config, creates a set of dataruns for the config and enters them into - the database. Returns the ID of the created datarun. - - db: initialized Database object - dataset: Dataset SQLAlchemy ORM object - run_config: RunConfig object describing the datarun to create - """ - # describe the datarun by its tuner and selector - run_description = '__'.join([run_config.tuner, run_config.selector]) - - # set the deadline, if applicable - deadline = run_config.deadline - if deadline: - deadline = datetime.strptime(deadline, TIME_FMT) - # this overrides the otherwise configured budget_type - # TODO: why not walltime and classifiers budget simultaneously? - run_config.budget_type = 'walltime' - elif run_config.budget_type == 'walltime': - deadline = datetime.now() + timedelta(minutes=run_config.budget) - - target = run_config.score_target + '_judgment_metric' - datarun = db.create_datarun(dataset_id=dataset.id, - description=run_description, - tuner=run_config.tuner, - selector=run_config.selector, - gridding=run_config.gridding, - priority=run_config.priority, - budget_type=run_config.budget_type, - budget=run_config.budget, - deadline=deadline, - metric=run_config.metric, - score_target=target, - k_window=run_config.k_window, - r_minimum=run_config.r_minimum) - return datarun - - -def enter_data(sql_config, run_config, aws_config=None, - run_per_partition=False): - """ - Generate a datarun, including a dataset if necessary. - - sql_config: Object with all attributes necessary to initialize a Database. - run_config: all attributes necessary to initialize a Datarun, including - Dataset info if the dataset has not already been created. - aws_config: all attributes necessary to connect to an S3 bucket. - - Returns: ID of the generated datarun - """ - # connect to the database - db = Database(sql_config.dialect, sql_config.database, sql_config.username, - sql_config.password, sql_config.host, sql_config.port, - sql_config.query) - - # if the user has provided a dataset id, use that. Otherwise, create a new - # dataset based on the arguments we were passed. 
- if run_config.dataset_id is None: - dataset = create_dataset(db, run_config, aws_config=aws_config) - run_config.dataset_id = dataset.id - else: - dataset = db.get_dataset(run_config.dataset_id) - - method_parts = {} - for m in run_config.methods: - # enumerate all combinations of categorical variables for this method - method = Method(m) - method_parts[m] = method.get_hyperpartitions() - logger.info('method %s has %d hyperpartitions' % - (m, len(method_parts[m]))) - - # create hyperpartitions and datarun(s) - run_ids = [] - if not run_per_partition: - logger.debug('saving datarun...') - datarun = create_datarun(db, dataset, run_config) - - logger.debug('saving hyperpartions...') - for method, parts in list(method_parts.items()): - for part in parts: - # if necessary, create a new datarun for each hyperpartition. - # This setting is useful for debugging. - if run_per_partition: - datarun = create_datarun(db, dataset, run_config) - run_ids.append(datarun.id) - - # create a new hyperpartition in the database - db.create_hyperpartition(datarun_id=datarun.id, - method=method, - tunables=part.tunables, - constants=part.constants, - categoricals=part.categoricals, - status=PartitionStatus.INCOMPLETE) - - logger.info('Data entry complete. Summary:') - logger.info('\tDataset ID: %d' % dataset.id) - logger.info('\tTraining data: %s' % dataset.train_path) - logger.info('\tTest data: %s' % (dataset.test_path or 'None')) - if run_per_partition: - logger.info('\tDatarun IDs: %s' % ', '.join(map(str, run_ids))) - else: - logger.info('\tDatarun ID: %d' % datarun.id) - logger.info('\tHyperpartition selection strategy: %s' % datarun.selector) - logger.info('\tParameter tuning strategy: %s' % datarun.tuner) - logger.info('\tBudget: %d (%s)' % (datarun.budget, datarun.budget_type)) - - return run_ids or datarun.id diff --git a/tests/test_enter_data.py b/tests/test_models.py similarity index 91% rename from tests/test_enter_data.py rename to tests/test_models.py index 7dfb303..da988a0 100644 --- a/tests/test_enter_data.py +++ b/tests/test_models.py @@ -5,7 +5,7 @@ from atm import PROJECT_ROOT from atm.config import RunConfig, SQLConfig from atm.database import Database, db_session -from atm.enter_data import create_dataset, enter_data +from atm.models import ATM from atm.utilities import get_local_data_path DB_PATH = '/tmp/atm.db' @@ -64,7 +64,10 @@ def test_create_dataset(db): test_path=test_url, data_description='test', class_column='class') - dataset = create_dataset(db, run_conf) + + atm = ATM(db, run_conf, None, None) + + dataset = atm.create_dataset() dataset = db.get_dataset(dataset.id) assert os.path.exists(train_path_local) @@ -85,9 +88,11 @@ def test_enter_data_by_methods(dataset): db = Database(**vars(sql_conf)) run_conf = RunConfig(dataset_id=dataset.id) + atm = ATM(db, run_conf, None, None) + for method, n_parts in METHOD_HYPERPARTS.items(): run_conf.methods = [method] - run_id = enter_data(sql_conf, run_conf) + run_id = atm.enter_data() assert db.get_datarun(run_id) with db_session(db): @@ -102,7 +107,9 @@ def test_enter_data_all(dataset): run_conf = RunConfig(dataset_id=dataset.id, methods=METHOD_HYPERPARTS.keys()) - run_id = enter_data(sql_conf, run_conf) + atm = ATM(db, run_conf, None, None) + + run_id = atm.enter_data() with db_session(db): run = db.get_datarun(run_id) @@ -113,9 +120,12 @@ def test_enter_data_all(dataset): def test_run_per_partition(dataset): sql_conf = SQLConfig(database=DB_PATH) db = Database(**vars(sql_conf)) + run_conf = RunConfig(dataset_id=dataset.id, 
methods=['logreg']) - run_ids = enter_data(sql_conf, run_conf, run_per_partition=True) + atm = ATM(db, run_conf, None, None) + + run_ids = atm.enter_data(run_per_partition=True) with db_session(db): runs = [] diff --git a/tests/test_worker.py b/tests/test_worker.py index 2bc5e30..ca1bb8a 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -15,7 +15,7 @@ from atm.config import LogConfig, RunConfig, SQLConfig from atm.constants import METRICS_BINARY, TIME_FMT from atm.database import Database, db_session -from atm.enter_data import enter_data +from atm.models import ATM from atm.utilities import download_data, load_metrics, load_model from atm.worker import ClassifierError, Worker @@ -109,8 +109,9 @@ def get_new_worker(**kwargs): kwargs['methods'] = kwargs.get('methods', ['logreg', 'dt']) sql_conf = SQLConfig(database=DB_PATH) run_conf = RunConfig(**kwargs) - run_id = enter_data(sql_conf, run_conf) db = Database(**vars(sql_conf)) + atm = ATM(db, run_conf, None, None) + run_id = atm.enter_data() datarun = db.get_datarun(run_id) return Worker(db, datarun) From 01fe4a376ce621553379875821aa5cc414db94d1 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Tue, 30 Apr 2019 15:03:38 +0200 Subject: [PATCH 24/44] Added API documentation to the github pages. Update on API.md about new daemon process. --- API.md | 179 +++++++++++++++++++++++++++++++++++++++++++++++++ atm/cli.py | 13 +++- docs/api.rst | 1 + docs/index.rst | 1 + 4 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 docs/api.rst diff --git a/API.md b/API.md index c679dc9..a78976c 100644 --- a/API.md +++ b/API.md @@ -278,3 +278,182 @@ And the output will be (note that some parts have been cut): "total_pages": 1 } ``` + + +## Starting the REST API Server and Workers in daemon + +**ATM** comes with the possibility to start a daemon process (in background) with the workers +and the REST API server. This will allow you to update dynamicly the database while new dataruns +are created for the workers. + +### 1. Start the ATM process + +By default **ATM** launches one worker in background if we just run the following command: + +```bash +atm start +``` + +After starting this process, we can type: + +```bash +atm status +``` + +And an output like this will be displayed in our console: + +``` +ATM is running with 1 worker +``` + +In order to stop this process just run: + +```bash +atm stop +``` + +An output like this should be printed in the console: + +``` +ATM stopped correctly. +``` + +### 2. Start the ATM process with more than one worker + +If we would like to launch more than one worker, we can use the argument `--workers WORKERS` or +`-w WORKERS`. + +```bash +atm start -w 4 +``` + +**Bear in mind**, if the `atm` process is allready running, a message indicating so will be +displayed when trying to start a new process. + +Then if you check the `status` of `atm`: + +```bash +atm status +``` + +The expected output is: + +``` +ATM is running with 4 workers +``` + + +### 3. Start the ATM process with the REST API server + +The `atm start` command accepts as an argument `--server` which will launch alongside the workers +the same REST API server as described before. + +```bash +atm start --server +``` + +If you run `atm status` to check it's status the expected output should be as follows: + +``` +ATM is running with 1 worker +ATM REST server is listening on http://127.0.0.1:5000 +``` + +### 4. Additional arguments for ATM Start + +* `--sql-config SQL_CONFIG` Path to yaml SQL config file. 
+* `--sql-dialect {sqlite,mysql}` Dialect of SQL to use. +* `--sql-database SQL_DATABASE` Name of, or path to, SQL database. +* `--sql-username SQL_USERNAME` Username for SQL database. +* `--sql-password SQL_PASSWORD` Password for SQL database. + +* `--sql-host SQL_HOST` Hostname for database machine. +* `--sql-port SQL_PORT` Port used to connect to database. + +* `--sql-query SQL_QUERY` Specify extra login details. +* `--aws-config AWS_CONFIG` path to yaml AWS config file. +* `--aws-access-key AWS_ACCESS_KEY` AWS access key. +* `--aws-secret-key AWS_SECRET_KEY` AWS secret key. +* `--aws-s3-bucket AWS_S3_BUCKET` AWS S3 bucket to store data. +* `--aws-s3-folder AWS_S3_FOLDER` Folder in AWS S3 bucket in which to store data. + +* `--log-config LOG_CONFIG` path to yaml logging config file. +* `--model-dir MODEL_DIR` Directory where computed models will be saved. +* `--metric-dir METRIC_DIR` Directory where model metrics will be saved. +* `--log-dir LOG_DIR` Directory where logs will be saved. + +* `--verbose-metrics` If set, compute full ROC and PR curves and per-label +metrics for each classifier. + +* `--log-level-file` {critical,error,warning,info,debug,none} minimum log level to write to the +log file. + +* `--log-level-stdout` {critical,error,warning,info,debug,none} +minimum log level to write to stdout. + +* `--cloud-mode` Wheter to run this worker/s in cloud mode. +* `--no-save` Do not save models and metrics at all. +* `-w WORKERS` `--workers WORKERS` Number of workers. +* `--server` Also start the REST server. +* `--host HOST` IP to listen at. +* `--port PORT` Port to listen at. +* `--pid PID` PID file to use (we can use a different one in order to launch more than one process. + + +### 4. Stop the ATM process + +As we saw before, by runing the command `atm stop` we will `terminate` the ATM process. However +this command accepts a few arguments in order to control this behaviour: + +* `-t TIMEOUT`, `--timeout TIMEOUT`, time to wait in order to check if the process has been +terminated. + +* `-f`, `--force`, Kill the process if it does not terminate gracefully. + +### 5. Starting multiple ATM processes + +**ATM** also has the posibility to launch more than one process. In order to do so, we use a `pid` +file. + +By default, the `pid` file used by **ATM** is called `atm.pid`, however, you can change this name +by adding the argument `--pid` when starting **ATM**. + +For example, we will start our ATM with the default values (1 worker and `atm.pid`): + +```bash +atm start +``` + +If we run the status, this will display the following information: + +``` +ATM is running with 1 worker +``` + +Now if we would like to wake more workers we can run: + +```bash +atm start --workers 4 --pid additional_workers.pid +``` + +In order to run the `atm status` for this `pid` add it as argument to it: + +```bash +atm status --pid additional_workers.pid +``` + +The output of this command will be: + +``` +ATM is running with 4 workers +``` + +As you can see you will have now 5 workers running as the `SQL` configuration is the same and this +will be pointing to that database. 
+ +In order to stop the `additional_workers` process, we run `atm stop` with the `pid` file as +argument: + +```bash +atm stop --pid additional_workers.pid +``` diff --git a/atm/cli.py b/atm/cli.py index 8d44521..3c2a63f 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -25,6 +25,7 @@ def _get_db(args): + """Returns an instance of Database with the given args.""" db_args = { k[4:]: v for k, v in vars(args).items() @@ -34,6 +35,7 @@ def _get_db(args): def _work(args): + """Creates a single worker on the current terminal / window.""" db = _get_db(args) run_conf, aws_conf, log_conf = load_config(**vars(args)) @@ -50,12 +52,14 @@ def _work(args): def _serve(args): + """Launch the ATM API with the given host / port.""" db = _get_db(args) app = create_app(db, False) app.run(host=args.host, port=args.port) def _get_next_datarun(db): + """Get the following datarun with the max priority.""" dataruns = db.get_dataruns(ignore_complete=True) if dataruns: max_priority = max([datarun.priority for datarun in dataruns]) @@ -64,6 +68,7 @@ def _get_next_datarun(db): def _process_datarun(args, queue): + """Process the datarun with the worker.""" run_conf, aws_conf, log_conf = load_config(**vars(args)) db = _get_db(args) @@ -89,6 +94,9 @@ def _process_datarun(args, queue): def _worker_loop(args): + """We create a multiprocessing Queue and then a pool with the number of workers specified + by the args which stay on a loop listening for new entries inside the database. + """ db = _get_db(args) queue = multiprocessing.Queue(1) @@ -118,6 +126,7 @@ def _get_pid_path(pid): def _get_atm_process(pid_path): + """Return `psutil.Process` of the `pid` file. If the pidfile is stale it will release it.""" pid_file = PIDLockFile(pid_path) if pid_file.is_locked(): @@ -163,6 +172,7 @@ def _status(args): def _start_background(args): + """Launches the server/worker in daemon process.""" if args.server: LOGGER.info('Starting the REST API server') @@ -176,6 +186,7 @@ def _start_background(args): def _start(args): + """Create a new process of ATM pointing the process to a certain `pid` file.""" pid_path = _get_pid_path(args.pid) process = _get_atm_process(pid_path) @@ -333,7 +344,7 @@ def _get_parser(): def _logging_setup(verbosity=1, logfile=None): logger = logging.getLogger() - log_level = (2 - verbosity) * 10 + log_level = (3 - verbosity) * 10 fmt = '%(asctime)s - %(process)d - %(levelname)s - %(module)s - %(message)s' formatter = logging.Formatter(fmt) logger.setLevel(log_level) diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..7040425 --- /dev/null +++ b/docs/api.rst @@ -0,0 +1 @@ +.. mdinclude:: ../API.md diff --git a/docs/index.rst b/docs/index.rst index fb99d67..78d2f93 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -15,6 +15,7 @@ Contents: setup quickstart database + api contributing add_method add_to_btb From cbf0493d1761539b1b2465ee294d2846729eb728 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Tue, 30 Apr 2019 18:26:53 +0200 Subject: [PATCH 25/44] API update and fix docs. 
---
 API.md     | 226 +++++++++++++++--------------------------------
 Makefile   |   6 +-
 atm/cli.py |   7 +-
 3 files changed, 70 insertions(+), 169 deletions(-)

diff --git a/API.md b/API.md
index a78976c..202a0ad 100644
--- a/API.md
+++ b/API.md
@@ -11,36 +11,48 @@ In order to start a REST API server, after installing ATM open a terminal, activ
 virtualenv, and execute this command:
 
 ```bash
-atm server
+atm start
 ```
-
-An output similar to this one should apear in the terminal:
+
+This will start the **ATM** server as a background service. The REST server will be listening on
+port 5000 of your machine, and if you point your browser at http://127.0.0.1:5000/, you will see
+the documentation website that shows information about all the REST operations allowed by the API.
+
+Optionally, the `--port <port>` can be added to modify the port which the server listens at:
 
 ```bash
- * Serving Flask app "api.setup" (lazy loading)
- * Environment: production
-   WARNING: Do not use the development server in a production environment.
-   Use a production WSGI server instead.
- * Debug mode: on
- * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
- * Restarting with stat
- * Debugger is active!
- * Debugger PIN: 150-127-826
+atm start --port 1234
 ```
 
-After this, the REST server will be listening at the port 5000 of you machine, and if you
-point your browser at http://127.0.0.1:5000/, you will see the documentation
-website that shows information about all the REST operations allowed by the API.
+If you would like to see the status of the server process you can run:
 
-Optionally, the `--port <port>` can be added to modify the port which the server listents at:
+```bash
+atm status
+```
+
+An output similar to this one will appear:
+
+```bash
+ATM is running with 1 worker
+ATM REST server is listening on http://127.0.0.1:5000
+```
+
+In order to stop the server you can run the following command:
 
 ```bash
-atm server --port 1234
+atm stop
 ```
 
-In order to stop the server you can press Ctrl+c, but for now
-you can keep it running and head to the next section.
+Notice that `atm start` will start one worker by default. If you would like to launch more than one,
+you can do so by adding the argument `--workers <number of workers>`.
+```bash
+atm start --workers 4
+```
+
+For more detailed options you can run `atm start --help` to obtain a list of the arguments
+that are accepted.
 
 ## Quickstart
 
@@ -56,13 +68,15 @@ Before proceeding any further, please make sure the have already populated your
 at least one model tuning process.
 
 An easy way to do this is to follow the quickstart from the ATM [README.md](README.md) file,
-which means having run these two commands:
+which means having run this command:
 
 ```
 atm enter_data
-atm worker
 ```
 
+The workers that you started before will process the data that has been inserted and will populate
+the database.
+
 ### 2. REST Models
 
 Once the database is populated, you can use the REST API to explore the following 4 models:
@@ -280,180 +294,66 @@ And the output will be (note that some parts have been cut):
 ```
 
-
-## Starting the REST API Server and Workers in daemon
-
-**ATM** comes with the possibility to start a daemon process (in background) with the workers
-and the REST API server. This will allow you to update dynamicly the database while new dataruns
-are created for the workers.
-
-### 1. 
Start the ATM process - -By default **ATM** launches one worker in background if we just run the following command: - -```bash -atm start -``` - -After starting this process, we can type: - -```bash -atm status -``` +## Additional information -And an output like this will be displayed in our console: +### Start additional process with different pid file -``` -ATM is running with 1 worker -``` +If you would like to run more workers or you would like to launch a second **ATM** process, you can +do so by specifying a different `PID` file. -In order to stop this process just run: +For example: ```bash -atm stop -``` - -An output like this should be printed in the console: - +atm start --no-server -w 4 --pid additional_workers.pid ``` -ATM stopped correctly. -``` - -### 2. Start the ATM process with more than one worker -If we would like to launch more than one worker, we can use the argument `--workers WORKERS` or -`-w WORKERS`. +To check the status of this process we have to run: ```bash -atm start -w 4 +atm status --pid additional_workers.pid ``` -**Bear in mind**, if the `atm` process is allready running, a message indicating so will be -displayed when trying to start a new process. - -Then if you check the `status` of `atm`: +This will print an output like this: ```bash -atm status -``` - -The expected output is: - -``` ATM is running with 4 workers ``` +### Stop the ATM process -### 3. Start the ATM process with the REST API server - -The `atm start` command accepts as an argument `--server` which will launch alongside the workers -the same REST API server as described before. - -```bash -atm start --server -``` - -If you run `atm status` to check it's status the expected output should be as follows: - -``` -ATM is running with 1 worker -ATM REST server is listening on http://127.0.0.1:5000 -``` - -### 4. Additional arguments for ATM Start - -* `--sql-config SQL_CONFIG` Path to yaml SQL config file. -* `--sql-dialect {sqlite,mysql}` Dialect of SQL to use. -* `--sql-database SQL_DATABASE` Name of, or path to, SQL database. -* `--sql-username SQL_USERNAME` Username for SQL database. -* `--sql-password SQL_PASSWORD` Password for SQL database. - -* `--sql-host SQL_HOST` Hostname for database machine. -* `--sql-port SQL_PORT` Port used to connect to database. - -* `--sql-query SQL_QUERY` Specify extra login details. -* `--aws-config AWS_CONFIG` path to yaml AWS config file. -* `--aws-access-key AWS_ACCESS_KEY` AWS access key. -* `--aws-secret-key AWS_SECRET_KEY` AWS secret key. -* `--aws-s3-bucket AWS_S3_BUCKET` AWS S3 bucket to store data. -* `--aws-s3-folder AWS_S3_FOLDER` Folder in AWS S3 bucket in which to store data. - -* `--log-config LOG_CONFIG` path to yaml logging config file. -* `--model-dir MODEL_DIR` Directory where computed models will be saved. -* `--metric-dir METRIC_DIR` Directory where model metrics will be saved. -* `--log-dir LOG_DIR` Directory where logs will be saved. - -* `--verbose-metrics` If set, compute full ROC and PR curves and per-label -metrics for each classifier. - -* `--log-level-file` {critical,error,warning,info,debug,none} minimum log level to write to the -log file. - -* `--log-level-stdout` {critical,error,warning,info,debug,none} -minimum log level to write to stdout. - -* `--cloud-mode` Wheter to run this worker/s in cloud mode. -* `--no-save` Do not save models and metrics at all. -* `-w WORKERS` `--workers WORKERS` Number of workers. -* `--server` Also start the REST server. -* `--host HOST` IP to listen at. -* `--port PORT` Port to listen at. 
-* `--pid PID` PID file to use (we can use a different one in order to launch more than one process.
-
-
-### 4. Stop the ATM process
-
-As we saw before, by runing the command `atm stop` we will `terminate` the ATM process. However
+As we saw before, by running the command `atm stop` you will `terminate` the ATM process. However
 this command accepts a few arguments in order to control this behaviour:
 
 * `-t TIMEOUT`, `--timeout TIMEOUT`, time to wait in order to check if the process has been
 terminated.
 
 * `-f`, `--force`, Kill the process if it does not terminate gracefully.
+* `--pid PIDFILE`, PID file to use
 
-### 5. Starting multiple ATM processes
-
-**ATM** also has the posibility to launch more than one process. In order to do so, we use a `pid`
-file.
-
-By default, the `pid` file used by **ATM** is called `atm.pid`, however, you can change this name
-by adding the argument `--pid` when starting **ATM**.
-
-For example, we will start our ATM with the default values (1 worker and `atm.pid`):
-
-```bash
-atm start
-```
-
-If we run the status, this will display the following information:
-
-```
-ATM is running with 1 worker
-```
+### Start the ATM REST API server in foreground
 
-Now if we would like to wake more workers we can run:
+If you would like to monitor the server for debugging purposes, you can do so by running
+the following command:
 
 ```bash
-atm start --workers 4 --pid additional_workers.pid
+atm server
 ```
 
-In order to run the `atm status` for this `pid` add it as argument to it:
+An output similar to this one should appear in the terminal:
 
 ```bash
-atm status --pid additional_workers.pid
-```
-
-The output of this command will be:
-
-```
-ATM is running with 4 workers
+ * Serving Flask app "api.setup" (lazy loading)
+ * Environment: production
+   WARNING: Do not use the development server in a production environment.
+   Use a production WSGI server instead.
+ * Debug mode: on
+ * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
+ * Restarting with stat
+ * Debugger is active!
+ * Debugger PIN: 150-127-826
 ```
 
-As you can see you will have now 5 workers running as the `SQL` configuration is the same and this
-will be pointing to that database.
+For additional arguments run `atm server --help`
 
-In order to stop the `additional_workers` process, we run `atm stop` with the `pid` file as
-argument:
-
-```bash
-atm stop --pid additional_workers.pid
-```
+**Note** that this command will not launch any `workers` process. In order to launch a foreground
+worker you have to do so by running `atm worker`.
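The REST endpoints described above accept plain HTTP requests, so once the server is running they
can be exercised with any HTTP client. The `curl` calls below are a minimal sketch: they assume the
default host and port, and that the models are exposed under their table names (`datasets`,
`dataruns`, `classifiers`), following the `/api/<tablename>` convention mentioned earlier; the
exact route names may differ in your setup.

```bash
# List every dataset registered in the ModelHub database
curl http://127.0.0.1:5000/api/datasets

# Fetch a single datarun by its id
curl http://127.0.0.1:5000/api/dataruns/1

# Responses are paginated; request a specific page of classifiers
curl "http://127.0.0.1:5000/api/classifiers?page=2"
```

Since these are simple GET requests, the same URLs can also be opened directly in the browser.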
diff --git a/Makefile b/Makefile index e53722c..9f972ef 100644 --- a/Makefile +++ b/Makefile @@ -48,9 +48,9 @@ clean-pyc: ## remove Python file artifacts .PHONY: clean-docs clean-docs: ## remove previously built docs - rm -rf docs/build - rm -f docs/atm.rst - rm -f docs/atm.*.rst + rm -rf docs/_build + rm -f docs/api/atm.rst + rm -f docs/api/atm.*.rst rm -f docs/modules.rst $(MAKE) -C docs clean diff --git a/atm/cli.py b/atm/cli.py index 3c2a63f..2f3a901 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -54,7 +54,7 @@ def _work(args): def _serve(args): """Launch the ATM API with the given host / port.""" db = _get_db(args) - app = create_app(db, False) + app = create_app(db, args.debug) app.run(host=args.host, port=args.port) @@ -173,7 +173,7 @@ def _status(args): def _start_background(args): """Launches the server/worker in daemon process.""" - if args.server: + if not args.no_server: LOGGER.info('Starting the REST API server') process = multiprocessing.Process(target=_serve, args=(args, )) @@ -316,10 +316,11 @@ def _get_parser(): help="don't save models and metrics at all") start.add_argument('-w', '--workers', default=1, type=int, help='Number of workers') - start.add_argument('--server', action='store_true', help='Also start the REST server') + start.add_argument('--no-server', action='store_true', help='Do not start the REST server') start.add_argument('--host', help='IP to listen at') start.add_argument('--port', help='Port to listen at', type=int) start.add_argument('--pid', help='PID file to use.', default='atm.pid') + start.add_argument('--debug', action='store_true', help='Start the server in debug mode.') # Status status = subparsers.add_parser('status', parents=[parent]) From 6adcba7396cb31b5aa36062fa87152b472621148 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Tue, 30 Apr 2019 19:06:45 +0200 Subject: [PATCH 26/44] Added restart option. --- API.md | 16 ++++++++++++++++ atm/cli.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/API.md b/API.md index 202a0ad..ba97adf 100644 --- a/API.md +++ b/API.md @@ -319,6 +319,22 @@ This will print an output like this: ATM is running with 4 workers ``` +### Restart the ATM process + +If you have an **ATM** process running and you would like to restart it and add more workers to it +or maybe change the port on which is running, you can achieve so with the `atm restart`: + +```bash +atm restart +``` + +This command will restart the server with the default values, so if you would like to use other +options you can run `--help` to see the accepted arguments: + +```bash +atm restart --help +``` + ### Stop the ATM process As we saw before, by runing the command `atm stop` you will `terminate` the ATM process. 
However diff --git a/atm/cli.py b/atm/cli.py index 2f3a901..acec0b8 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -209,6 +209,12 @@ def _start(args): _start_background(args) +def _restart(args): + if _stop(args): + time.sleep(1) + _start(args) + + def _stop(args): """Stop the current running process of ATM.""" pid_path = _get_pid_path(args.pid) @@ -228,11 +234,14 @@ def _stop(args): if args.force: print('Killing it.') process.kill() + return True + else: print('Use --force to kill it.') else: print('ATM stopped correctly.') + return True else: print('ATM is not running.') @@ -327,6 +336,26 @@ def _get_parser(): status.set_defaults(action=_status) status.add_argument('--pid', help='PID file to use.', default='atm.pid') + # restart + restart = subparsers.add_parser('restart', parents=[parent]) + restart.set_defaults(action=_restart) + _add_common_arguments(restart) + restart.add_argument('--cloud-mode', action='store_true', default=False, + help='Whether to run this worker in cloud mode') + restart.add_argument('--no-save', dest='save_files', default=True, + action='store_const', const=False, + help="don't save models and metrics at all") + restart.add_argument('-w', '--workers', default=1, type=int, help='Number of workers') + restart.add_argument('--no-server', action='store_true', help='Do not start the REST server') + restart.add_argument('--host', help='IP to listen at') + restart.add_argument('--port', help='Port to listen at', type=int) + restart.add_argument('--pid', help='PID file to use.', default='atm.pid') + restart.add_argument('--debug', action='store_true', help='restart the server in debug mode.') + restart.add_argument('-t', '--timeout', default=5, type=int, + help='Seconds to wait before killing the process.') + restart.add_argument('-f', '--force', action='store_true', + help='Kill the process if it does not terminate gracefully.') + # Stop stop = subparsers.add_parser('stop', parents=[parent]) stop.set_defaults(action=_stop) From 4928c4a6ad32f2e2a46487c016267576bece4743 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Tue, 30 Apr 2019 19:12:48 +0200 Subject: [PATCH 27/44] Fix lint --- atm/cli.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/atm/cli.py b/atm/cli.py index acec0b8..37689bf 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -341,10 +341,10 @@ def _get_parser(): restart.set_defaults(action=_restart) _add_common_arguments(restart) restart.add_argument('--cloud-mode', action='store_true', default=False, - help='Whether to run this worker in cloud mode') + help='Whether to run this worker in cloud mode') restart.add_argument('--no-save', dest='save_files', default=True, - action='store_const', const=False, - help="don't save models and metrics at all") + action='store_const', const=False, + help="don't save models and metrics at all") restart.add_argument('-w', '--workers', default=1, type=int, help='Number of workers') restart.add_argument('--no-server', action='store_true', help='Do not start the REST server') restart.add_argument('--host', help='IP to listen at') From ab25e27248c9bd9e7fb018da4816a810e67d5ceb Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Tue, 30 Apr 2019 19:33:31 +0200 Subject: [PATCH 28/44] Creating preprocessor for teh dataset post --- atm/api.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/atm/api.py b/atm/api.py index 9d8d8b3..14a8d81 100644 --- a/atm/api.py +++ b/atm/api.py @@ -1,9 +1,14 @@ import os +from past.utils import old_div -from flask 
import Flask, redirect +from flask import Flask, abort, redirect, request from flask_restless_swagger import SwagAPIManager as APIManager from flask_sqlalchemy import SQLAlchemy +from atm.encoder import MetaData + +DATASET_KEYS = ['name', 'description', 'train_path', 'class_column'] + def make_absolute(url): if str(url).startswith('sqlite:///'): @@ -12,6 +17,28 @@ def make_absolute(url): return url +def dataset_preprocessor(data): + """Preprocess the post data.""" + if all(key in data for key in DATASET_KEYS): + meta = MetaData( + data['class_column'], + data['train_path'], + data.get('test_path') + ) + + data['n_examples'] = meta.n_examples + data['k_classes'] = meta.k_classes + data['d_features'] = meta.d_features + data['majority'] = meta.majority + data['size_kb'] = old_div(meta.size, 1000) + + else: + abort(400) + + +DATASET_PREPROCESSOR = {'POST_RESOURCE': [dataset_preprocessor]} + + def create_app(atm): app = Flask(__name__) app.config['DEBUG'] = True @@ -24,11 +51,17 @@ def create_app(atm): # Create API endpoints, which will be available at /api/ by # default. Allowed HTTP methods can be specified as well. + @app.route('/api/search', methods=['POST']) + def create_datarun(): + if not request.json: + abort(400) + return + @app.route('/') def swagger(): return redirect('/static/swagger/swagger-ui/index.html') - manager.create_api(atm.db.Dataset, methods=['GET']) + manager.create_api(atm.db.Dataset, methods=['GET', 'POST'], preprocessors=DATASET_PREPROCESSOR) manager.create_api(atm.db.Datarun, methods=['GET']) manager.create_api(atm.db.Hyperpartition, methods=['GET']) manager.create_api(atm.db.Classifier, methods=['GET']) From 5eb0237be7d43eea88e39a745487876a28d0c399 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 1 May 2019 22:44:47 +0200 Subject: [PATCH 29/44] Refactoring configuration and adding api/run endpoint --- atm/{api.py => api/__init__.py} | 59 +-- atm/api/preprocessors.py | 29 ++ atm/classifier.py | 3 +- atm/cli.py | 329 +++++++------ atm/config.py | 794 ++++++++++++-------------------- atm/{models.py => core.py} | 123 ++--- atm/encoder.py | 6 +- atm/metrics.py | 3 +- atm/worker.py | 1 + 9 files changed, 577 insertions(+), 770 deletions(-) rename atm/{api.py => api/__init__.py} (51%) create mode 100644 atm/api/preprocessors.py rename atm/{models.py => core.py} (69%) diff --git a/atm/api.py b/atm/api/__init__.py similarity index 51% rename from atm/api.py rename to atm/api/__init__.py index 069552f..581ac64 100644 --- a/atm/api.py +++ b/atm/api/__init__.py @@ -1,13 +1,11 @@ import os -from past.utils import old_div -from flask import Flask, abort, redirect, request +from flask import Flask, abort, jsonify, redirect, request from flask_restless_swagger import SwagAPIManager as APIManager from flask_sqlalchemy import SQLAlchemy -from atm.encoder import MetaData - -DATASET_KEYS = ['name', 'description', 'train_path', 'class_column'] +from atm.api.preprocessors import DATASET_PREPROCESSORS +from atm.config import RunConfig def make_absolute(url): @@ -17,50 +15,39 @@ def make_absolute(url): return url -def dataset_preprocessor(data): - """Preprocess the post data.""" - if all(key in data for key in DATASET_KEYS): - meta = MetaData( - data['class_column'], - data['train_path'], - data.get('test_path') - ) - - data['n_examples'] = meta.n_examples - data['k_classes'] = meta.k_classes - data['d_features'] = meta.d_features - data['majority'] = meta.majority - data['size_kb'] = old_div(meta.size, 1000) - - else: - abort(400) - - -DATASET_PREPROCESSOR = {'POST_RESOURCE': 
[dataset_preprocessor]} - - -def create_app(db, debug=False): +def create_app(atm): + db = atm.db app = Flask(__name__) - app.config['DEBUG'] = debug app.config['SQLALCHEMY_DATABASE_URI'] = make_absolute(db.engine.url) # Create the Flask-Restless API manager. manager = APIManager(app, flask_sqlalchemy_db=SQLAlchemy(app)) - # Create API endpoints, which will be available at /api/ by - # default. Allowed HTTP methods can be specified as well. - - @app.route('/api/search', methods=['POST']) - def create_datarun(): + @app.route('/api/run', methods=['POST']) + def atm_run(): if not request.json: abort(400) - return + + data = request.json + run_per_partition = data.get('run_per_partition', False) + run_conf = RunConfig(data) + + dataruns = atm.create_dataruns(run_conf, run_per_partition) + + response = { + 'status': 'OK', + 'datarun_ids': [datarun.id for datarun in dataruns] + } + + return jsonify(response) @app.route('/') def swagger(): return redirect('/static/swagger/swagger-ui/index.html') - manager.create_api(db.Dataset, methods=['GET', 'POST'], preprocessors=DATASET_PREPROCESSOR) + # Create API endpoints, which will be available at /api/ by + # default. Allowed HTTP methods can be specified as well. + manager.create_api(db.Dataset, methods=['GET', 'POST'], preprocessors=DATASET_PREPROCESSORS) manager.create_api(db.Datarun, methods=['GET']) manager.create_api(db.Hyperpartition, methods=['GET']) manager.create_api(db.Classifier, methods=['GET']) diff --git a/atm/api/preprocessors.py b/atm/api/preprocessors.py new file mode 100644 index 0000000..ccde6e1 --- /dev/null +++ b/atm/api/preprocessors.py @@ -0,0 +1,29 @@ +from flask import abort + +from atm.encoder import MetaData + +DATASET_KEYS = ['name', 'description', 'train_path', 'class_column'] + + +def dataset_post(data): + """Preprocess the Dataset POST data.""" + if all(key in data for key in DATASET_KEYS): + meta = MetaData( + data['class_column'], + data['train_path'], + data.get('test_path') + ) + + data['n_examples'] = meta.n_examples + data['k_classes'] = meta.k_classes + data['d_features'] = meta.d_features + data['majority'] = meta.majority + data['size_kb'] = meta.size + + else: + abort(400) + + +DATASET_PREPROCESSORS = { + 'POST': [dataset_post] +} diff --git a/atm/classifier.py b/atm/classifier.py index cc25af3..1bee3f9 100644 --- a/atm/classifier.py +++ b/atm/classifier.py @@ -14,7 +14,6 @@ import numpy as np import pandas as pd -from past.utils import old_div from sklearn import decomposition from sklearn.gaussian_process.kernels import ( RBF, ConstantKernel, ExpSineSquared, Matern, RationalQuadratic) @@ -159,7 +158,7 @@ def test_final_model(self, X, y): # time the prediction start_time = time.time() total = time.time() - start_time - self.avg_predict_time = old_div(total, float(len(y))) + self.avg_predict_time = total / float(len(y)) # TODO: this is hacky. 
See https://github.com/HDI-Project/ATM/issues/48 binary = self.num_classes == 2 diff --git a/atm/cli.py b/atm/cli.py index 37689bf..1df9ee4 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -6,7 +6,6 @@ import multiprocessing import os import shutil -import socket import time import psutil @@ -14,107 +13,56 @@ from lockfile.pidlockfile import PIDLockFile from atm.api import create_app -from atm.config import ( - add_arguments_aws_s3, add_arguments_datarun, add_arguments_logging, add_arguments_sql, - load_config) -from atm.database import Database -from atm.models import ATM -from atm.worker import ClassifierError, Worker +# from atm.config import ( +# add_arguments_aws_s3, add_arguments_datarun, add_arguments_logging, add_arguments_sql, +# load_config) +from atm.config import AWSConfig, DatasetConfig, LogConfig, RunConfig, SQLConfig +from atm.core import ATM LOGGER = logging.getLogger(__name__) -def _get_db(args): - """Returns an instance of Database with the given args.""" - db_args = { - k[4:]: v - for k, v in vars(args).items() - if k.startswith('sql_') and v is not None - } - return Database(**db_args) +# def _get_db(args): +# """Returns an instance of Database with the given args.""" +# db_args = { +# k[4:]: v +# for k, v in vars(args).items() +# if k.startswith('sql_') and v is not None +# } +# return Database(**db_args) -def _work(args): - """Creates a single worker on the current terminal / window.""" - db = _get_db(args) - run_conf, aws_conf, log_conf = load_config(**vars(args)) +def _get_atm(args): + # db = _get_db(args) + # run_conf, aws_conf, log_conf = load_config(**vars(args)) + sql_conf = SQLConfig(args) + aws_conf = AWSConfig(args) + log_conf = LogConfig(args) + return ATM(sql_conf, aws_conf, log_conf) - atm = ATM(db, run_conf, aws_conf, log_conf) + +def _work(args, wait=False): + """Creates a single worker.""" + atm = _get_atm(args) atm.work( - datarun_ids=args.dataruns, + datarun_ids=getattr(args, 'dataruns', None), choose_randomly=False, save_files=args.save_files, cloud_mode=args.cloud_mode, - total_time=args.time, - wait=False + total_time=getattr(args, 'total_time', None), + wait=wait ) def _serve(args): """Launch the ATM API with the given host / port.""" - db = _get_db(args) - app = create_app(db, args.debug) + # db = _get_db(args) + atm = _get_atm(args) + app = create_app(atm) app.run(host=args.host, port=args.port) -def _get_next_datarun(db): - """Get the following datarun with the max priority.""" - dataruns = db.get_dataruns(ignore_complete=True) - if dataruns: - max_priority = max([datarun.priority for datarun in dataruns]) - priority_runs = [r for r in dataruns if r.priority == max_priority] - return priority_runs[0] - - -def _process_datarun(args, queue): - """Process the datarun with the worker.""" - run_conf, aws_conf, log_conf = load_config(**vars(args)) - db = _get_db(args) - - while True: - datarun_id = queue.get(True) - - dataruns = db.get_dataruns(include_ids=[datarun_id]) - if dataruns: - datarun = dataruns[0] - - worker = Worker(db, datarun, save_files=args.save_files, - cloud_mode=args.cloud_mode, aws_config=aws_conf, - log_config=log_conf, public_ip=socket.gethostname()) - - try: - worker.run_classifier() - - except ClassifierError: - # the exception has already been handled; just wait a sec so we - # don't go out of control reporting errors - LOGGER.warning('Something went wrong. 
Sleeping %d seconds.', 1) - time.sleep(1) - - -def _worker_loop(args): - """We create a multiprocessing Queue and then a pool with the number of workers specified - by the args which stay on a loop listening for new entries inside the database. - """ - db = _get_db(args) - - queue = multiprocessing.Queue(1) - LOGGER.info('Starting %s worker processes', args.workers) - with multiprocessing.Pool(args.workers, _process_datarun, (args, queue, )): - while True: - datarun = _get_next_datarun(db) - - if not datarun: - time.sleep(1) - continue - - LOGGER.warning('Processing datarun %d', datarun.id) - db.mark_datarun_running(datarun.id) - - queue.put(datarun.id) - - def _get_pid_path(pid): """Returns abspath of the pid file which is stored on the cwd.""" pid_path = pid @@ -127,7 +75,7 @@ def _get_pid_path(pid): def _get_atm_process(pid_path): """Return `psutil.Process` of the `pid` file. If the pidfile is stale it will release it.""" - pid_file = PIDLockFile(pid_path) + pid_file = PIDLockFile(pid_path, timeout=1.0) if pid_file.is_locked(): pid = pid_file.read_pid() @@ -172,8 +120,8 @@ def _status(args): def _start_background(args): - """Launches the server/worker in daemon process.""" - if not args.no_server: + """Launches the server/worker in daemon processes.""" + if args.server: LOGGER.info('Starting the REST API server') process = multiprocessing.Process(target=_serve, args=(args, )) @@ -181,8 +129,13 @@ def _start_background(args): process.start() - if args.workers: - _worker_loop(args) + pool = multiprocessing.Pool(args.workers) + for _ in range(args.workers): + LOGGER.info('Starting background worker') + pool.apply_async(_work, args=(args, True)) + + pool.close() + pool.join() def _start(args): @@ -194,25 +147,20 @@ def _start(args): print('ATM is already running!') else: - pid_file = PIDLockFile(pid_path) - - context = DaemonContext() - context.pidfile = pid_file - context.working_directory = os.getcwd() - - with context: - # Set up default logs - if not args.logfile: - _logging_setup(args.verbose, 'atm.log') + print('Starting ATM') - print('Starting ATM') + if args.foreground: _start_background(args) + else: + pidfile = PIDLockFile(pid_path, timeout=1.0) -def _restart(args): - if _stop(args): - time.sleep(1) - _start(args) + with DaemonContext(pidfile=pidfile, working_directory=os.getcwd()): + # Set up default log file if not already set + if not args.logfile: + _logging_setup(args.verbose, 'atm.log') + + _start_background(args) def _stop(args): @@ -234,25 +182,35 @@ def _stop(args): if args.force: print('Killing it.') process.kill() - return True else: print('Use --force to kill it.') else: print('ATM stopped correctly.') - return True else: print('ATM is not running.') -def _enter_data(args): - db = _get_db(args) - run_conf, aws_conf, log_conf = load_config(**vars(args)) - atm = ATM(db, run_conf, aws_conf, log_conf) +def _restart(args): + _stop(args) + time.sleep(1) + + pid_path = _get_pid_path(args.pid) + process = _get_atm_process(pid_path) + + if process: + print('ATM did not stop correctly. 
Aborting') + else: + _start(args) - atm.enter_data() + +def _enter_data(args): + atm = _get_atm(args) + run_conf = RunConfig(args) + dataset_conf = DatasetConfig(args) + atm.enter_data(dataset_conf, run_conf, args.run_per_partition) def _make_config(args): @@ -269,104 +227,125 @@ def _make_config(args): # load other functions from config.py -def _add_common_arguments(parser): - add_arguments_sql(parser) - add_arguments_aws_s3(parser) - add_arguments_logging(parser) +# def _add_common_arguments(parser): +# add_arguments_sql(parser) +# add_arguments_aws_s3(parser) +# add_arguments_logging(parser) def _get_parser(): - parent = argparse.ArgumentParser(add_help=False) - parent.add_argument('-v', '--verbose', action='count', default=0) - parent.add_argument('-l', '--logfile') + logging_args = argparse.ArgumentParser(add_help=False) + logging_args.add_argument('-v', '--verbose', action='count', default=0) + logging_args.add_argument('-l', '--logfile') parser = argparse.ArgumentParser(description='ATM Command Line Interface') subparsers = parser.add_subparsers(title='action', help='Action to perform') parser.set_defaults(action=None) + # Common Arguments + sql_args = SQLConfig.get_parser() + aws_args = AWSConfig.get_parser() + log_args = LogConfig.get_parser() + run_args = RunConfig.get_parser() + dataset_args = DatasetConfig.get_parser() + # Enter Data Parser - enter_data = subparsers.add_parser('enter_data', parents=[parent]) + enter_data_parents = [ + logging_args, + sql_args, + aws_args, + dataset_args, + log_args, + run_args + ] + enter_data = subparsers.add_parser('enter_data', parents=enter_data_parents) enter_data.set_defaults(action=_enter_data) - _add_common_arguments(enter_data) - add_arguments_datarun(enter_data) + # _add_common_arguments(enter_data) + # add_arguments_datarun(enter_data) enter_data.add_argument('--run-per-partition', default=False, action='store_true', help='if set, generate a new datarun for each hyperpartition') + # Wroker Args + worker_args = argparse.ArgumentParser(add_help=False) + worker_args.add_argument('--cloud-mode', action='store_true', default=False, + help='Whether to run this worker in cloud mode') + worker_args.add_argument('--no-save', dest='save_files', default=True, + action='store_const', const=False, + help="don't save models and metrics at all") + # Worker - worker = subparsers.add_parser('worker', parents=[parent]) + worker_parents = [ + logging_args, + worker_args, + sql_args, + aws_args, + log_args + ] + worker = subparsers.add_parser('worker', parents=worker_parents) worker.set_defaults(action=_work) - _add_common_arguments(worker) - worker.add_argument('--cloud-mode', action='store_true', default=False, - help='Whether to run this worker in cloud mode') - + # _add_common_arguments(worker) worker.add_argument('--dataruns', help='Only train on dataruns with these ids', nargs='+') - worker.add_argument('--time', help='Number of seconds to run worker', type=int) + worker.add_argument('--total-time', help='Number of seconds to run worker', type=int) - worker.add_argument('--no-save', dest='save_files', action='store_false', - help="don't save models and metrics at all") + # Server Args + server_args = argparse.ArgumentParser(add_help=False) + server_args.add_argument('--host', help='IP to listen at') + server_args.add_argument('--port', help='Port to listen at', type=int) # Server - server = subparsers.add_parser('server', parents=[parent]) + server = subparsers.add_parser('server', parents=[logging_args, server_args, sql_args]) 
server.set_defaults(action=_serve) - _add_common_arguments(server) - server.add_argument('--host', help='IP to listen at') - server.add_argument('--port', help='Port to listen at', type=int) - server.add_argument('--debug', action='store_true', help='Start the server in debug mode.') + # add_arguments_sql(server) + + # Background Args + background_args = argparse.ArgumentParser(add_help=False) + background_args.add_argument('--pid', help='PID file to use.', default='atm.pid') + + # Start Args + start_args = argparse.ArgumentParser(add_help=False) + start_args.add_argument('--foreground', action='store_true', help='Run on foreground') + start_args.add_argument('-w', '--workers', default=1, type=int, help='Number of workers') + start_args.add_argument('--no-server', dest='server', action='store_false', + help='Do not start the REST server') # Start - start = subparsers.add_parser('start', parents=[parent]) + start_parents = [ + logging_args, + worker_args, + server_args, + background_args, + start_args, + sql_args, + aws_args, + log_args + ] + start = subparsers.add_parser('start', parents=start_parents) start.set_defaults(action=_start) - _add_common_arguments(start) - start.add_argument('--cloud-mode', action='store_true', default=False, - help='Whether to run this worker in cloud mode') - start.add_argument('--no-save', dest='save_files', default=True, - action='store_const', const=False, - help="don't save models and metrics at all") - start.add_argument('-w', '--workers', default=1, type=int, help='Number of workers') - - start.add_argument('--no-server', action='store_true', help='Do not start the REST server') - start.add_argument('--host', help='IP to listen at') - start.add_argument('--port', help='Port to listen at', type=int) - start.add_argument('--pid', help='PID file to use.', default='atm.pid') - start.add_argument('--debug', action='store_true', help='Start the server in debug mode.') + # _add_common_arguments(start) # Status - status = subparsers.add_parser('status', parents=[parent]) + status = subparsers.add_parser('status', parents=[logging_args, background_args]) status.set_defaults(action=_status) - status.add_argument('--pid', help='PID file to use.', default='atm.pid') - # restart - restart = subparsers.add_parser('restart', parents=[parent]) - restart.set_defaults(action=_restart) - _add_common_arguments(restart) - restart.add_argument('--cloud-mode', action='store_true', default=False, - help='Whether to run this worker in cloud mode') - restart.add_argument('--no-save', dest='save_files', default=True, - action='store_const', const=False, - help="don't save models and metrics at all") - restart.add_argument('-w', '--workers', default=1, type=int, help='Number of workers') - restart.add_argument('--no-server', action='store_true', help='Do not start the REST server') - restart.add_argument('--host', help='IP to listen at') - restart.add_argument('--port', help='Port to listen at', type=int) - restart.add_argument('--pid', help='PID file to use.', default='atm.pid') - restart.add_argument('--debug', action='store_true', help='restart the server in debug mode.') - restart.add_argument('-t', '--timeout', default=5, type=int, - help='Seconds to wait before killing the process.') - restart.add_argument('-f', '--force', action='store_true', - help='Kill the process if it does not terminate gracefully.') + # Stop Args + stop_args = argparse.ArgumentParser(add_help=False) + stop_args.add_argument('-t', '--timeout', default=5, type=int, + help='Seconds to wait before killing 
the process.') + stop_args.add_argument('-f', '--force', action='store_true', + help='Kill the process if it does not terminate gracefully.') # Stop - stop = subparsers.add_parser('stop', parents=[parent]) + stop = subparsers.add_parser('stop', parents=[logging_args, stop_args, background_args]) stop.set_defaults(action=_stop) - stop.add_argument('--pid', help='PID file to use.', default='atm.pid') - stop.add_argument('-t', '--timeout', default=5, type=int, - help='Seconds to wait before killing the process.') - stop.add_argument('-f', '--force', action='store_true', - help='Kill the process if it does not terminate gracefully.') + + # restart + restart = subparsers.add_parser('restart', parents=start_parents + [stop_args]) + restart.set_defaults(action=_restart) + # _add_common_arguments(restart) # Make Config - make_config = subparsers.add_parser('make_config', parents=[parent]) + make_config = subparsers.add_parser('make_config', parents=[logging_args]) make_config.set_defaults(action=_make_config) return parser @@ -374,7 +353,7 @@ def _get_parser(): def _logging_setup(verbosity=1, logfile=None): logger = logging.getLogger() - log_level = (3 - verbosity) * 10 + log_level = (2 - verbosity) * 10 fmt = '%(asctime)s - %(process)d - %(levelname)s - %(module)s - %(message)s' formatter = logging.Formatter(fmt) logger.setLevel(log_level) diff --git a/atm/config.py b/atm/config.py index dc570d4..b36380c 100644 --- a/atm/config.py +++ b/atm/config.py @@ -1,19 +1,15 @@ from __future__ import absolute_import, unicode_literals -import logging +import argparse import os import re -import socket -import sys -from argparse import ArgumentError, ArgumentTypeError, RawTextHelpFormatter -from builtins import map, object, str +from builtins import object, str import yaml from atm.constants import ( - BUDGET_TYPES, CUSTOM_CLASS_REGEX, DATA_TEST_PATH, JSON_REGEX, LOG_LEVELS, METHODS, METRICS, - SCORE_TARGETS, SELECTORS, SQL_DIALECTS, TIME_FMT, TUNERS) -from atm.utilities import ensure_directory + BUDGET_TYPES, CUSTOM_CLASS_REGEX, DATA_TEST_PATH, JSON_REGEX, METHODS, METRICS, SCORE_TARGETS, + SELECTORS, SQL_DIALECTS, TIME_FMT, TUNERS) class Config(object): @@ -30,170 +26,127 @@ class Config(object): Subclasses do not need to define __init__ or any other methods. 
""" - # list of all parameters which may be set on this config object - PARAMETERS = [] - # default values for all required parameters - DEFAULTS = {} + _PREFIX = None + _CONFIG = None + + @classmethod + def _add_prefix(cls, name): + if cls._PREFIX: + return '{}_{}'.format(cls._PREFIX, name) + else: + return name + + @classmethod + def _get_arg(cls, args, name): + arg_name = cls._add_prefix(name) + class_value = getattr(cls, name) + if isinstance(class_value, tuple): + default = class_value[1] + else: + default = None + + return args.get(arg_name, default) + + def __init__(self, args, path=None): + if isinstance(args, argparse.Namespace): + args = vars(args) + + config_arg = self._CONFIG or self._PREFIX + if not path and config_arg: + path = args.get(config_arg + '_config') + + if path: + with open(path, 'r') as f: + args = yaml.load(f) + + for name, value in vars(self.__class__).items(): + if not name.startswith('_') and not callable(value): + setattr(self, name, self._get_arg(args, name)) + + @classmethod + def get_parser(cls): + parser = argparse.ArgumentParser(add_help=False) + + if cls._PREFIX: + parser.add_argument('--{}-config'.format(cls._PREFIX), + help='path to yaml {} config file'.format(cls._PREFIX)) + + for name, description in vars(cls).items(): + if not name.startswith('_') and not callable(description): + arg_name = '--' + cls._add_prefix(name).replace('_', '-') + + if isinstance(description, tuple): + if len(description) == 3: + description, default, choices = description + parser.add_argument(arg_name, help=description, + default=default, choices=choices) + else: + description, default = description + if default is False: + parser.add_argument(arg_name, help=description, + action='store_true') + + else: + parser.add_argument(arg_name, help=description, + default=default) + + return parser + + def to_dict(self): + return { + name: value + for name, value in vars(self).items() + if not name.startswith('_') and not callable(value) + } + + def __repr__(self): + return '{}({})'.format(self.__class__.__name__, self.to_dict()) - def __init__(self, **kwargs): - for key in self.PARAMETERS: - value = kwargs.get(key) - # Here, if a keyword argument is set to None, it will be overridden - # by the default value. AFAIK, this is the only way to deal with - # keyword args passed in from argparse that weren't set on the - # command line. That means you shouldn't define any PARAMETERS for - # which None is a meaningful value; if you do, make sure None is - # also the default. 
- if key in self.DEFAULTS and value is None: - value = self.DEFAULTS[key] +class AWSConfig(Config): + """ Stores configuration for AWS S3 connections """ + _PREFIX = 'aws' - setattr(self, key, value) + access_key = 'AWS access key' + secret_key = 'AWS secret key' + s3_bucket = 'AWS S3 bucket to store data' + s3_folder = 'Folder in AWS S3 bucket in which to store data' -class AWSConfig(Config): - """ Stores configuration for AWS S3 and EC2 connections """ - PARAMETERS = [ - # universal config - 'access_key', - 'secret_key', - - # s3 config - 's3_bucket', - 's3_folder', - - # ec2 config - 'ec2_region', - 'ec2_amis', - 'ec2_key_pair', - 'ec2_keyfile', - 'ec2_instance_type', - 'ec2_username', - 'num_instances', - 'num_workers_per_instance' - ] - - DEFAULTS = {} +class DatasetConfig(Config): + """ Stores configuration of a Dataset """ + _CONFIG = 'run' + + train_path = ('Path to raw training data', os.path.join(DATA_TEST_PATH, 'pollution_1.csv')) + test_path = 'Path to raw test data (if applicable)' + data_description = 'Description of dataset' + class_column = ('Name of the class column in the input data', 'class') class SQLConfig(Config): """ Stores configuration for SQL database setup & connection """ - PARAMETERS = [ - 'dialect', - 'database', - 'username', - 'password', - 'host', - 'port', - 'query' - ] - - DEFAULTS = { - 'dialect': 'sqlite', - 'database': 'atm.db', - } + _PREFIX = 'sql' - -class RunConfig(Config): - """ Stores configuration for Dataset and Datarun setup """ - PARAMETERS = [ - # dataset config - 'train_path', - 'test_path', - 'data_description', - 'class_column', - - # datarun config - 'dataset_id', - 'methods', - 'priority', - 'budget_type', - 'budget', - 'deadline', - 'tuner', - 'r_minimum', - 'gridding', - 'selector', - 'k_window', - 'metric', - 'score_target' - ] - - DEFAULTS = { - 'train_path': os.path.join(DATA_TEST_PATH, 'pollution_1.csv'), - 'class_column': 'class', - 'methods': ['logreg', 'dt', 'knn'], - 'priority': 1, - 'budget_type': 'classifier', - 'budget': 100, - 'tuner': 'uniform', - 'selector': 'uniform', - 'r_minimum': 2, - 'k_window': 3, - 'gridding': 0, - 'metric': 'f1', - 'score_target': 'cv', - } + dialect = ('Dialect of SQL to use', 'sqlite', SQL_DIALECTS) + database = ('Name of, or path to, SQL database', 'atm.db') + username = 'Username for SQL database' + password = 'Password for SQL database' + host = 'Hostname for database machine' + port = 'Port used to connect to database' + query = 'Specify extra login details' class LogConfig(Config): - PARAMETERS = [ - 'log_level_stdout', - 'log_level_file', - 'log_dir', - 'model_dir', - 'metric_dir', - 'verbose_metrics', - ] - - DEFAULTS = { - 'log_level_stdout': 'ERROR', - 'log_level_file': 'INFO', - 'log_dir': 'logs', - 'model_dir': 'models', - 'metric_dir': 'metrics', - 'verbose_metrics': False, - } - - -def initialize_logging(config): - file_level = LOG_LEVELS.get(config.log_level_file.upper(), - logging.CRITICAL) - stdout_level = LOG_LEVELS.get(config.log_level_stdout.upper(), - logging.CRITICAL) - - handlers = [] - if file_level > logging.NOTSET: - fmt = '%(asctime)-15s %(name)s - %(levelname)s %(message)s' - ensure_directory(config.log_dir) - path = os.path.join(config.log_dir, socket.gethostname() + '.txt') - handler = logging.FileHandler(path) - handler.setFormatter(logging.Formatter(fmt)) - handler.setLevel(file_level) - handlers.append(handler) - - if stdout_level > logging.NOTSET: - fmt = '%(message)s' - handler = logging.StreamHandler(sys.stdout) - handler.setFormatter(logging.Formatter(fmt)) - 
handler.setLevel(stdout_level) - handlers.append(handler) - - if not len(handlers): - handlers.append(logging.NullHandler()) - - for lib in ['atm', 'btb']: - logger = logging.getLogger(lib) - logger.setLevel(min(file_level, stdout_level)) - - for h in logger.handlers: - logger.removeHandler(h) - - for h in handlers: - logger.addHandler(h) - - logger.propagate = False - logger.debug('Logging is active for module %s.' % lib) + # log_level_stdout = ('minimum log level to write to stdout', 'ERROR') + # log_level_file =('minimum log level to write to the log file', 'INFO') + # log_dir = ('Directory where logs will be saved', 'logs') + model_dir = ('Directory where computed models will be saved', 'models') + metric_dir = ('Directory where model metrics will be saved', 'metrics') + verbose_metrics = ( + 'If set, compute full ROC and PR curves and ' + 'per-label metrics for each classifier', + False + ) def option_or_path(options, regex=CUSTOM_CLASS_REGEX): @@ -209,344 +162,193 @@ def type_check(s): return s # if both of those fail, there's something wrong - raise ArgumentTypeError('%s is not a valid option or path!' % s) + raise argparse.ArgumentTypeError('%s is not a valid option or path!' % s) return type_check -def add_arguments_logging(parser): - """ - Add all argparse arguments needed to parse logging configuration from the - command line. - parser: an argparse.ArgumentParser object - """ - # Config file path - parser.add_argument('--log-config', help='path to yaml logging config file') - - # paths to saved files - parser.add_argument('--model-dir', - help='Directory where computed models will be saved') - parser.add_argument('--metric-dir', - help='Directory where model metrics will be saved') - parser.add_argument('--log-dir', - help='Directory where logs will be saved') - - # hoe much information to log or save - parser.add_argument('--verbose-metrics', action='store_true', - help='If set, compute full ROC and PR curves and ' - 'per-label metrics for each classifier') - - log_levels = list(map(str.lower, list(LOG_LEVELS.keys()))) - parser.add_argument('--log-level-file', choices=log_levels, - help='minimum log level to write to the log file') - # if this is being called from the command line, print more information to - # stdout by default - parser.add_argument('--log-level-stdout', choices=log_levels, - help='minimum log level to write to stdout') - - return parser - - -def add_arguments_aws_s3(parser): - """ - Add all argparse arguments needed to parse AWS S3 configuration from the - command line. This is separate from aws_ec2 because usually only one set of - arguments or the other is needed. - parser: an argparse.ArgumentParser object - """ - # Config file - parser.add_argument('--aws-config', help='path to yaml AWS config file') - - # All of these arguments must start with --aws-, and must correspond to - # keys present in the AWS config example file. - # AWS API access key pair - # try... catch because this might be called after aws_s3 - try: - parser.add_argument('--aws-access-key', help='AWS access key') - parser.add_argument('--aws-secret-key', help='AWS secret key') - except ArgumentError: - pass - - # S3-specific arguments - parser.add_argument('--aws-s3-bucket', help='AWS S3 bucket to store data') - parser.add_argument('--aws-s3-folder', help='Folder in AWS S3 bucket in which to store data') - - return parser - - -def add_arguments_aws_ec2(parser): - """ - Add all argparse arguments needed to parse AWS EC2 configuration from the - command line. 
This is separate from aws_s3 because usually only one set of - arguments or the other is needed. - parser: an argparse.ArgumentParser object - """ - # Config file - parser.add_argument('--aws-config', help='path to yaml AWS config file') - - # All of these arguments must start with --aws-, and must correspond to - # keys present in the AWS config example file. - # AWS API access key pair - # try... catch because this might be called after aws_s3 - try: - parser.add_argument('--aws-access-key', help='AWS access key') - parser.add_argument('--aws-secret-key', help='AWS secret key') - except ArgumentError: - pass - - # AWS EC2 configurations - parser.add_argument('--num-instances', help='Number of EC2 instances to start') - parser.add_argument('--num-workers-per-instance', help='Number of ATM workers per instances') - parser.add_argument('--ec2-region', help='Region to start instances in') - parser.add_argument('--ec2-ami', help='Name of ATM AMI') - parser.add_argument('--ec2-key-pair', help='AWS key pair to use for EC2 instances') - parser.add_argument('--ec2-keyfile', help='Local path to key file (must match ec2-key-pair)') - parser.add_argument('--ec2-instance-type', help='Type of EC2 instance to start') - parser.add_argument('--ec2-username', help='Username to log into EC2 instance') - - return parser - - -def add_arguments_sql(parser): - """ - Add all argparse arguments needed to parse configuration for the ModelHub - SQL database from the command line. - - parser: an argparse.ArgumentParser object - """ - # Config file - parser.add_argument('--sql-config', help='path to yaml SQL config file') - - # All of these arguments must start with --sql-, and must correspond to - # keys present in the SQL config example file. - parser.add_argument('--sql-dialect', choices=SQL_DIALECTS, default='sqlite', - help='Dialect of SQL to use') - parser.add_argument('--sql-database', default='atm.db', - help='Name of, or path to, SQL database') - parser.add_argument('--sql-username', help='Username for SQL database') - parser.add_argument('--sql-password', help='Password for SQL database') - parser.add_argument('--sql-host', help='Hostname for database machine') - parser.add_argument('--sql-port', help='Port used to connect to database') - parser.add_argument('--sql-query', help='Specify extra login details') - - return parser - - -def add_arguments_datarun(parser): - """ - Add all argparse arguments needed to parse dataset and datarun configuration - from the command line. 
- - parser: an argparse.ArgumentParser object - """ - # make sure the text for these arguments is formatted correctly - # this allows newlines in the help strings - parser.formatter_class = RawTextHelpFormatter - - # Config file - parser.add_argument('--run-config', help='path to yaml datarun config file') - - # Dataset Arguments ##################################################### - # ########################################################################## - parser.add_argument('--dataset-id', type=int, - help="ID of dataset, if it's already in the database") - - # These are only relevant if dataset_id is not provided - parser.add_argument('--train-path', help='Path to raw training data') - parser.add_argument('--test-path', help='Path to raw test data (if applicable)') - parser.add_argument('--data-description', help='Description of dataset') - parser.add_argument('--class-column', help='Name of the class column in the input data') - - # Datarun Arguments ##################################################### - # ########################################################################## - # Notes: - # - Support vector machines (svm) can take a long time to train. It's not an - # error, it's just part of what happens when the method happens to explore - # a crappy set of parameters on a powerful algo like this. - # - Stochastic gradient descent (sgd) can sometimes fail on certain - # parameter settings as well. Don't worry, they train SUPER fast, and the - # worker.py will simply log the error and continue. - # - # Method options: - # logreg - logistic regression - # svm - support vector machine - # sgd - linear classifier with stochastic gradient descent - # dt - decision tree - # et - extra trees - # rf - random forest - # gnb - gaussian naive bayes - # mnb - multinomial naive bayes - # bnb - bernoulli naive bayes - # gp - gaussian process - # pa - passive aggressive - # knn - K nearest neighbors - # mlp - multi-layer perceptron - parser.add_argument('--methods', nargs='+', - type=option_or_path(METHODS, JSON_REGEX), - help='Method or list of methods to use for ' - 'classification. Each method can either be one of the ' - 'pre-defined method codes listed below or a path to a ' - 'JSON file defining a custom method.' - '\n\nOptions: [%s]' % ', '.join(str(s) for s in METHODS)) - parser.add_argument('--priority', type=int, - help='Priority of the datarun (higher = more important') - parser.add_argument('--budget-type', choices=BUDGET_TYPES, - help='Type of budget to use') - parser.add_argument('--budget', type=int, - help='Value of the budget, either in classifiers or minutes') - parser.add_argument('--deadline', - help='Deadline for datarun completion. If provided, this ' - 'overrides the configured walltime budget.\nFormat: {}'.format( - TIME_FMT.replace('%', '%%'))) - - # Which field to use to judge performance, for the sake of AutoML - # options: - # f1 - F1 score (harmonic mean of precision and recall) - # roc_auc - area under the Receiver Operating Characteristic curve - # accuracy - percent correct - # cohen_kappa - measures accuracy, but controls for chance of guessing - # correctly - # rank_accuracy - multiclass only: percent of examples for which the true - # label is in the top 1/3 most likely predicted labels - # ap - average precision: nearly identical to area under - # precision/recall curve. - # mcc - matthews correlation coefficient: good for unbalanced classes - # - # f1 and roc_auc may be appended with _micro or _macro to use with - # multiclass problems. 
- parser.add_argument('--metric', choices=METRICS, - help='Metric by which ATM should evaluate classifiers. ' - 'The metric function specified here will be used to ' - 'compute the "judgment metric" for each classifier.') - - # Which data to use for computing judgment score - # cv - cross-validated performance on training data - # test - performance on test data - # mu_sigma - lower confidence bound on cv score - parser.add_argument('--score-target', choices=SCORE_TARGETS, - help='Determines which judgment metric will be used to ' - 'search the hyperparameter space. "cv" will use the mean ' - 'cross-validated performance, "test" will use the ' - 'performance on a test dataset, and "mu_sigma" will use ' - 'the lower confidence bound on the CV performance.') - - # AutoML Arguments ###################################################### - # ########################################################################## - # hyperparameter selection strategy - # How should ATM sample hyperparameters from a given hyperpartition? - # uniform - pick randomly! (baseline) - # gp - vanilla Gaussian Process - # gp_ei - Gaussian Process expected improvement criterion - # gp_eivel - Gaussian Process expected improvement, with randomness added - # in based on velocity of improvement - # path to custom tuner, defined in python - parser.add_argument('--tuner', type=option_or_path(TUNERS), - help='Type of BTB tuner to use. Can either be one of ' - 'the pre-configured tuners listed below or a path to a ' - 'custom tuner in the form "/path/to/tuner.py:ClassName".' - '\n\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS)) - - # How should ATM select a particular hyperpartition from the set of all - # possible hyperpartitions? - # Options: - # uniform - pick randomly - # ucb1 - UCB1 multi-armed bandit - # bestk - MAB using only the best K runs in each hyperpartition - # bestkvel - MAB with velocity of best K runs - # purebestkvel - always return hyperpartition with highest velocity - # recentk - MAB with most recent K runs - # recentkvel - MAB with velocity of most recent K runs - # hieralg - hierarchical MAB: choose a classifier first, then choose - # a partition - # path to custom selector, defined in python - parser.add_argument('--selector', type=option_or_path(SELECTORS), - help='Type of BTB selector to use. Can either be one of ' - 'the pre-configured selectors listed below or a path to a ' - 'custom tuner in the form "/path/to/selector.py:ClassName".' - '\n\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS)) - - # r_minimum is the number of random runs performed in each hyperpartition before - # allowing bayesian opt to select parameters. Consult the thesis to - # understand what those mean, but essentially: - # - # if (num_classifiers_trained_in_hyperpartition >= r_minimum) - # # train using sample criteria - # else - # # train using uniform (baseline) - parser.add_argument('--r-minimum', type=int, - help='number of random runs to perform before tuning can occur') - - # k is number that xxx-k methods use. It is similar to r_minimum, except it is - # called k_window and determines how much "history" ATM considers for certain - # partition selection logics. - parser.add_argument('--k-window', type=int, - help='number of previous scores considered by -k selector methods') - - # gridding determines whether or not sample selection will happen on a grid. - # If any positive integer, a grid with `gridding` points on each axis is - # established, and hyperparameter vectors are sampled from this finite - # space. 
If 0 (or blank), hyperparameters are sampled from continuous - # space, and there is no limit to the number of hyperparameter vectors that - # may be tried. - parser.add_argument('--gridding', type=int, - help='gridding factor (0: no gridding)') - - return parser - - -def load_config(**kwargs): - """ - Load config objects from yaml files and command line arguments. Command line - args override yaml files where applicable. - - Args: - **kwargs: miscellaneous arguments specifying individual configuration - parameters. Any kwargs beginning with sql_ are SQL config - arguments, any beginning with aws_ are AWS config. - - Returns: run_conf, aws_conf, log_conf - """ - run_args = {} - aws_args = {} - log_args = {} - - # kwargs are most likely generated by argparse. - # Any unspecified argparse arguments will be None, so ignore those. We only - # care about arguments explicitly specified by the user. - kwargs = {k: v for k, v in list(kwargs.items()) if v is not None} - - # check the keyword args for config paths - run_path = kwargs.get('run_config') - aws_path = kwargs.get('aws_config') - log_path = kwargs.get('log_config') - - # load any yaml config files for which paths were provided - - if run_path: - with open(run_path) as f: - run_args = yaml.load(f) - - if aws_path: - with open(aws_path) as f: - aws_args = yaml.load(f) - - if log_path: - with open(log_path) as f: - log_args = yaml.load(f) - - # Use keyword args to override yaml config values - aws_args.update({k.replace('aws_', ''): v for k, v in list(kwargs.items()) - if 'aws_' in k}) - run_args.update({k: v for k, v in list(kwargs.items()) if k in - RunConfig.PARAMETERS}) - log_args.update({k: v for k, v in list(kwargs.items()) if k in - LogConfig.PARAMETERS}) - - # It's ok if there are some extra arguments that get passed in here; only - # kwargs that correspond to real config values will be stored on the config - # objects. 
- aws_conf = AWSConfig(**aws_args) - run_conf = RunConfig(**run_args) - log_conf = LogConfig(**log_args) - - return run_conf, aws_conf, log_conf +class RunConfig(Config): + """ Stores configuration for Dataset and Datarun setup """ + _CONFIG = 'run' + + # dataset config + # train_path = None + # test_path = None + # data_description = None + # class_column = None + + # datarun config + dataset_id = None + methods = None + priority = None + budget_type = None + budget = None + deadline = None + tuner = None + r_minimum = None + gridding = None + selector = None + k_window = None + metric = None + score_target = None + + @classmethod + def get_parser(cls): + parser = argparse.ArgumentParser(add_help=False) + + # make sure the text for these arguments is formatted correctly + # this allows newlines in the help strings + parser.formatter_class = argparse.RawTextHelpFormatter + + # Config file + parser.add_argument('--run-config', help='path to yaml datarun config file') + + # Dataset Arguments ##################################################### + # ########################################################################## + parser.add_argument('--dataset-id', type=int, + help="ID of dataset, if it's already in the database") + + # These are only relevant if dataset_id is not provided + # parser.add_argument('--train-path', help='Path to raw training data', + # default=os.path.join(DATA_TEST_PATH, 'pollution_1.csv')) + # parser.add_argument('--test-path', help='Path to raw test data (if applicable)') + # parser.add_argument('--data-description', help='Description of dataset') + # parser.add_argument('--class-column', default='class', + # help='Name of the class column in the input data') + + # Datarun Arguments ##################################################### + # ########################################################################## + # Notes: + # - Support vector machines (svm) can take a long time to train. It's not an + # error, it's just part of what happens when the method happens to explore + # a crappy set of parameters on a powerful algo like this. + # - Stochastic gradient descent (sgd) can sometimes fail on certain + # parameter settings as well. Don't worry, they train SUPER fast, and the + # worker.py will simply log the error and continue. + # + # Method options: + # logreg - logistic regression + # svm - support vector machine + # sgd - linear classifier with stochastic gradient descent + # dt - decision tree + # et - extra trees + # rf - random forest + # gnb - gaussian naive bayes + # mnb - multinomial naive bayes + # bnb - bernoulli naive bayes + # gp - gaussian process + # pa - passive aggressive + # knn - K nearest neighbors + # mlp - multi-layer perceptron + parser.add_argument('--methods', nargs='+', + type=option_or_path(METHODS, JSON_REGEX), + default=['logreg', 'dt', 'knn'], + help='Method or list of methods to use for ' + 'classification. Each method can either be one of the ' + 'pre-defined method codes listed below or a path to a ' + 'JSON file defining a custom method.' + '\n\nOptions: [%s]' % ', '.join(str(s) for s in METHODS)) + parser.add_argument('--priority', type=int, default=1, + help='Priority of the datarun (higher = more important') + parser.add_argument('--budget-type', choices=BUDGET_TYPES, default='classifier', + help='Type of budget to use') + parser.add_argument('--budget', type=int, default=100, + help='Value of the budget, either in classifiers or minutes') + parser.add_argument('--deadline', + help='Deadline for datarun completion. 
If provided, this ' + 'overrides the configured walltime budget.\nFormat: {}'.format( + TIME_FMT.replace('%', '%%'))) + + # Which field to use to judge performance, for the sake of AutoML + # options: + # f1 - F1 score (harmonic mean of precision and recall) + # roc_auc - area under the Receiver Operating Characteristic curve + # accuracy - percent correct + # cohen_kappa - measures accuracy, but controls for chance of guessing + # correctly + # rank_accuracy - multiclass only: percent of examples for which the true + # label is in the top 1/3 most likely predicted labels + # ap - average precision: nearly identical to area under + # precision/recall curve. + # mcc - matthews correlation coefficient: good for unbalanced classes + # + # f1 and roc_auc may be appended with _micro or _macro to use with + # multiclass problems. + parser.add_argument('--metric', choices=METRICS, default='f1', + help='Metric by which ATM should evaluate classifiers. ' + 'The metric function specified here will be used to ' + 'compute the "judgment metric" for each classifier.') + + # Which data to use for computing judgment score + # cv - cross-validated performance on training data + # test - performance on test data + # mu_sigma - lower confidence bound on cv score + parser.add_argument('--score-target', choices=SCORE_TARGETS, default='cv', + help='Determines which judgment metric will be used to ' + 'search the hyperparameter space. "cv" will use the mean ' + 'cross-validated performance, "test" will use the ' + 'performance on a test dataset, and "mu_sigma" will use ' + 'the lower confidence bound on the CV performance.') + + # AutoML Arguments ###################################################### + # ########################################################################## + # hyperparameter selection strategy + # How should ATM sample hyperparameters from a given hyperpartition? + # uniform - pick randomly! (baseline) + # gp - vanilla Gaussian Process + # gp_ei - Gaussian Process expected improvement criterion + # gp_eivel - Gaussian Process expected improvement, with randomness added + # in based on velocity of improvement + # path to custom tuner, defined in python + parser.add_argument('--tuner', type=option_or_path(TUNERS), default='uniform', + help='Type of BTB tuner to use. Can either be one of ' + 'the pre-configured tuners listed below or a path to a ' + 'custom tuner in the form "/path/to/tuner.py:ClassName".' + '\n\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS)) + + # How should ATM select a particular hyperpartition from the set of all + # possible hyperpartitions? + # Options: + # uniform - pick randomly + # ucb1 - UCB1 multi-armed bandit + # bestk - MAB using only the best K runs in each hyperpartition + # bestkvel - MAB with velocity of best K runs + # purebestkvel - always return hyperpartition with highest velocity + # recentk - MAB with most recent K runs + # recentkvel - MAB with velocity of most recent K runs + # hieralg - hierarchical MAB: choose a classifier first, then choose + # a partition + # path to custom selector, defined in python + parser.add_argument('--selector', type=option_or_path(SELECTORS), default='uniform', + help='Type of BTB selector to use. Can either be one of ' + 'the pre-configured selectors listed below or a path to a ' + 'custom tuner in the form "/path/to/selector.py:ClassName".' 
+ '\n\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS)) + + # r_minimum is the number of random runs performed in each hyperpartition before + # allowing bayesian opt to select parameters. Consult the thesis to + # understand what those mean, but essentially: + # + # if (num_classifiers_trained_in_hyperpartition >= r_minimum) + # # train using sample criteria + # else + # # train using uniform (baseline) + parser.add_argument('--r-minimum', type=int, default=2, + help='number of random runs to perform before tuning can occur') + + # k is number that xxx-k methods use. It is similar to r_minimum, except it is + # called k_window and determines how much "history" ATM considers for certain + # partition selection logics. + parser.add_argument('--k-window', type=int, default=3, + help='number of previous scores considered by -k selector methods') + + # gridding determines whether or not sample selection will happen on a grid. + # If any positive integer, a grid with `gridding` points on each axis is + # established, and hyperparameter vectors are sampled from this finite + # space. If 0 (or blank), hyperparameters are sampled from continuous + # space, and there is no limit to the number of hyperparameter vectors that + # may be tried. + parser.add_argument('--gridding', type=int, default=0, + help='gridding factor (0: no gridding)') + + return parser diff --git a/atm/models.py b/atm/core.py similarity index 69% rename from atm/models.py rename to atm/core.py index 69f792d..97059c9 100644 --- a/atm/models.py +++ b/atm/core.py @@ -1,3 +1,11 @@ +# -*- coding: utf-8 -*- + +"""Core ATM module. + +This module contains the ATM class, which is the one responsible for +executing and orchestrating the main ATM functionalities. +""" + from __future__ import absolute_import, division, unicode_literals import logging @@ -8,9 +16,8 @@ from datetime import datetime, timedelta from operator import attrgetter -from past.utils import old_div - from atm.constants import TIME_FMT, PartitionStatus +from atm.database import Database from atm.encoder import MetaData from atm.method import Method from atm.utilities import download_data, get_public_ip @@ -20,15 +27,11 @@ class ATM(object): - """ - Thiss class is code API instance that allows you to use ATM in your python code. - """ LOOP_WAIT = 1 - def __init__(self, db, run_conf, aws_conf, log_conf): - self.db = db - self.run_conf = run_conf + def __init__(self, sql_conf, aws_conf, log_conf): + self.db = Database(**sql_conf.to_dict()) self.aws_conf = aws_conf self.log_conf = log_conf @@ -62,13 +65,13 @@ def work(self, datarun_ids=None, save_files=False, choose_randomly=True, dataruns = self.db.get_dataruns(include_ids=datarun_ids, ignore_complete=True) if not dataruns: if wait: - LOGGER.warning('No dataruns found. Sleeping %d seconds and trying again.', - ATM.LOOP_WAIT) + LOGGER.debug('No dataruns found. Sleeping %d seconds and trying again.', + ATM.LOOP_WAIT) time.sleep(ATM.LOOP_WAIT) continue else: - LOGGER.warning('No dataruns found. Exiting.') + LOGGER.info('No dataruns found. Exiting.') break max_priority = max([datarun.priority for datarun in dataruns]) @@ -94,21 +97,21 @@ def work(self, datarun_ids=None, save_files=False, choose_randomly=True, except ClassifierError: # the exception has already been handled; just wait a sec so we # don't go out of control reporting errors - LOGGER.warning('Something went wrong. Sleeping %d seconds.', ATM.LOOP_WAIT) + LOGGER.error('Something went wrong. 
Sleeping %d seconds.', ATM.LOOP_WAIT) time.sleep(ATM.LOOP_WAIT) elapsed_time = (datetime.now() - start_time).total_seconds() if total_time is not None and elapsed_time >= total_time: - LOGGER.warning('Total run time for worker exceeded; exiting.') + LOGGER.info('Total run time for worker exceeded; exiting.') break - def create_dataset(self): + def create_dataset(self, dataset_conf): """ Create a dataset and add it to the ModelHub database. """ # download data to the local filesystem to extract metadata - train_local, test_local = download_data(self.run_conf.train_path, - self.run_conf.test_path, + train_local, test_local = download_data(dataset_conf.train_path, + dataset_conf.test_path, self.aws_conf) # create the name of the dataset from the path to the data @@ -116,22 +119,22 @@ def create_dataset(self): name = name.replace("_train.csv", "").replace(".csv", "") # process the data into the form ATM needs and save it to disk - meta = MetaData(self.run_conf.class_column, train_local, test_local) + meta = MetaData(dataset_conf.class_column, train_local, test_local) # enter dataset into database dataset = self.db.create_dataset(name=name, - description=self.run_conf.data_description, - train_path=self.run_conf.train_path, - test_path=self.run_conf.test_path, - class_column=self.run_conf.class_column, + description=dataset_conf.data_description, + train_path=dataset_conf.train_path, + test_path=dataset_conf.test_path, + class_column=dataset_conf.class_column, n_examples=meta.n_examples, k_classes=meta.k_classes, d_features=meta.d_features, majority=meta.majority, - size_kb=old_div(meta.size, 1000)) + size_kb=meta.size) return dataset - def create_datarun(self, dataset): + def create_datarun(self, dataset, run_conf): """ Given a config, creates a set of dataruns for the config and enters them into the database. Returns the ID of the created datarun. @@ -139,52 +142,44 @@ def create_datarun(self, dataset): dataset: Dataset SQLAlchemy ORM object """ # describe the datarun by its tuner and selector - run_description = '__'.join([self.run_conf.tuner, self.run_conf.selector]) + run_description = '__'.join([run_conf.tuner, run_conf.selector]) # set the deadline, if applicable - deadline = self.run_conf.deadline + deadline = run_conf.deadline if deadline: deadline = datetime.strptime(deadline, TIME_FMT) # this overrides the otherwise configured budget_type # TODO: why not walltime and classifiers budget simultaneously? 
- self.run_conf.budget_type = 'walltime' - elif self.run_conf.budget_type == 'walltime': - deadline = datetime.now() + timedelta(minutes=self.run_conf.budget) + run_conf.budget_type = 'walltime' + elif run_conf.budget_type == 'walltime': + deadline = datetime.now() + timedelta(minutes=run_conf.budget) - target = self.run_conf.score_target + '_judgment_metric' + target = run_conf.score_target + '_judgment_metric' datarun = self.db.create_datarun(dataset_id=dataset.id, description=run_description, - tuner=self.run_conf.tuner, - selector=self.run_conf.selector, - gridding=self.run_conf.gridding, - priority=self.run_conf.priority, - budget_type=self.run_conf.budget_type, - budget=self.run_conf.budget, + tuner=run_conf.tuner, + selector=run_conf.selector, + gridding=run_conf.gridding, + priority=run_conf.priority, + budget_type=run_conf.budget_type, + budget=run_conf.budget, deadline=deadline, - metric=self.run_conf.metric, + metric=run_conf.metric, score_target=target, - k_window=self.run_conf.k_window, - r_minimum=self.run_conf.r_minimum) + k_window=run_conf.k_window, + r_minimum=run_conf.r_minimum) return datarun - def enter_data(self, run_per_partition=False): + def create_dataruns(self, run_conf, run_per_partition=False): """ Generate a datarun, including a dataset if necessary. Returns: ID of the generated datarun """ - # connect to the database - - # if the user has provided a dataset id, use that. Otherwise, create a new - # dataset based on the arguments we were passed. - if self.run_conf.dataset_id is None: - dataset = self.create_dataset() - self.run_conf.dataset_id = dataset.id - else: - dataset = self.db.get_dataset(self.run_conf.dataset_id) + dataset = self.db.get_dataset(run_conf.dataset_id) method_parts = {} - for m in self.run_conf.methods: + for m in run_conf.methods: # enumerate all combinations of categorical variables for this method method = Method(m) method_parts[m] = method.get_hyperpartitions() @@ -192,10 +187,11 @@ def enter_data(self, run_per_partition=False): (m, len(method_parts[m]))) # create hyperpartitions and datarun(s) - run_ids = [] + dataruns = [] if not run_per_partition: LOGGER.debug('saving datarun...') - datarun = self.create_datarun(dataset) + datarun = self.create_datarun(dataset, run_conf) + dataruns.append(datarun) LOGGER.debug('saving hyperpartions...') for method, parts in list(method_parts.items()): @@ -203,8 +199,8 @@ def enter_data(self, run_per_partition=False): # if necessary, create a new datarun for each hyperpartition. # This setting is useful for debugging. if run_per_partition: - datarun = self.create_datarun(dataset) - run_ids.append(datarun.id) + datarun = self.create_datarun(dataset, run_conf) + dataruns.append(datarun) # create a new hyperpartition in the database self.db.create_hyperpartition(datarun_id=datarun.id, @@ -214,13 +210,30 @@ def enter_data(self, run_per_partition=False): categoricals=part.categoricals, status=PartitionStatus.INCOMPLETE) + return dataruns + + def enter_data(self, dataset_conf, run_conf, run_per_partition=False): + """ + Generate a datarun, including a dataset if necessary. + + Returns: ID of the generated datarun + """ + # if the user has provided a dataset id, use that. Otherwise, create a new + # dataset based on the arguments we were passed. + if run_conf.dataset_id is None: + dataset = self.create_dataset(dataset_conf) + run_conf.dataset_id = dataset.id + + dataruns = self.create_dataruns(run_conf, run_per_partition) + LOGGER.info('Data entry complete. 
Summary:') LOGGER.info('\tDataset ID: %d', dataset.id) LOGGER.info('\tTraining data: %s', dataset.train_path) LOGGER.info('\tTest data: %s', (dataset.test_path or 'None')) + datarun = dataruns[0] if run_per_partition: - LOGGER.info('\tDatarun IDs: %s', ', '.join(map(str, run_ids))) + LOGGER.info('\tDatarun IDs: %s', ', '.join(str(datarun.id) for datarun in dataruns)) else: LOGGER.info('\tDatarun ID: %d', datarun.id) @@ -228,5 +241,3 @@ def enter_data(self, run_per_partition=False): LOGGER.info('\tHyperpartition selection strategy: %s', datarun.selector) LOGGER.info('\tParameter tuning strategy: %s', datarun.tuner) LOGGER.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type) - - return run_ids or datarun.id diff --git a/atm/encoder.py b/atm/encoder.py index 2e4b714..b4ec13a 100644 --- a/atm/encoder.py +++ b/atm/encoder.py @@ -4,7 +4,6 @@ import numpy as np import pandas as pd -from past.utils import old_div from sklearn.preprocessing import LabelEncoder, OneHotEncoder @@ -26,13 +25,14 @@ def __init__(self, class_column, train_path, test_path=None): for c in data.columns: if data[c].dtype == 'object': total_features += len(np.unique(data[c])) - 1 - majority_percentage = old_div(float(max(counts)), float(sum(counts))) + + majority_percentage = float(max(counts)) / float(sum(counts)) self.n_examples = data.shape[0] self.d_features = total_features self.k_classes = len(np.unique(data[class_column])) self.majority = majority_percentage - self.size = np.array(data).nbytes + self.size = int(np.array(data).nbytes / 1000) class DataEncoder(object): diff --git a/atm/metrics.py b/atm/metrics.py index e06e0da..039587e 100644 --- a/atm/metrics.py +++ b/atm/metrics.py @@ -4,7 +4,6 @@ import numpy as np import pandas as pd -from past.utils import old_div from sklearn.metrics import ( accuracy_score, average_precision_score, cohen_kappa_score, f1_score, matthews_corrcoef, precision_recall_curve, roc_auc_score, roc_curve) @@ -37,7 +36,7 @@ def rank_n_accuracy(y_true, y_prob_mat, n=0.33): if y_true[i] in rankings[i, :]: correct_sample_count += 1 - return old_div(correct_sample_count, num_samples) + return int(correct_sample_count / num_samples) def get_per_class_matrix(y, classes=None): diff --git a/atm/worker.py b/atm/worker.py index bd44003..36bc971 100644 --- a/atm/worker.py +++ b/atm/worker.py @@ -288,6 +288,7 @@ def save_classifier_cloud(self, local_model_path, local_metric_path): local_model_path: path to serialized model in the local file system local_metric_path: path to serialized metrics in the local file system """ + # TODO: This does not work conn = S3Connection(self.aws_config.access_key, self.aws_config.secret_key) bucket = conn.get_bucket(self.aws_config.s3_bucket) From 3caf3753a3bc28ab39883a34379564ec5e2585b3 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 12:22:43 +0200 Subject: [PATCH 30/44] Cleanup and improve config management --- atm/api/__init__.py | 6 +- atm/cli.py | 38 +--- atm/config.py | 422 ++++++++++++++++++++++---------------------- atm/core.py | 44 ++--- 4 files changed, 244 insertions(+), 266 deletions(-) diff --git a/atm/api/__init__.py b/atm/api/__init__.py index 581ac64..3380741 100644 --- a/atm/api/__init__.py +++ b/atm/api/__init__.py @@ -15,9 +15,10 @@ def make_absolute(url): return url -def create_app(atm): +def create_app(atm, debug=False): db = atm.db app = Flask(__name__) + app.config['DEBUG'] = debug app.config['SQLALCHEMY_DATABASE_URI'] = make_absolute(db.engine.url) # Create the Flask-Restless API manager. 
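The debug flag added to create_app here is wired up to the new --debug option of the "atm server" command further down in this patch. A minimal sketch of serving the REST API directly from Python, with placeholder host and port values and with None standing in for the optional AWS and logging configs, as in the updated tests:

    from atm.api import create_app
    from atm.config import SQLConfig
    from atm.core import ATM

    # build an ATM instance from a plain-dict SQL config (database path is a placeholder)
    atm = ATM(SQLConfig({'sql_database': '/tmp/atm.db'}), None, None)

    app = create_app(atm, debug=True)
    app.run(host='127.0.0.1', port=5000)   # placeholder host and port
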
@@ -29,10 +30,9 @@ def atm_run(): abort(400) data = request.json - run_per_partition = data.get('run_per_partition', False) run_conf = RunConfig(data) - dataruns = atm.create_dataruns(run_conf, run_per_partition) + dataruns = atm.create_dataruns(run_conf) response = { 'status': 'OK', diff --git a/atm/cli.py b/atm/cli.py index 1df9ee4..69735e5 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -13,28 +13,13 @@ from lockfile.pidlockfile import PIDLockFile from atm.api import create_app -# from atm.config import ( -# add_arguments_aws_s3, add_arguments_datarun, add_arguments_logging, add_arguments_sql, -# load_config) from atm.config import AWSConfig, DatasetConfig, LogConfig, RunConfig, SQLConfig from atm.core import ATM LOGGER = logging.getLogger(__name__) -# def _get_db(args): -# """Returns an instance of Database with the given args.""" -# db_args = { -# k[4:]: v -# for k, v in vars(args).items() -# if k.startswith('sql_') and v is not None -# } -# return Database(**db_args) - - def _get_atm(args): - # db = _get_db(args) - # run_conf, aws_conf, log_conf = load_config(**vars(args)) sql_conf = SQLConfig(args) aws_conf = AWSConfig(args) log_conf = LogConfig(args) @@ -57,9 +42,8 @@ def _work(args, wait=False): def _serve(args): """Launch the ATM API with the given host / port.""" - # db = _get_db(args) atm = _get_atm(args) - app = create_app(atm) + app = create_app(atm, getattr(args, 'debug', False)) app.run(host=args.host, port=args.port) @@ -210,7 +194,7 @@ def _enter_data(args): atm = _get_atm(args) run_conf = RunConfig(args) dataset_conf = DatasetConfig(args) - atm.enter_data(dataset_conf, run_conf, args.run_per_partition) + atm.enter_data(dataset_conf, run_conf) def _make_config(args): @@ -226,13 +210,6 @@ def _make_config(args): shutil.copy(template, target_file) -# load other functions from config.py -# def _add_common_arguments(parser): -# add_arguments_sql(parser) -# add_arguments_aws_s3(parser) -# add_arguments_logging(parser) - - def _get_parser(): logging_args = argparse.ArgumentParser(add_help=False) logging_args.add_argument('-v', '--verbose', action='count', default=0) @@ -261,17 +238,12 @@ def _get_parser(): ] enter_data = subparsers.add_parser('enter_data', parents=enter_data_parents) enter_data.set_defaults(action=_enter_data) - # _add_common_arguments(enter_data) - # add_arguments_datarun(enter_data) - enter_data.add_argument('--run-per-partition', default=False, action='store_true', - help='if set, generate a new datarun for each hyperpartition') # Wroker Args worker_args = argparse.ArgumentParser(add_help=False) worker_args.add_argument('--cloud-mode', action='store_true', default=False, help='Whether to run this worker in cloud mode') - worker_args.add_argument('--no-save', dest='save_files', default=True, - action='store_const', const=False, + worker_args.add_argument('--no-save', dest='save_files', action='store_false', help="don't save models and metrics at all") # Worker @@ -284,7 +256,6 @@ def _get_parser(): ] worker = subparsers.add_parser('worker', parents=worker_parents) worker.set_defaults(action=_work) - # _add_common_arguments(worker) worker.add_argument('--dataruns', help='Only train on dataruns with these ids', nargs='+') worker.add_argument('--total-time', help='Number of seconds to run worker', type=int) @@ -296,6 +267,7 @@ def _get_parser(): # Server server = subparsers.add_parser('server', parents=[logging_args, server_args, sql_args]) server.set_defaults(action=_serve) + server.add_argument('--debug', help='Start in debug mode', action='store_true') # 
add_arguments_sql(server) # Background Args @@ -322,7 +294,6 @@ def _get_parser(): ] start = subparsers.add_parser('start', parents=start_parents) start.set_defaults(action=_start) - # _add_common_arguments(start) # Status status = subparsers.add_parser('status', parents=[logging_args, background_args]) @@ -342,7 +313,6 @@ def _get_parser(): # restart restart = subparsers.add_parser('restart', parents=start_parents + [stop_args]) restart.set_defaults(action=_restart) - # _add_common_arguments(restart) # Make Config make_config = subparsers.add_parser('make_config', parents=[logging_args]) diff --git a/atm/config.py b/atm/config.py index b36380c..c125932 100644 --- a/atm/config.py +++ b/atm/config.py @@ -40,7 +40,9 @@ def _add_prefix(cls, name): def _get_arg(cls, args, name): arg_name = cls._add_prefix(name) class_value = getattr(cls, name) - if isinstance(class_value, tuple): + if isinstance(class_value, dict): + default = class_value.get('default') + elif isinstance(class_value, tuple): default = class_value[1] else: default = None @@ -67,6 +69,10 @@ def __init__(self, args, path=None): def get_parser(cls): parser = argparse.ArgumentParser(add_help=False) + # make sure the text for these arguments is formatted correctly + # this allows newlines in the help strings + parser.formatter_class = argparse.RawTextHelpFormatter + if cls._PREFIX: parser.add_argument('--{}-config'.format(cls._PREFIX), help='path to yaml {} config file'.format(cls._PREFIX)) @@ -75,20 +81,15 @@ def get_parser(cls): if not name.startswith('_') and not callable(description): arg_name = '--' + cls._add_prefix(name).replace('_', '-') - if isinstance(description, tuple): - if len(description) == 3: - description, default, choices = description - parser.add_argument(arg_name, help=description, - default=default, choices=choices) - else: - description, default = description - if default is False: - parser.add_argument(arg_name, help=description, - action='store_true') - - else: - parser.add_argument(arg_name, help=description, - default=default) + if isinstance(description, dict): + parser.add_argument(arg_name, **description) + + elif isinstance(description, tuple): + description, default = description + parser.add_argument(arg_name, help=description, default=default) + + else: + parser.add_argument(arg_name, help=description) return parser @@ -127,7 +128,11 @@ class SQLConfig(Config): """ Stores configuration for SQL database setup & connection """ _PREFIX = 'sql' - dialect = ('Dialect of SQL to use', 'sqlite', SQL_DIALECTS) + dialect = { + 'help': 'Dialect of SQL to use', + 'default': 'sqlite', + 'choices': SQL_DIALECTS + } database = ('Name of, or path to, SQL database', 'atm.db') username = 'Username for SQL database' password = 'Password for SQL database' @@ -137,16 +142,16 @@ class SQLConfig(Config): class LogConfig(Config): - # log_level_stdout = ('minimum log level to write to stdout', 'ERROR') - # log_level_file =('minimum log level to write to the log file', 'INFO') - # log_dir = ('Directory where logs will be saved', 'logs') model_dir = ('Directory where computed models will be saved', 'models') metric_dir = ('Directory where model metrics will be saved', 'metrics') - verbose_metrics = ( - 'If set, compute full ROC and PR curves and ' - 'per-label metrics for each classifier', - False - ) + verbose_metrics = { + 'help': ( + 'If set, compute full ROC and PR curves and ' + 'per-label metrics for each classifier' + ), + 'action': 'store_true', + 'default': False + } def option_or_path(options, 
regex=CUSTOM_CLASS_REGEX): @@ -162,193 +167,196 @@ def type_check(s): return s # if both of those fail, there's something wrong - raise argparse.ArgumentTypeError('%s is not a valid option or path!' % s) + raise argparse.ArgumentTypeError('{} is not a valid option or path!'.format(s)) return type_check class RunConfig(Config): - """ Stores configuration for Dataset and Datarun setup """ + """Stores configuration for Dataset and Datarun setup.""" _CONFIG = 'run' - # dataset config - # train_path = None - # test_path = None - # data_description = None - # class_column = None - - # datarun config - dataset_id = None - methods = None - priority = None - budget_type = None - budget = None - deadline = None - tuner = None - r_minimum = None - gridding = None - selector = None - k_window = None - metric = None - score_target = None - - @classmethod - def get_parser(cls): - parser = argparse.ArgumentParser(add_help=False) - - # make sure the text for these arguments is formatted correctly - # this allows newlines in the help strings - parser.formatter_class = argparse.RawTextHelpFormatter - - # Config file - parser.add_argument('--run-config', help='path to yaml datarun config file') - - # Dataset Arguments ##################################################### - # ########################################################################## - parser.add_argument('--dataset-id', type=int, - help="ID of dataset, if it's already in the database") - - # These are only relevant if dataset_id is not provided - # parser.add_argument('--train-path', help='Path to raw training data', - # default=os.path.join(DATA_TEST_PATH, 'pollution_1.csv')) - # parser.add_argument('--test-path', help='Path to raw test data (if applicable)') - # parser.add_argument('--data-description', help='Description of dataset') - # parser.add_argument('--class-column', default='class', - # help='Name of the class column in the input data') - - # Datarun Arguments ##################################################### - # ########################################################################## - # Notes: - # - Support vector machines (svm) can take a long time to train. It's not an - # error, it's just part of what happens when the method happens to explore - # a crappy set of parameters on a powerful algo like this. - # - Stochastic gradient descent (sgd) can sometimes fail on certain - # parameter settings as well. Don't worry, they train SUPER fast, and the - # worker.py will simply log the error and continue. - # - # Method options: - # logreg - logistic regression - # svm - support vector machine - # sgd - linear classifier with stochastic gradient descent - # dt - decision tree - # et - extra trees - # rf - random forest - # gnb - gaussian naive bayes - # mnb - multinomial naive bayes - # bnb - bernoulli naive bayes - # gp - gaussian process - # pa - passive aggressive - # knn - K nearest neighbors - # mlp - multi-layer perceptron - parser.add_argument('--methods', nargs='+', - type=option_or_path(METHODS, JSON_REGEX), - default=['logreg', 'dt', 'knn'], - help='Method or list of methods to use for ' - 'classification. Each method can either be one of the ' - 'pre-defined method codes listed below or a path to a ' - 'JSON file defining a custom method.' 
- '\n\nOptions: [%s]' % ', '.join(str(s) for s in METHODS)) - parser.add_argument('--priority', type=int, default=1, - help='Priority of the datarun (higher = more important') - parser.add_argument('--budget-type', choices=BUDGET_TYPES, default='classifier', - help='Type of budget to use') - parser.add_argument('--budget', type=int, default=100, - help='Value of the budget, either in classifiers or minutes') - parser.add_argument('--deadline', - help='Deadline for datarun completion. If provided, this ' - 'overrides the configured walltime budget.\nFormat: {}'.format( - TIME_FMT.replace('%', '%%'))) - - # Which field to use to judge performance, for the sake of AutoML - # options: - # f1 - F1 score (harmonic mean of precision and recall) - # roc_auc - area under the Receiver Operating Characteristic curve - # accuracy - percent correct - # cohen_kappa - measures accuracy, but controls for chance of guessing - # correctly - # rank_accuracy - multiclass only: percent of examples for which the true - # label is in the top 1/3 most likely predicted labels - # ap - average precision: nearly identical to area under - # precision/recall curve. - # mcc - matthews correlation coefficient: good for unbalanced classes - # - # f1 and roc_auc may be appended with _micro or _macro to use with - # multiclass problems. - parser.add_argument('--metric', choices=METRICS, default='f1', - help='Metric by which ATM should evaluate classifiers. ' - 'The metric function specified here will be used to ' - 'compute the "judgment metric" for each classifier.') - - # Which data to use for computing judgment score - # cv - cross-validated performance on training data - # test - performance on test data - # mu_sigma - lower confidence bound on cv score - parser.add_argument('--score-target', choices=SCORE_TARGETS, default='cv', - help='Determines which judgment metric will be used to ' - 'search the hyperparameter space. "cv" will use the mean ' - 'cross-validated performance, "test" will use the ' - 'performance on a test dataset, and "mu_sigma" will use ' - 'the lower confidence bound on the CV performance.') - - # AutoML Arguments ###################################################### - # ########################################################################## - # hyperparameter selection strategy - # How should ATM sample hyperparameters from a given hyperpartition? - # uniform - pick randomly! (baseline) - # gp - vanilla Gaussian Process - # gp_ei - Gaussian Process expected improvement criterion - # gp_eivel - Gaussian Process expected improvement, with randomness added - # in based on velocity of improvement - # path to custom tuner, defined in python - parser.add_argument('--tuner', type=option_or_path(TUNERS), default='uniform', - help='Type of BTB tuner to use. Can either be one of ' - 'the pre-configured tuners listed below or a path to a ' - 'custom tuner in the form "/path/to/tuner.py:ClassName".' - '\n\nOptions: [%s]' % ', '.join(str(s) for s in TUNERS)) - - # How should ATM select a particular hyperpartition from the set of all - # possible hyperpartitions? 
- # Options: - # uniform - pick randomly - # ucb1 - UCB1 multi-armed bandit - # bestk - MAB using only the best K runs in each hyperpartition - # bestkvel - MAB with velocity of best K runs - # purebestkvel - always return hyperpartition with highest velocity - # recentk - MAB with most recent K runs - # recentkvel - MAB with velocity of most recent K runs - # hieralg - hierarchical MAB: choose a classifier first, then choose - # a partition - # path to custom selector, defined in python - parser.add_argument('--selector', type=option_or_path(SELECTORS), default='uniform', - help='Type of BTB selector to use. Can either be one of ' - 'the pre-configured selectors listed below or a path to a ' - 'custom tuner in the form "/path/to/selector.py:ClassName".' - '\n\nOptions: [%s]' % ', '.join(str(s) for s in SELECTORS)) - - # r_minimum is the number of random runs performed in each hyperpartition before - # allowing bayesian opt to select parameters. Consult the thesis to - # understand what those mean, but essentially: - # - # if (num_classifiers_trained_in_hyperpartition >= r_minimum) - # # train using sample criteria - # else - # # train using uniform (baseline) - parser.add_argument('--r-minimum', type=int, default=2, - help='number of random runs to perform before tuning can occur') - - # k is number that xxx-k methods use. It is similar to r_minimum, except it is - # called k_window and determines how much "history" ATM considers for certain - # partition selection logics. - parser.add_argument('--k-window', type=int, default=3, - help='number of previous scores considered by -k selector methods') - - # gridding determines whether or not sample selection will happen on a grid. - # If any positive integer, a grid with `gridding` points on each axis is - # established, and hyperparameter vectors are sampled from this finite - # space. If 0 (or blank), hyperparameters are sampled from continuous - # space, and there is no limit to the number of hyperparameter vectors that - # may be tried. - parser.add_argument('--gridding', type=int, default=0, - help='gridding factor (0: no gridding)') - - return parser + dataset_id = { + 'help': 'ID of dataset, if it is already in the database', + 'type': int + } + + run_per_partition = { + 'help': 'if true, generate a new datarun for each hyperpartition', + 'default': False, + 'action': 'store_true', + } + + # Method options: + # logreg - logistic regression + # svm - support vector machine + # sgd - linear classifier with stochastic gradient descent + # dt - decision tree + # et - extra trees + # rf - random forest + # gnb - gaussian naive bayes + # mnb - multinomial naive bayes + # bnb - bernoulli naive bayes + # gp - gaussian process + # pa - passive aggressive + # knn - K nearest neighbors + # mlp - multi-layer perceptron + # + # Notes: + # - Support vector machines (svm) can take a long time to train. It's not an + # error, it's just part of what happens when the method happens to explore + # a crappy set of parameters on a powerful algo like this. + # - Stochastic gradient descent (sgd) can sometimes fail on certain + # parameter settings as well. Don't worry, they train SUPER fast, and the + # worker.py will simply log the error and continue. + methods = { + 'help': ( + 'Method or list of methods to use for ' + 'classification. 
Each method can either be one of the ' + 'pre-defined method codes listed below or a path to a ' + 'JSON file defining a custom method.\n\nOptions: [{}]' + ).format(', '.join(str(s) for s in METHODS)), + 'default': ['logreg', 'dt', 'knn'], + 'type': option_or_path(METHODS, JSON_REGEX), + 'nargs': '+' + } + + priority = { + 'help': 'Priority of the datarun (higher = more important', + 'default': 1, + 'type': int + } + budget_type = { + 'help': 'Type of budget to use', + 'default': 'classifier', + 'choices': BUDGET_TYPES, + } + budget = { + 'help': 'Value of the budget, either in classifiers or minutes', + 'default': 100, + 'type': int, + } + deadline = ( + 'Deadline for datarun completion. If provided, this ' + 'overrides the configured walltime budget.\nFormat: {}' + ).format(TIME_FMT.replace('%', '%%')) + + # Which field to use to judge performance, for the sake of AutoML + # options: + # f1 - F1 score (harmonic mean of precision and recall) + # roc_auc - area under the Receiver Operating Characteristic curve + # accuracy - percent correct + # cohen_kappa - measures accuracy, but controls for chance of guessing + # correctly + # rank_accuracy - multiclass only: percent of examples for which the true + # label is in the top 1/3 most likely predicted labels + # ap - average precision: nearly identical to area under + # precision/recall curve. + # mcc - matthews correlation coefficient: good for unbalanced classes + # + # f1 and roc_auc may be appended with _micro or _macro to use with + # multiclass problems. + metric = { + 'help': ( + 'Metric by which ATM should evaluate classifiers. ' + 'The metric function specified here will be used to ' + 'compute the "judgment metric" for each classifier.' + ), + 'default': 'f1', + 'choices': METRICS, + } + + # Which data to use for computing judgment score + # cv - cross-validated performance on training data + # test - performance on test data + # mu_sigma - lower confidence bound on cv score + score_target = { + 'help': ( + 'Determines which judgment metric will be used to ' + 'search the hyperparameter space. "cv" will use the mean ' + 'cross-validated performance, "test" will use the ' + 'performance on a test dataset, and "mu_sigma" will use ' + 'the lower confidence bound on the CV performance.' + ), + 'default': 'cv', + 'choices': SCORE_TARGETS + } + + # AutoML Arguments ###################################################### + # ########################################################################## + + # hyperparameter selection strategy + # How should ATM sample hyperparameters from a given hyperpartition? + # uniform - pick randomly! (baseline) + # gp - vanilla Gaussian Process + # gp_ei - Gaussian Process expected improvement criterion + # gp_eivel - Gaussian Process expected improvement, with randomness added + # in based on velocity of improvement + # path to custom tuner, defined in python + tuner = { + 'help': ( + 'Type of BTB tuner to use. Can either be one of the pre-configured ' + 'tuners listed below or a path to a custom tuner in the form ' + '"/path/to/tuner.py:ClassName".\n\nOptions: [{}]' + ).format(', '.join(str(s) for s in TUNERS)), + 'default': 'uniform', + 'type': option_or_path(TUNERS) + } + + # How should ATM select a particular hyperpartition from the set of all + # possible hyperpartitions? 
+ # Options: + # uniform - pick randomly + # ucb1 - UCB1 multi-armed bandit + # bestk - MAB using only the best K runs in each hyperpartition + # bestkvel - MAB with velocity of best K runs + # purebestkvel - always return hyperpartition with highest velocity + # recentk - MAB with most recent K runs + # recentkvel - MAB with velocity of most recent K runs + # hieralg - hierarchical MAB: choose a classifier first, then choose + # a partition + # path to custom selector, defined in python + selector = { + 'help': ( + 'Type of BTB selector to use. Can either be one of the pre-configured ' + 'selectors listed below or a path to a custom tuner in the form ' + '"/path/to/selector.py:ClassName".\n\nOptions: [{}]' + ).format(', '.join(str(s) for s in SELECTORS)), + 'default': 'uniform', + 'type': option_or_path(SELECTORS) + } + + # r_minimum is the number of random runs performed in each hyperpartition before + # allowing bayesian opt to select parameters. Consult the thesis to + # understand what those mean, but essentially: + # + # if (num_classifiers_trained_in_hyperpartition >= r_minimum) + # # train using sample criteria + # else + # # train using uniform (baseline) + r_minimum = { + 'help': 'number of random runs to perform before tuning can occur', + 'default': 2, + 'type': int + } + + # k is number that xxx-k methods use. It is similar to r_minimum, except it is + # called k_window and determines how much "history" ATM considers for certain + # partition selection logics. + k_window = { + 'help': 'number of previous scores considered by -k selector methods', + 'default': 3, + 'type': int + } + + # gridding determines whether or not sample selection will happen on a grid. + # If any positive integer, a grid with `gridding` points on each axis is + # established, and hyperparameter vectors are sampled from this finite + # space. If 0 (or blank), hyperparameters are sampled from continuous + # space, and there is no limit to the number of hyperparameter vectors that + # may be tried. + gridding = { + 'help': 'gridding factor (0: no gridding)', + 'default': 0, + 'type': int + } diff --git a/atm/core.py b/atm/core.py index 97059c9..c2834bf 100644 --- a/atm/core.py +++ b/atm/core.py @@ -12,7 +12,7 @@ import os import random import time -from builtins import map, object +from builtins import object from datetime import datetime, timedelta from operator import attrgetter @@ -170,7 +170,7 @@ def create_datarun(self, dataset, run_conf): r_minimum=run_conf.r_minimum) return datarun - def create_dataruns(self, run_conf, run_per_partition=False): + def create_dataruns(self, run_conf): """ Generate a datarun, including a dataset if necessary. @@ -188,7 +188,7 @@ def create_dataruns(self, run_conf, run_per_partition=False): # create hyperpartitions and datarun(s) dataruns = [] - if not run_per_partition: + if not run_conf.run_per_partition: LOGGER.debug('saving datarun...') datarun = self.create_datarun(dataset, run_conf) dataruns.append(datarun) @@ -198,7 +198,7 @@ def create_dataruns(self, run_conf, run_per_partition=False): for part in parts: # if necessary, create a new datarun for each hyperpartition. # This setting is useful for debugging. 
- if run_per_partition: + if run_conf.run_per_partition: datarun = self.create_datarun(dataset, run_conf) dataruns.append(datarun) @@ -210,29 +210,13 @@ def create_dataruns(self, run_conf, run_per_partition=False): categoricals=part.categoricals, status=PartitionStatus.INCOMPLETE) - return dataruns - - def enter_data(self, dataset_conf, run_conf, run_per_partition=False): - """ - Generate a datarun, including a dataset if necessary. - - Returns: ID of the generated datarun - """ - # if the user has provided a dataset id, use that. Otherwise, create a new - # dataset based on the arguments we were passed. - if run_conf.dataset_id is None: - dataset = self.create_dataset(dataset_conf) - run_conf.dataset_id = dataset.id - - dataruns = self.create_dataruns(run_conf, run_per_partition) - - LOGGER.info('Data entry complete. Summary:') + LOGGER.info('Dataruns created. Summary:') LOGGER.info('\tDataset ID: %d', dataset.id) LOGGER.info('\tTraining data: %s', dataset.train_path) LOGGER.info('\tTest data: %s', (dataset.test_path or 'None')) datarun = dataruns[0] - if run_per_partition: + if run_conf.run_per_partition: LOGGER.info('\tDatarun IDs: %s', ', '.join(str(datarun.id) for datarun in dataruns)) else: @@ -241,3 +225,19 @@ def enter_data(self, dataset_conf, run_conf, run_per_partition=False): LOGGER.info('\tHyperpartition selection strategy: %s', datarun.selector) LOGGER.info('\tParameter tuning strategy: %s', datarun.tuner) LOGGER.info('\tBudget: %d (%s)', datarun.budget, datarun.budget_type) + + return dataruns + + def enter_data(self, dataset_conf, run_conf): + """ + Generate a datarun, including a dataset if necessary. + + Returns: ID of the generated datarun + """ + # if the user has provided a dataset id, use that. Otherwise, create a new + # dataset based on the arguments we were passed. + if run_conf.dataset_id is None: + dataset = self.create_dataset(dataset_conf) + run_conf.dataset_id = dataset.id + + self.create_dataruns(run_conf) From a1758bc71ecd7b7eaafa2883beda6da0755bace8 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 2 May 2019 18:08:28 +0200 Subject: [PATCH 31/44] Updated unittests and added return messaage on 400 error at api preprocessing. 
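Together with the config cleanup in the previous commit, ATM is now driven from plain-dict configuration objects, and enter_data() returns the created datarun(s). A minimal sketch of the resulting flow, mirroring the updated tests below; the paths and method list are placeholders, the None arguments stand in for the optional AWS and logging configs, and the wait keyword of work() is assumed from the body shown earlier:

    from atm.config import DatasetConfig, RunConfig, SQLConfig
    from atm.core import ATM

    sql_conf = SQLConfig({'sql_database': '/tmp/atm.db'})     # placeholder database path
    dataset_conf = DatasetConfig({
        'train_path': 'path/to/train.csv',                    # placeholder training data
        'class_column': 'class',
    })
    run_conf = RunConfig({'methods': ['logreg', 'dt']})

    atm = ATM(sql_conf, None, None)                    # aws_conf and log_conf passed as None, as in the tests
    datarun = atm.enter_data(dataset_conf, run_conf)   # creates the dataset and a datarun
    atm.work(datarun_ids=[datarun.id], save_files=False, wait=False)

With run_per_partition left at its default of False, enter_data() returns a single datarun object, which is what the reworked test_core.py below asserts.
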
--- atm/api/preprocessors.py | 28 ++++++----- atm/core.py | 3 +- atm/worker.py | 2 +- tests/{test_models.py => test_core.py} | 64 +++++++++++++++----------- tests/test_worker.py | 23 +++++---- 5 files changed, 69 insertions(+), 51 deletions(-) rename tests/{test_models.py => test_core.py} (66%) diff --git a/atm/api/preprocessors.py b/atm/api/preprocessors.py index ccde6e1..095cd55 100644 --- a/atm/api/preprocessors.py +++ b/atm/api/preprocessors.py @@ -8,20 +8,24 @@ def dataset_post(data): """Preprocess the Dataset POST data.""" if all(key in data for key in DATASET_KEYS): - meta = MetaData( - data['class_column'], - data['train_path'], - data.get('test_path') - ) - - data['n_examples'] = meta.n_examples - data['k_classes'] = meta.k_classes - data['d_features'] = meta.d_features - data['majority'] = meta.majority - data['size_kb'] = meta.size + try: + meta = MetaData( + data['class_column'], + data['train_path'], + data.get('test_path') + ) + + data['n_examples'] = meta.n_examples + data['k_classes'] = meta.k_classes + data['d_features'] = meta.d_features + data['majority'] = meta.majority + data['size_kb'] = meta.size + + except FileNotFoundError: + abort(400, 'The train_path does not exists in the server.') else: - abort(400) + abort(400, 'There is a missing field from the requiered ones') DATASET_PREPROCESSORS = { diff --git a/atm/core.py b/atm/core.py index c2834bf..3575dac 100644 --- a/atm/core.py +++ b/atm/core.py @@ -240,4 +240,5 @@ def enter_data(self, dataset_conf, run_conf): dataset = self.create_dataset(dataset_conf) run_conf.dataset_id = dataset.id - self.create_dataruns(run_conf) + dataruns = self.create_dataruns(run_conf) + return dataruns[0] if not run_conf.run_per_partition else dataruns diff --git a/atm/worker.py b/atm/worker.py index 36bc971..4df3c14 100644 --- a/atm/worker.py +++ b/atm/worker.py @@ -55,7 +55,7 @@ def __init__(self, database, datarun, save_files=True, cloud_mode=False, self.aws_config = aws_config self.public_ip = public_ip - log_config = log_config or LogConfig() + log_config = log_config or LogConfig({}) self.model_dir = log_config.model_dir self.metric_dir = log_config.metric_dir self.verbose_metrics = log_config.verbose_metrics diff --git a/tests/test_models.py b/tests/test_core.py similarity index 66% rename from tests/test_models.py rename to tests/test_core.py index da988a0..69a83c1 100644 --- a/tests/test_models.py +++ b/tests/test_core.py @@ -3,9 +3,9 @@ import pytest from atm import PROJECT_ROOT -from atm.config import RunConfig, SQLConfig +from atm.config import DatasetConfig, RunConfig, SQLConfig +from atm.core import ATM from atm.database import Database, db_session -from atm.models import ATM from atm.utilities import get_local_data_path DB_PATH = '/tmp/atm.db' @@ -52,6 +52,8 @@ def test_create_dataset(db): train_url = DATA_URL + 'pollution_1_train.csv' test_url = DATA_URL + 'pollution_1_test.csv' + sql_conf = SQLConfig({'sql_database': DB_PATH}) + train_path_local, _ = get_local_data_path(train_url) if os.path.exists(train_path_local): os.remove(train_path_local) @@ -60,14 +62,16 @@ def test_create_dataset(db): if os.path.exists(test_path_local): os.remove(test_path_local) - run_conf = RunConfig(train_path=train_url, - test_path=test_url, - data_description='test', - class_column='class') + dataset_conf = DatasetConfig({ + 'train_path': train_url, + 'test_path': test_url, + 'data_description': 'test', + 'class_column': 'class' + }) - atm = ATM(db, run_conf, None, None) + atm = ATM(sql_conf, None, None) - dataset = atm.create_dataset() + 
dataset = atm.create_dataset(dataset_conf) dataset = db.get_dataset(dataset.id) assert os.path.exists(train_path_local) @@ -84,53 +88,57 @@ def test_create_dataset(db): def test_enter_data_by_methods(dataset): - sql_conf = SQLConfig(database=DB_PATH) - db = Database(**vars(sql_conf)) - run_conf = RunConfig(dataset_id=dataset.id) + sql_conf = SQLConfig({'sql_database': DB_PATH}) + db = Database(**sql_conf.to_dict()) + run_conf = RunConfig({'dataset_id': dataset.id}) - atm = ATM(db, run_conf, None, None) + atm = ATM(sql_conf, None, None) for method, n_parts in METHOD_HYPERPARTS.items(): run_conf.methods = [method] - run_id = atm.enter_data() + run_id = atm.enter_data(None, run_conf) - assert db.get_datarun(run_id) with db_session(db): - run = db.get_datarun(run_id) + run = db.get_datarun(run_id.id) assert run.dataset.id == dataset.id assert len(run.hyperpartitions) == n_parts def test_enter_data_all(dataset): - sql_conf = SQLConfig(database=DB_PATH) - db = Database(**vars(sql_conf)) - run_conf = RunConfig(dataset_id=dataset.id, - methods=METHOD_HYPERPARTS.keys()) + sql_conf = SQLConfig({'sql_database': DB_PATH}) + db = Database(**sql_conf.to_dict()) + run_conf = RunConfig({'dataset_id': dataset.id, 'methods': METHOD_HYPERPARTS.keys()}) - atm = ATM(db, run_conf, None, None) + atm = ATM(sql_conf, None, None) - run_id = atm.enter_data() + run_id = atm.enter_data(None, run_conf) with db_session(db): - run = db.get_datarun(run_id) + run = db.get_datarun(run_id.id) assert run.dataset.id == dataset.id assert len(run.hyperpartitions) == sum(METHOD_HYPERPARTS.values()) def test_run_per_partition(dataset): - sql_conf = SQLConfig(database=DB_PATH) - db = Database(**vars(sql_conf)) + sql_conf = SQLConfig({'sql_database': DB_PATH}) + db = Database(**sql_conf.to_dict()) - run_conf = RunConfig(dataset_id=dataset.id, methods=['logreg']) + run_conf = RunConfig( + { + 'dataset_id': dataset.id, + 'methods': ['logreg'], + 'run_per_partition': True + } + ) - atm = ATM(db, run_conf, None, None) + atm = ATM(sql_conf, None, None) - run_ids = atm.enter_data(run_per_partition=True) + run_ids = atm.enter_data(None, run_conf) with db_session(db): runs = [] for run_id in run_ids: - run = db.get_datarun(run_id) + run = db.get_datarun(run_id.id) if run is not None: runs.append(run) diff --git a/tests/test_worker.py b/tests/test_worker.py index ca1bb8a..3c83fc0 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -12,10 +12,10 @@ from atm import PROJECT_ROOT from atm.classifier import Model -from atm.config import LogConfig, RunConfig, SQLConfig +from atm.config import DatasetConfig, LogConfig, RunConfig, SQLConfig from atm.constants import METRICS_BINARY, TIME_FMT +from atm.core import ATM from atm.database import Database, db_session -from atm.models import ATM from atm.utilities import download_data, load_metrics, load_model from atm.worker import ClassifierError, Worker @@ -107,12 +107,17 @@ def worker(db, datarun): def get_new_worker(**kwargs): kwargs['methods'] = kwargs.get('methods', ['logreg', 'dt']) - sql_conf = SQLConfig(database=DB_PATH) - run_conf = RunConfig(**kwargs) - db = Database(**vars(sql_conf)) - atm = ATM(db, run_conf, None, None) - run_id = atm.enter_data() - datarun = db.get_datarun(run_id) + sql_conf = SQLConfig({'sql_database': DB_PATH}) + run_conf = RunConfig(kwargs) + + dataset_conf = DatasetConfig(kwargs) + + db = Database(**sql_conf.to_dict()) + atm = ATM(sql_conf, None, None) + + run_id = atm.enter_data(dataset_conf, run_conf) + datarun = db.get_datarun(run_id.id) + return 
Worker(db, datarun) @@ -183,7 +188,7 @@ def test_test_classifier(db, dataset): def test_save_classifier(db, datarun, model, metrics): - log_conf = LogConfig(model_dir=MODEL_DIR, metric_dir=METRIC_DIR) + log_conf = LogConfig({'model_dir': MODEL_DIR, 'metric_dir': METRIC_DIR}) worker = Worker(db, datarun, log_config=log_conf) hp = db.get_hyperpartitions(datarun_id=worker.datarun.id)[0] classifier = worker.db.start_classifier(hyperpartition_id=hp.id, From 43382e98c4be787fa5e91751b1c7927350e23333 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 2 May 2019 19:18:29 +0200 Subject: [PATCH 32/44] Updated API doc --- API.md | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/API.md b/API.md index ba97adf..0d20654 100644 --- a/API.md +++ b/API.md @@ -14,7 +14,6 @@ virtualenv, and execute this command: atm start ``` - This will start **ATM** server as a background service. The REST server will be listening at the port 5000 of your machine, and if you point your browser at http://127.0.0.1:5000/, you will see the documentation website that shows information about all the REST operations allowed by the API. @@ -293,6 +292,87 @@ And the output will be (note that some parts have been cut): } ``` +### Perform a POST + +#### POST Dataset + +If you would like to create a `dataset` from the **ATM** REST API you can perform a `POST` action +where the required fields are: + +* `name`, desired name for the dataset. +* `description`, the description for the dataset. +* `train_path`, where the `.csv` file is located. +* `class_column`, target column. + +Additionally we can paass the `test_path`, which points to the testing dataset `csv`. + +An example of such a POST would be: + +```bash +curl localhost:5000/api/datasets -H 'Content-Type: application/json' \ +-d '{"name": "test", "train_path": "atm/data/test/pollution_1.csv", "class_column": "class", "description": "testing"}' +``` + +An output similar to this one should appear in your console: + +```bash +{ + "size_kb" : 8, + "name" : "test", + "majority" : 0.516666667, + "n_examples" : 60, + "id" : 2, + "description" : "testing", + "train_path" : "atm/data/test/pollution_1.csv", + "dataruns" : [], + "class_column" : "class", + "test_path" : null, + "k_classes" : 2, + "d_features" : 16 +} + +``` + + +#### Create a Datarun from a Dataset + +If you would like to create the `dataruns` for the dataset that we just created, you can do so by +making a `POST` call similar to the one before poiting to: `http://127.0.0.1:5000/api/run` . + +This post data requires atleast the `dataset_id` parameter. + +Optionally accepts the following parameters: + +* `description` +* `run_per_partition` +* `tuner` +* `selector` +* `gridding` +* `priority` +* `budget_type` +* `budget` +* `metric` +* `k_window` +* `r_minimum` +* `score_target` +* `deadline` + +Information about the values that can be contained above can be found +[here](https://hdi-project.github.io/ATM/database.html#dataruns) + +A simple `POST` to this endpoint: + +```bash +curl localhost:5000/api/run -H 'Content-Type: application/json' -d '{"dataset_id": 2}' +``` + +An output like this should print in the console: + +```bash +{"datarun_ids":[37],"status":"OK"} +``` + +If you have any workers running with the server, this will launch the workers process. 
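The two curl calls documented above can also be made from Python. The following sketch simply mirrors the documented endpoints and payloads; it assumes a server listening on the default 127.0.0.1:5000 and the `requests` package, and is not part of the patch:

```python
import requests

BASE = 'http://127.0.0.1:5000/api'

# 1. Register a dataset, mirroring the documented POST /api/datasets call.
dataset = requests.post(BASE + '/datasets', json={
    'name': 'test',
    'description': 'testing',
    'train_path': 'atm/data/test/pollution_1.csv',
    'class_column': 'class',
}).json()

# 2. Trigger a datarun for it, mirroring the documented POST /api/run call.
run = requests.post(BASE + '/run', json={'dataset_id': dataset['id']}).json()

print('datarun ids:', run['datarun_ids'])
```
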
## Additional information From bcb398b6d2dd7f9a29825aae2621de0af29f0efe Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Thu, 2 May 2019 19:39:46 +0200 Subject: [PATCH 33/44] Update API docs --- API.md | 182 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 95 insertions(+), 87 deletions(-) diff --git a/API.md b/API.md index 0d20654..3704017 100644 --- a/API.md +++ b/API.md @@ -61,20 +61,110 @@ For more detailed information about all the operations supported by the API, ple browser to http://127.0.0.1:5000/ and explore the examples provided by the [Swagger](https://swagger.io/) interface. +### 0. Make a simple GET to see what's inside the datasets + +You can run a simple `GET` petition to the database in order to check if there is somenthing inside +the database. If it's the first time running, there should be no data inside: + +```bash +curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/datasets' +``` + +An ouput like this should be printed if you don't have any data: + +```bash +{ + "page" : 1, + "num_results" : 0, + "objects" : [], + "total_pages" : 0 +} +``` + ### 1. Generate some data Before proceeding any further, please make sure the have already populated your data by triggering at least one model tuning process. -An easy way to do this is to follow the quickstart from the ATM [README.md](README.md) file, -which means having run these command: +#### POST Dataset + +First you need to create a `dataset`. From the **ATM** REST API you can perform a `POST` action +to `api/datasets/` where the required fields are: + +* `name`, desired name for the dataset. +* `description`, the description for the dataset. +* `train_path`, where the `.csv` file is located. +* `class_column`, target column. + +Additionally we can paass the `test_path`, which points to the testing dataset `csv`. +This call will create a simple `dataset` in our database: + +```bash +curl localhost:5000/api/datasets -H 'Content-Type: application/json' \ +-d '{"name": "test", "train_path": "atm/data/test/pollution_1.csv", "class_column": "class", "description": "testing"}' ``` -atm enter_data + +An output similar to this one should appear in your console: + +```bash +{ + "size_kb" : 8, + "name" : "test", + "majority" : 0.516666667, + "n_examples" : 60, + "id" : 1, + "description" : "testing", + "train_path" : "atm/data/test/pollution_1.csv", + "dataruns" : [], + "class_column" : "class", + "test_path" : null, + "k_classes" : 2, + "d_features" : 16 +} +``` + + +#### Create a Datarun from a Dataset + +If you would like to create the `dataruns` for the dataset that we just created, you can do so by +making a `POST` call similar to the one before poiting to: `http://127.0.0.1:5000/api/run` . + +This post data requires atleast the `dataset_id` parameter. 
+ +Optionally accepts the following parameters: + +* `description` +* `run_per_partition` +* `tuner` +* `selector` +* `gridding` +* `priority` +* `budget_type` +* `budget` +* `metric` +* `k_window` +* `r_minimum` +* `score_target` +* `deadline` + +Information about the values that can be contained above can be found +[here](https://hdi-project.github.io/ATM/database.html#dataruns) + +A simple `POST` to this endpoint: + +```bash +curl localhost:5000/api/run -H 'Content-Type: application/json' -d '{"dataset_id": 1}' +``` + +An output like this should print in the console: + +```bash +{"datarun_ids":[37],"status":"OK"} ``` -The workers that you started before will proceed the data that has been inserted and will populate -the database. +The workers will then start working on this `dataruns` and once they are done (usually it takes +arround 1-5 minutes depending on your computer / workers) you can proceed with the following steps. ### 2. REST Models @@ -292,88 +382,6 @@ And the output will be (note that some parts have been cut): } ``` -### Perform a POST - -#### POST Dataset - -If you would like to create a `dataset` from the **ATM** REST API you can perform a `POST` action -where the required fields are: - -* `name`, desired name for the dataset. -* `description`, the description for the dataset. -* `train_path`, where the `.csv` file is located. -* `class_column`, target column. - -Additionally we can paass the `test_path`, which points to the testing dataset `csv`. - -An example of such a POST would be: - -```bash -curl localhost:5000/api/datasets -H 'Content-Type: application/json' \ --d '{"name": "test", "train_path": "atm/data/test/pollution_1.csv", "class_column": "class", "description": "testing"}' -``` - -An output similar to this one should appear in your console: - -```bash -{ - "size_kb" : 8, - "name" : "test", - "majority" : 0.516666667, - "n_examples" : 60, - "id" : 2, - "description" : "testing", - "train_path" : "atm/data/test/pollution_1.csv", - "dataruns" : [], - "class_column" : "class", - "test_path" : null, - "k_classes" : 2, - "d_features" : 16 -} - -``` - - -#### Create a Datarun from a Dataset - -If you would like to create the `dataruns` for the dataset that we just created, you can do so by -making a `POST` call similar to the one before poiting to: `http://127.0.0.1:5000/api/run` . - -This post data requires atleast the `dataset_id` parameter. - -Optionally accepts the following parameters: - -* `description` -* `run_per_partition` -* `tuner` -* `selector` -* `gridding` -* `priority` -* `budget_type` -* `budget` -* `metric` -* `k_window` -* `r_minimum` -* `score_target` -* `deadline` - -Information about the values that can be contained above can be found -[here](https://hdi-project.github.io/ATM/database.html#dataruns) - -A simple `POST` to this endpoint: - -```bash -curl localhost:5000/api/run -H 'Content-Type: application/json' -d '{"dataset_id": 2}' -``` - -An output like this should print in the console: - -```bash -{"datarun_ids":[37],"status":"OK"} -``` - -If you have any workers running with the server, this will launch the workers process. 
- ## Additional information ### Start additional process with different pid file From d663a7ce05049fa621fb52dde9963106d4922de0 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 20:27:05 +0200 Subject: [PATCH 34/44] Add atm.api.utils.abort --- atm/api/__init__.py | 8 +------- atm/api/preprocessors.py | 44 ++++++++++++++++++++-------------------- atm/api/utils.py | 24 ++++++++++++++++++++++ 3 files changed, 47 insertions(+), 29 deletions(-) create mode 100644 atm/api/utils.py diff --git a/atm/api/__init__.py b/atm/api/__init__.py index 3380741..b588c44 100644 --- a/atm/api/__init__.py +++ b/atm/api/__init__.py @@ -5,16 +5,10 @@ from flask_sqlalchemy import SQLAlchemy from atm.api.preprocessors import DATASET_PREPROCESSORS +from atm.api.utils import make_absolute from atm.config import RunConfig -def make_absolute(url): - if str(url).startswith('sqlite:///'): - url = 'sqlite:///' + os.path.abspath(url.database) - - return url - - def create_app(atm, debug=False): db = atm.db app = Flask(__name__) diff --git a/atm/api/preprocessors.py b/atm/api/preprocessors.py index 095cd55..758f386 100644 --- a/atm/api/preprocessors.py +++ b/atm/api/preprocessors.py @@ -1,31 +1,31 @@ -from flask import abort +import os +import traceback +from atm.api.utils import abort from atm.encoder import MetaData -DATASET_KEYS = ['name', 'description', 'train_path', 'class_column'] - def dataset_post(data): """Preprocess the Dataset POST data.""" - if all(key in data for key in DATASET_KEYS): - try: - meta = MetaData( - data['class_column'], - data['train_path'], - data.get('test_path') - ) - - data['n_examples'] = meta.n_examples - data['k_classes'] = meta.k_classes - data['d_features'] = meta.d_features - data['majority'] = meta.majority - data['size_kb'] = meta.size - - except FileNotFoundError: - abort(400, 'The train_path does not exists in the server.') - - else: - abort(400, 'There is a missing field from the requiered ones') + + try: + train_path = data['train_path'] + name = data.setdefault('name', os.path.basename(train_path)) + data.setdefault('description', name) + meta = MetaData( + data['class_column'], + train_path, + data.get('test_path') + ) + + data['n_examples'] = meta.n_examples + data['k_classes'] = meta.k_classes + data['d_features'] = meta.d_features + data['majority'] = meta.majority + data['size_kb'] = meta.size + + except Exception as ex: + abort(400, error=ex) DATASET_PREPROCESSORS = { diff --git a/atm/api/utils.py b/atm/api/utils.py new file mode 100644 index 0000000..742280f --- /dev/null +++ b/atm/api/utils.py @@ -0,0 +1,24 @@ +import os +import traceback + +import flask + + +def make_absolute(url): + if str(url).startswith('sqlite:///'): + url = 'sqlite:///' + os.path.abspath(url.database) + + return url + + +def abort(code, message=None, error=None): + if error is not None: + error = traceback.format_exception_only(type(error), error)[0] + + response = flask.jsonify({ + 'status': code, + 'error': error, + 'message': message + }) + response.status_code = code + flask.abort(response) From 72c6ddc226e745e39368d38610c6860ea25faf32 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 20:28:33 +0200 Subject: [PATCH 35/44] Strip error message --- atm/api/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/atm/api/utils.py b/atm/api/utils.py index 742280f..1877aa2 100644 --- a/atm/api/utils.py +++ b/atm/api/utils.py @@ -13,7 +13,7 @@ def make_absolute(url): def abort(code, message=None, error=None): if error is not None: - error = 
traceback.format_exception_only(type(error), error)[0] + error = traceback.format_exception_only(type(error), error)[0].strip() response = flask.jsonify({ 'status': code, From 94542d2fce1e3f619253209520574706be525390 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 21:17:39 +0200 Subject: [PATCH 36/44] Raise KeyError if a required argument is not given --- atm/config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/atm/config.py b/atm/config.py index c125932..9d79704 100644 --- a/atm/config.py +++ b/atm/config.py @@ -40,13 +40,20 @@ def _add_prefix(cls, name): def _get_arg(cls, args, name): arg_name = cls._add_prefix(name) class_value = getattr(cls, name) + required = False if isinstance(class_value, dict): + required = 'default' not in class_value default = class_value.get('default') elif isinstance(class_value, tuple): + required = False default = class_value[1] else: + required = False default = None + if required and arg_name not in args: + raise KeyError(arg_name) + return args.get(arg_name, default) def __init__(self, args, path=None): From 15583046ac077945542c1e0baee234c92c2b95f1 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 21:17:57 +0200 Subject: [PATCH 37/44] Raise ValueError if an invalid dataset_id is given --- atm/core.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/atm/core.py b/atm/core.py index 3575dac..b7946ea 100644 --- a/atm/core.py +++ b/atm/core.py @@ -177,6 +177,8 @@ def create_dataruns(self, run_conf): Returns: ID of the generated datarun """ dataset = self.db.get_dataset(run_conf.dataset_id) + if not dataset: + raise ValueError('Invalid Dataset ID: {}'.format(run_conf.dataset_id)) method_parts = {} for m in run_conf.methods: From 334bd3a0f3f34ae2bac456aa1a9ed8be2c813621 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 21:18:10 +0200 Subject: [PATCH 38/44] Add get_demos command --- atm/cli.py | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/atm/cli.py b/atm/cli.py index 69735e5..dd5119d 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -197,17 +197,31 @@ def _enter_data(args): atm.enter_data(dataset_conf, run_conf) -def _make_config(args): - config_templates = os.path.join('config', 'templates') - config_dir = os.path.join(os.path.dirname(__file__), config_templates) - target_dir = os.path.join(os.getcwd(), config_templates) +def _copy_files(pattern, source, target=None): + if isinstance(source, (list, tuple)): + source = os.path.join(*source) + + if target is None: + target = source + + source_dir = os.path.join(os.path.dirname(__file__), source) + target_dir = os.path.join(os.getcwd(), target) + if not os.path.exists(target_dir): os.makedirs(target_dir) - for template in glob.glob(os.path.join(config_dir, '*.yaml')): - target_file = os.path.join(target_dir, os.path.basename(template)) + for source_file in glob.glob(os.path.join(source_dir, pattern)): + target_file = os.path.join(target_dir, os.path.basename(source_file)) print('Generating file {}'.format(target_file)) - shutil.copy(template, target_file) + shutil.copy(source_file, target_file) + + +def _make_config(args): + _copy_files('*.yaml', ('config', 'templates')) + + +def _get_demos(args): + _copy_files('*.csv', ('data', 'test'), 'demos') def _get_parser(): @@ -318,6 +332,10 @@ def _get_parser(): make_config = subparsers.add_parser('make_config', parents=[logging_args]) make_config.set_defaults(action=_make_config) + # Get Demos + get_demos = subparsers.add_parser('get_demos', 
parents=[logging_args]) + get_demos.set_defaults(action=_get_demos) + return parser From 2b34604284a4d4fbc279b4543087cdb4a6772e39 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 21:23:57 +0200 Subject: [PATCH 39/44] Add CLI help messages --- atm/cli.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/atm/cli.py b/atm/cli.py index dd5119d..1d19e3e 100644 --- a/atm/cli.py +++ b/atm/cli.py @@ -229,7 +229,8 @@ def _get_parser(): logging_args.add_argument('-v', '--verbose', action='count', default=0) logging_args.add_argument('-l', '--logfile') - parser = argparse.ArgumentParser(description='ATM Command Line Interface') + parser = argparse.ArgumentParser(description='ATM Command Line Interface', + parents=[logging_args]) subparsers = parser.add_subparsers(title='action', help='Action to perform') parser.set_defaults(action=None) @@ -250,7 +251,8 @@ def _get_parser(): log_args, run_args ] - enter_data = subparsers.add_parser('enter_data', parents=enter_data_parents) + enter_data = subparsers.add_parser('enter_data', parents=enter_data_parents, + help='Add a Dataset and trigger a Datarun on it.') enter_data.set_defaults(action=_enter_data) # Wroker Args @@ -268,7 +270,8 @@ def _get_parser(): aws_args, log_args ] - worker = subparsers.add_parser('worker', parents=worker_parents) + worker = subparsers.add_parser('worker', parents=worker_parents, + help='Start a single worker in foreground.') worker.set_defaults(action=_work) worker.add_argument('--dataruns', help='Only train on dataruns with these ids', nargs='+') worker.add_argument('--total-time', help='Number of seconds to run worker', type=int) @@ -279,7 +282,8 @@ def _get_parser(): server_args.add_argument('--port', help='Port to listen at', type=int) # Server - server = subparsers.add_parser('server', parents=[logging_args, server_args, sql_args]) + server = subparsers.add_parser('server', parents=[logging_args, server_args, sql_args], + help='Start the REST API Server in foreground.') server.set_defaults(action=_serve) server.add_argument('--debug', help='Start in debug mode', action='store_true') # add_arguments_sql(server) @@ -306,7 +310,8 @@ def _get_parser(): aws_args, log_args ] - start = subparsers.add_parser('start', parents=start_parents) + start = subparsers.add_parser('start', parents=start_parents, + help='Start an ATM Local Cluster.') start.set_defaults(action=_start) # Status @@ -321,19 +326,23 @@ def _get_parser(): help='Kill the process if it does not terminate gracefully.') # Stop - stop = subparsers.add_parser('stop', parents=[logging_args, stop_args, background_args]) + stop = subparsers.add_parser('stop', parents=[logging_args, stop_args, background_args], + help='Stop an ATM Local Cluster.') stop.set_defaults(action=_stop) # restart - restart = subparsers.add_parser('restart', parents=start_parents + [stop_args]) + restart = subparsers.add_parser('restart', parents=start_parents + [stop_args], + help='Restart an ATM Local Cluster.') restart.set_defaults(action=_restart) # Make Config - make_config = subparsers.add_parser('make_config', parents=[logging_args]) + make_config = subparsers.add_parser('make_config', parents=[logging_args], + help='Generate a config templates folder in the cwd.') make_config.set_defaults(action=_make_config) # Get Demos - get_demos = subparsers.add_parser('get_demos', parents=[logging_args]) + get_demos = subparsers.add_parser('get_demos', parents=[logging_args], + help='Generate a demos folder with demo CSVs in the cwd.') 
get_demos.set_defaults(action=_get_demos) return parser From be204885de0610f33d3416a33ed240cfe5261a0b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 21:24:28 +0200 Subject: [PATCH 40/44] Add auto_abort decorator for nice error JSON responses --- atm/api/__init__.py | 12 ++++-------- atm/api/preprocessors.py | 35 ++++++++++++++++------------------- atm/api/utils.py | 19 +++++++++++++++++++ 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/atm/api/__init__.py b/atm/api/__init__.py index b588c44..a838085 100644 --- a/atm/api/__init__.py +++ b/atm/api/__init__.py @@ -1,11 +1,9 @@ -import os - -from flask import Flask, abort, jsonify, redirect, request +from flask import Flask, jsonify, redirect, request from flask_restless_swagger import SwagAPIManager as APIManager from flask_sqlalchemy import SQLAlchemy from atm.api.preprocessors import DATASET_PREPROCESSORS -from atm.api.utils import make_absolute +from atm.api.utils import auto_abort, make_absolute from atm.config import RunConfig @@ -19,17 +17,15 @@ def create_app(atm, debug=False): manager = APIManager(app, flask_sqlalchemy_db=SQLAlchemy(app)) @app.route('/api/run', methods=['POST']) + @auto_abort((KeyError, ValueError)) def atm_run(): - if not request.json: - abort(400) - data = request.json run_conf = RunConfig(data) dataruns = atm.create_dataruns(run_conf) response = { - 'status': 'OK', + 'status': 200, 'datarun_ids': [datarun.id for datarun in dataruns] } diff --git a/atm/api/preprocessors.py b/atm/api/preprocessors.py index 758f386..38896f2 100644 --- a/atm/api/preprocessors.py +++ b/atm/api/preprocessors.py @@ -1,31 +1,28 @@ import os import traceback -from atm.api.utils import abort +from atm.api.utils import auto_abort from atm.encoder import MetaData +@auto_abort((KeyError, FileNotFoundError)) def dataset_post(data): """Preprocess the Dataset POST data.""" - try: - train_path = data['train_path'] - name = data.setdefault('name', os.path.basename(train_path)) - data.setdefault('description', name) - meta = MetaData( - data['class_column'], - train_path, - data.get('test_path') - ) - - data['n_examples'] = meta.n_examples - data['k_classes'] = meta.k_classes - data['d_features'] = meta.d_features - data['majority'] = meta.majority - data['size_kb'] = meta.size - - except Exception as ex: - abort(400, error=ex) + train_path = data['train_path'] + name = data.setdefault('name', os.path.basename(train_path)) + data.setdefault('description', name) + meta = MetaData( + data['class_column'], + train_path, + data.get('test_path') + ) + + data['n_examples'] = meta.n_examples + data['k_classes'] = meta.k_classes + data['d_features'] = meta.d_features + data['majority'] = meta.majority + data['size_kb'] = meta.size DATASET_PREPROCESSORS = { diff --git a/atm/api/utils.py b/atm/api/utils.py index 1877aa2..b809a03 100644 --- a/atm/api/utils.py +++ b/atm/api/utils.py @@ -1,8 +1,11 @@ +import logging import os import traceback import flask +LOGGER = logging.getLogger(__name__) + def make_absolute(url): if str(url).startswith('sqlite:///'): @@ -22,3 +25,19 @@ def abort(code, message=None, error=None): }) response.status_code = code flask.abort(response) + + +def auto_abort(exceptions): + def outer(function): + def inner(*args, **kwargs): + try: + return function(*args, **kwargs) + except exceptions as ex: + abort(400, error=ex) + except Exception as ex: + LOGGER.exception('Uncontrolled Exception Caught') + abort(500, error=ex) + + return inner + + return outer From 03f20317940cf29ff09f5c1dcdb530806de71149 Mon Sep 
17 00:00:00 2001 From: Carles Sala Date: Thu, 2 May 2019 21:25:05 +0200 Subject: [PATCH 41/44] Remove unused import --- atm/api/preprocessors.py | 1 - 1 file changed, 1 deletion(-) diff --git a/atm/api/preprocessors.py b/atm/api/preprocessors.py index 38896f2..3937aa4 100644 --- a/atm/api/preprocessors.py +++ b/atm/api/preprocessors.py @@ -1,5 +1,4 @@ import os -import traceback from atm.api.utils import auto_abort from atm.encoder import MetaData From 9e9a80ccb116880c6cddc91e9b35b0896d3da47c Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 3 May 2019 11:33:44 +0200 Subject: [PATCH 42/44] Update API.md / Fix test_worker --- API.md | 136 ++++++++++++++----------------------------- tests/test_worker.py | 1 + 2 files changed, 46 insertions(+), 91 deletions(-) diff --git a/API.md b/API.md index 3704017..4dd86b6 100644 --- a/API.md +++ b/API.md @@ -5,7 +5,17 @@ it via a REST API server that runs over [flask](http://flask.pocoo.org/). In this document you will find a briefly explanation how to start it and use it. -## Starting the REST API Server + +## Quickstart + +In this section we will briefly show the basic usage of the REST API. + +For more detailed information about all the operations supported by the API, please point your +browser to http://127.0.0.1:5000/ and explore the examples provided by the +[Swagger](https://swagger.io/) interface. + + +### 1. Start the REST API Server In order to start a REST API server, after installing ATM open a terminal, activate its virtualenv, and execute this command: @@ -53,120 +63,64 @@ atm start --workers 4 For more detailed options you can run `atm start --help` to obtain a list with the arguments that are being accepted. -## Quickstart +### 2. Create a Dataset -In this section we will briefly show the basic usage of the REST API. - -For more detailed information about all the operations supported by the API, please point your -browser to http://127.0.0.1:5000/ and explore the examples provided by the -[Swagger](https://swagger.io/) interface. +Once the server is running, you can register your first dataset using the API. To do so, you need +to send the path to your `CSV` file and the name of your `target_column` in a `POST` request to +`api/datasets`. -### 0. Make a simple GET to see what's inside the datasets - -You can run a simple `GET` petition to the database in order to check if there is somenthing inside -the database. If it's the first time running, there should be no data inside: +This call will create a simple `dataset` in our database: ```bash -curl -X GET --header 'Accept: application/json, application/json' 'http://127.0.0.1:5000/api/datasets' -``` +POST /api/datasets HTTP/1.1 +Content-Type: application/json -An ouput like this should be printed if you don't have any data: - -```bash { - "page" : 1, - "num_results" : 0, - "objects" : [], - "total_pages" : 0 + "class_column": "your_target_column", + "train_path": "path/to/your.csv" } ``` -### 1. Generate some data - -Before proceeding any further, please make sure the have already populated your data by triggering -at least one model tuning process. - -#### POST Dataset - -First you need to create a `dataset`. From the **ATM** REST API you can perform a `POST` action -to `api/datasets/` where the required fields are: - -* `name`, desired name for the dataset. -* `description`, the description for the dataset. -* `train_path`, where the `.csv` file is located. -* `class_column`, target column. 
- -Additionally we can paass the `test_path`, which points to the testing dataset `csv`. - -This call will create a simple `dataset` in our database: +Once you have created some datasets, you can see them by sending a `GET` request: ```bash -curl localhost:5000/api/datasets -H 'Content-Type: application/json' \ --d '{"name": "test", "train_path": "atm/data/test/pollution_1.csv", "class_column": "class", "description": "testing"}' +GET /api/datasets HTTP/1.1 ``` -An output similar to this one should appear in your console: +This will return a `json` with all the information about the stored datasets. + +As an example, you can get and register a demo dataset by running the following two commands: ```bash -{ - "size_kb" : 8, - "name" : "test", - "majority" : 0.516666667, - "n_examples" : 60, - "id" : 1, - "description" : "testing", - "train_path" : "atm/data/test/pollution_1.csv", - "dataruns" : [], - "class_column" : "class", - "test_path" : null, - "k_classes" : 2, - "d_features" : 16 -} +atm get_demos +curl -v localhost:5000/api/datasets -H'Content-Type: application/json' \ +-d'{"class_column": "class", "train_path": "demos/pollution_1.csv"}' ``` +### 3. Trigger a Datarun -#### Create a Datarun from a Dataset - -If you would like to create the `dataruns` for the dataset that we just created, you can do so by -making a `POST` call similar to the one before poiting to: `http://127.0.0.1:5000/api/run` . - -This post data requires atleast the `dataset_id` parameter. - -Optionally accepts the following parameters: - -* `description` -* `run_per_partition` -* `tuner` -* `selector` -* `gridding` -* `priority` -* `budget_type` -* `budget` -* `metric` -* `k_window` -* `r_minimum` -* `score_target` -* `deadline` - -Information about the values that can be contained above can be found -[here](https://hdi-project.github.io/ATM/database.html#dataruns) - -A simple `POST` to this endpoint: +In order to trigger a datarun, once you have created a dataset, you have to send the `dataset_id` +in a `POST` request to `api/run` to trigger the `workers` with the default values. ```bash -curl localhost:5000/api/run -H 'Content-Type: application/json' -d '{"dataset_id": 1}' +POST /api/datasets HTTP/1.1 +Content-type: application/json + +{ + "dataset_id": id_of_your_dataset +} ``` -An output like this should print in the console: +If you have followed the above example and created a `pollution` dataset in the database, you can +run the following `POST` to trigger it's datarun: ```bash -{"datarun_ids":[37],"status":"OK"} +curl -v localhost:5000/api/run -H'Content-type: application/json' -d'{"dataset_id": 1}' ``` -The workers will then start working on this `dataruns` and once they are done (usually it takes -arround 1-5 minutes depending on your computer / workers) you can proceed with the following steps. +**NOTE** atleast one worker should be running in order to process the datarun. -### 2. REST Models +### 4. Browse the results Once the database is populated, you can use the REST API to explore the following 4 models: @@ -177,7 +131,7 @@ Once the database is populated, you can use the REST API to explore the followin And these are the operations that can be performed on them: -### 3. Get all objects from a model +#### Get all objects from a model In order to get all the objects for a single model, you need to make a `GET` request to `/api/`. @@ -241,7 +195,7 @@ And the output will be: } ``` -### 4. 
Get a single object by id +#### Get a single object by id In order to get one particular objects for a model, you need to make a `GET` request to `/api//`. @@ -293,7 +247,7 @@ And the output will be: } ``` -### 5. Get all the children objects +#### Get all the children objects In order to get all the childre objects from one parent object, you need to make a `GET` request to `/api///`. diff --git a/tests/test_worker.py b/tests/test_worker.py index 3c83fc0..1602a66 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -106,6 +106,7 @@ def worker(db, datarun): def get_new_worker(**kwargs): + kwargs['dataset_id'] = kwargs.get('dataset_id', None) kwargs['methods'] = kwargs.get('methods', ['logreg', 'dt']) sql_conf = SQLConfig({'sql_database': DB_PATH}) run_conf = RunConfig(kwargs) From 7c7077d45b283fdf6059f794f55faf645f85a296 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Date: Fri, 3 May 2019 12:32:24 +0200 Subject: [PATCH 43/44] Info about workers added. --- API.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/API.md b/API.md index 4dd86b6..a2a8793 100644 --- a/API.md +++ b/API.md @@ -120,6 +120,14 @@ curl -v localhost:5000/api/run -H'Content-type: application/json' -d'{"dataset_i **NOTE** atleast one worker should be running in order to process the datarun. +While running, the workers, will log what they are doing in the file `atm.log`. + +In order to monitor their activity in real time, you can execute this on another terminal: + +```bash +tail -f atm.log +``` + ### 4. Browse the results Once the database is populated, you can use the REST API to explore the following 4 models: From 940659a4e036852fcd1121fa1a3310470b4e26da Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 7 May 2019 20:36:27 +0200 Subject: [PATCH 44/44] Prepare release v0.1.2 --- HISTORY.md | 14 ++++++++++++++ setup.py | 6 +----- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 7e26a33..3aed563 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,19 @@ # History +## 0.1.2 (2019-05-07) + +REST API and Cluster Management. + +### New Features + +* REST API Server - Issues [#82](https://github.com/HDI-Project/ATM/issues/82) and + [#132](https://github.com/HDI-Project/ATM/issues/132) by @RogerTangos, @pvk-developer and @csala +* Add Cluster Management commands to start and stop the server and multiple workers + as background processes - Issue [#130](https://github.com/HDI-Project/ATM/issues/130) by + @pvk-developer and @csala +* Add TravisCI and migrate docs to GitHub Pages - Issue [#129](https://github.com/HDI-Project/ATM/issues/129) + by @pvk-developer + ## 0.1.1 (2019-04-02) First Release on PyPi. diff --git a/setup.py b/setup.py index 5f04d9d..0ae2840 100644 --- a/setup.py +++ b/setup.py @@ -25,9 +25,6 @@ 'scipy>=0.19.1', 'sklearn-pandas>=1.5.0', 'sqlalchemy>=1.1.14', -] - -api_requires = [ 'flask>=1.0.2', 'flask-restless>=0.17.0', 'flask-sqlalchemy>=2.3.2', @@ -98,8 +95,7 @@ ] }, extras_require={ - 'api': api_requires, - 'dev': api_requires + development_requires + tests_require, + 'dev': development_requires + tests_require, 'tests': tests_require, }, include_package_data=True,
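
With the release prepared, a full round trip against the REST API looks roughly like the sketch below. The endpoints and the `objects` envelope come from the documentation above; the classifier field names (`datarun_id`, `cv_judgment_metric`) are assumptions based on ATM's database schema and may need adjusting for the installed version:

```python
import requests

BASE = 'http://127.0.0.1:5000/api'

# Fetch every classifier the workers have stored so far.
classifiers = requests.get(BASE + '/classifiers').json()['objects']

# Keep only the ones belonging to datarun 1 and pick the highest scoring one.
# NOTE: 'datarun_id' and 'cv_judgment_metric' are assumed field names.
mine = [c for c in classifiers if c.get('datarun_id') == 1]
best = max(mine, key=lambda c: c.get('cv_judgment_metric') or 0)

print('best classifier:', best['id'], best.get('cv_judgment_metric'))
```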