From 069c7eb8870d71b1c716120c7ca817a3460896f5 Mon Sep 17 00:00:00 2001
From: Angel Luu
Date: Mon, 7 Oct 2024 14:23:29 -0600
Subject: [PATCH 1/7] docs: Add draft

Signed-off-by: Angel Luu
---
 adrs/001-estimator-api.md | 172 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 172 insertions(+)
 create mode 100644 adrs/001-estimator-api.md

diff --git a/adrs/001-estimator-api.md b/adrs/001-estimator-api.md
new file mode 100644
index 0000000..98e02e7
--- /dev/null
+++ b/adrs/001-estimator-api.md
@@ -0,0 +1,172 @@
---
title: Resource Estimator API
---

- **Author(s)**: Angel Luu (@aluu317)
- **Signer(s)**: Praveen Jayachandran, Ashok Pon Kumar Sree Prakash @ashokponkumar, Chander Govindarajan @ChanderG
- **Date (YYYY-MM-DD)**: 2024-10-01
- **Obsoletes ADRs**: N/A
- **Modified By ADRs**: N/A
- **Relevant Issues**: N/A

## Problem Context

Users of the tuning/training stack currently have no way of estimating how much memory, time, or cost it takes to run a training job before launching it. They often hit OOM errors due to lack of memory. Users don't have enough information to make trade-off decisions on time vs. cost. Platform admins do not have the information needed to better schedule and pack jobs onto GPUs.

In order to be useful, the capability of estimating resources must be exposed to tuning/training users. The primary users of this service include training users and platform admins.

This ADR defines an API for a Resource Estimator service that provides an estimate of resource requirements for training runs.

## Impact Table

| AI Functionality | Operational Functionality |
| ---------------- | ------------------------- |
| Tuning Stack     | APIs                      |

## Decision

- We will expose the API as REST using OpenAPI, ADR [ref](https://github.ibm.com/ai-foundation/ai-foundation/blob/44d1163689b1aa1ca8ab6b9c571b73e6d05b9a0b/docs/current/adr/003-service-layer.md#decision).

- The REST API definitions will be hosted as Open Source at the repo [fm-training-estimator](https://github.com/foundation-model-stack/fm-training-estimator).

NOTE 1: We use REST API to mean an HTTP protocol server that uses standard HTTP verbs and supports Content-Type: application/json at a minimum. Full RESTful practices may be more strict.

### REST API Alternatives
Allow Kubernetes Custom Resource Definitions as API definitions. The pros and cons are discussed [here](https://github.ibm.com/ai-foundation/ai-foundation/blob/44d1163689b1aa1ca8ab6b9c571b73e6d05b9a0b/docs/current/adr/003-service-layer.md#rest-api-alternatives).

It is noted that the Estimator service should support state (repeated calls for the same base config, but with slight tweaks). TODO: unsure?

## Consequences
-------- template ----------
Describe the resulting context, after applying the decision. All consequences should be listed here, not just the "positive" ones. A particular decision may have positive, negative, and neutral consequences, but all of them affect the team and project in the future. Be sure to include any impact on the platform's dependencies, technology choices, and Open Source community relationships.
Key things to include in this section:

- Impact on existing platform usage patterns, particularly any breaking changes
- Required changes in community relationships
- Expected changes in engineering workloads based on the decision (will this need a [research team and 5 years](https://xkcd.com/1425/)?)
- Changes to the supported ecosystems (introduction of new hardware, new runtime form-factor, etc.)
- Known risks of adopting this decision

-------- end template ----------


## High Level Design

- The REST API takes an input defined as the `EstimateInput` data class (not all fields are required). This includes a list of instances of the `Config` data class, which in turn includes different types of configs (HF training args `HFArguments`, fms-hf-tuning additional args `FMArguments`, data args `DataArguments`, infrastructure args `InfraArguments`, and PEFT LoRA args `PeftLoraConfig`), and `EstimatorConfig` with metadata parameters:

Example of an `EstimateInput` with all fields defined:
```json
{
  "estimator": { // EstimatorConfig
    "base_data_path": "data.csv",
    "method": "theory", // theory, learned, hybrid
    "token_estimation_version": 0
  },
  "configs": [{ // list of [Config]
    "hf_training": { // HFArguments
      "output_dir": "./output"
    },
    "fm": { // FMArguments
      "base_model_path": "ibm-granite/granite-3b-code-base",
      "flash_attention_v2": "false",
      "lora_config": null,
      "max_seq_length": 2048,
      "block_size": 2048,
      "data_config_file": "data_config.json",
      "prompt_tuning_config": null,
      "torch_dtype": "float32",
      "technique": "full"
    },
    "data": { // DataArguments
      "te_approach": 0,
      "dataset": null,
      "dataset_text_field": "text",
      "dataset_split": "test",
      "dataset_config_name": null
    },
    "infra": { // InfraArguments
      "numGpusPerPod": 1,
      "numPods": 1,
      "gpu_memory_in_gb": 80,
      "gpuModel": "A100"
    },
    "peft_lora": { // PeftLoraConfig
      "r": 4,
      "lora_alpha": 8,
      "lora_dropout": 0.1,
      "target_modules": "[q_proj, v_proj]"
    }
  }]
}
```

- The API exposes five endpoints:

Endpoint `/api/memory` returns a `MemoryEstimate` as a JSON response:
```json
{
  "memory": { // MemoryEstimate
    "total_mem_estimate": "44.6 GiB",
    "activation_memory": "34.7 GiB",
    "gradient_memory": "2.5 GiB",
    "model_memory": "2.5 GiB",
    "optimizer_memory": "4.9 GiB",
    "num_gpus": 2
  }
}
```

Endpoint `/api/time` returns a `TimeEstimate` as a JSON response:
```json
{
  "time": { // TimeEstimate
    "time": "40s"
  }
}
```

Endpoint `/api/tokens` returns a `TokensEstimate` as a JSON response:
```json
{
  "tokens": { // TokensEstimate
    "tps": "5259.07373046875"
  }
}
```

Endpoint `/api/cost` returns a `CostEstimate` as a JSON response:
```json
{
  "cost": { // CostEstimate
    "usd": "" // todo: what is unit of cost? USD?
  }
}
```

Endpoint `/api/estimate` returns an `Estimate` that includes all four types of estimates above as a JSON response:
```json
{
  "estimate": { // Estimate
    "memory_estimate": { // MemoryEstimate
      "total_mem_estimate": "44.6 GiB",
      "activation_memory": "34.7 GiB",
      "gradient_memory": "2.5 GiB",
      "model_memory": "2.5 GiB",
      "optimizer_memory": "4.9 GiB",
      "num_gpus": 2
    },
    "time": { // TimeEstimate
      "time": "40s"
    },
    "tokens": { // TokensEstimate
      "tps": "5259.07373046875"
    },
    "cost": { // CostEstimate
      "usd": "" // todo: what is unit of cost? USD?
    }
  }
}
```

- When more than one set of configs is passed into the `EstimateInput`, the resulting estimate is an aggregated estimate over the job configs. TODO: unsure — is this supposed to mean the total amount of time, memory, etc.? Should it give a suggestion on the order of jobs? How do we define a job and a job ID?
\ No newline at end of file
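For illustration, a client of the draft REST API above might look like the following sketch. This is not part of the ADR: the host/port and the payload subset are assumptions, and only the endpoints named in the draft are used.

```python
# Minimal client sketch for the draft estimator REST API.
# Assumes the service is reachable at localhost:8000 (illustrative only).
import requests

estimate_input = {
    "configs": [{
        "fm": {"base_model_path": "ibm-granite/granite-3b-code-base", "technique": "full"},
        "infra": {"numGpusPerPod": 1, "gpu_memory_in_gb": 80, "gpuModel": "A100"},
    }]
}

resp = requests.post("http://localhost:8000/api/memory", json=estimate_input, timeout=30)
resp.raise_for_status()
print(resp.json())  # e.g. {"memory": {"total_mem_estimate": "44.6 GiB", ...}}
```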
+ } + } +} +``` + +- When more than 1 set of config is passed into the `EstimateInput`, the resulting estimate is an aggregated estimate of the job configs. TODO: unsure, is this supposed to mean total amount of time, memory, etc. Should it give a suggestion on order of jobs? How do we define job, job id? \ No newline at end of file From 5c3301333e9508ae6b8065133b8ce78fe54b4f85 Mon Sep 17 00:00:00 2001 From: Angel Luu Date: Mon, 14 Oct 2024 17:07:31 -0600 Subject: [PATCH 2/7] feat: Add dataclasses for request and responses Signed-off-by: Angel Luu --- adrs/001-estimator-api.md | 2 +- fm_training_estimator/config/arguments.py | 55 ++++++++++++++++++++++- fm_training_estimator/ui/api.py | 19 +++++++- fm_training_estimator/ui/core.py | 47 ++++++++++++++++++- 4 files changed, 118 insertions(+), 5 deletions(-) diff --git a/adrs/001-estimator-api.md b/adrs/001-estimator-api.md index 98e02e7..dde8489 100644 --- a/adrs/001-estimator-api.md +++ b/adrs/001-estimator-api.md @@ -58,7 +58,7 @@ Key things to include in this section: Example of an `EstimateInput` with all fields defined: ```json { - "estimator": { // EstimatorConfig + "estimator": { // EstimatorMetadata "base_data_path": "data.csv", "method": "theory", // theory, learned, hybrid "token_estimation_version": 0 diff --git a/fm_training_estimator/config/arguments.py b/fm_training_estimator/config/arguments.py index 146ee84..6d7d726 100644 --- a/fm_training_estimator/config/arguments.py +++ b/fm_training_estimator/config/arguments.py @@ -1,5 +1,7 @@ # Standard from dataclasses import dataclass, field +from enum import Enum +from typing import List, Optional # Third Party from peft.tuners.lora import LoraConfig @@ -65,6 +67,11 @@ class InfraArguments: ) +class TuningTechnique(Enum): + LORA = "lora" + FULL = "full" + + @dataclass class FMArguments: """dataclass to store additional args not covered by standard HF argument dataclasses""" @@ -116,8 +123,9 @@ class FMArguments: }, ) - technique: str = field( - default="full", metadata={"help": ("Fine-tuning technique being used")} + technique: TuningTechnique = field( + default=TuningTechnique.FULL, + metadata={"help": ("Fine-tuning technique being used")}, ) @@ -144,3 +152,46 @@ class DataArguments: default=None, metadata={"help": ("dataset configuration to use, in case of HF dataset")}, ) + + +class EstimatorMethod(Enum): + THEORY = "theory" + LEARNED = "learned" + HYBRID = "hybrid" + + +@dataclass +class EstimatorMetadata: + base_data_path: str + method: List[EstimatorMethod] + token_estimation_version: str + + +@dataclass +class JobConfig: + hf_training: HFTrainingArguments = field(default_factory=HFTrainingArguments) + fm: FMArguments = field(default_factory=FMArguments) + data: DataArguments = field(default_factory=DataArguments) + infra: InfraArguments = field(default_factory=InfraArguments) + peft_lora: PeftLoraConfig = field(default_factory=PeftLoraConfig) + + +@dataclass +class EstimateRequest: + job_configs: List[JobConfig] + estimator_metadata: Optional[EstimatorMetadata] = None + + +@dataclass +class TimeEstimateResponse: + time: str + + +@dataclass +class MemoryEstimateResponse: + total_mem_estimate: str + activation_memory: str + gradient_memory: str + model_memory: str + optimizer_memory: str + num_gpus: int diff --git a/fm_training_estimator/ui/api.py b/fm_training_estimator/ui/api.py index 1bdd962..21cd528 100644 --- a/fm_training_estimator/ui/api.py +++ b/fm_training_estimator/ui/api.py @@ -8,8 +8,15 @@ import fire import uvicorn +# First Party +from 
+from fm_training_estimator.config.arguments import (
+    EstimateRequest,
+    MemoryEstimateResponse,
+    TimeEstimateResponse,
+)
+
 # Local
-from .core import run
+from .core import estimate_memory, estimate_time, run
 
 
 def api(data_path, model_path):
@@ -23,6 +30,16 @@ def estimate(config: Any = Body()):
         # types present in the output json which don't serialize out of the box
         return json.dumps(output, default=float)
 
+    @app.post("/api/time", response_model=TimeEstimateResponse)
+    def time(request: EstimateRequest):
+        conf = request.job_configs[0]
+        return estimate_time(conf, data_path, model_path)
+
+    @app.post("/api/memory", response_model=MemoryEstimateResponse)
+    def memory(request: EstimateRequest):
+        conf = request.job_configs[0]
+        return estimate_memory(conf, data_path, model_path)
+
     return app

diff --git a/fm_training_estimator/ui/core.py b/fm_training_estimator/ui/core.py
index 51d1449..cdbbfee 100644
--- a/fm_training_estimator/ui/core.py
+++ b/fm_training_estimator/ui/core.py
@@ -1,3 +1,11 @@
+# First Party
+from fm_training_estimator.config.arguments import (
+    JobConfig,
+    MemoryEstimateResponse,
+    TimeEstimateResponse,
+    TuningTechnique,
+)
+
 # Local
 from ..config import is_fsdp, parse
 from ..memory import HybridEstimator, HybridLoraEstimator
@@ -6,12 +14,49 @@
 from ..utils import fmt_size
 
 
+def estimate_time(config: JobConfig, lookup_data_path=None, model_path=None):
+
+    return TimeEstimateResponse(time="to be implemented")
+
+
+def estimate_memory(config: JobConfig, lookup_data_path=None, model_path=None):
+    if config.fm.technique == TuningTechnique.LORA:
+        est = HybridLoraEstimator(
+            config.fm,
+            config.hf_training,
+            config.infra,
+            config.peft_lora,
+            lookup_data_path,
+            model_path,
+        )
+    else:
+        est = HybridEstimator(
+            config.fm, config.hf_training, config.infra, lookup_data_path, model_path
+        )
+
+    total_mem_estimate = fmt_size(float(est.get_total_mem_estimate()))
+    activation_memory = fmt_size(float(est.calculate_activation_memory()))
+    gradient_memory = fmt_size(float(est.calculate_gradient_memory()))
+    model_memory = fmt_size(float(est.calculate_model_memory()))
+    optimizer_memory = fmt_size(float(est.calculate_optimizer_memory()))
+    num_gpus = config.infra.numGpusPerPod
+
+    return MemoryEstimateResponse(
+        total_mem_estimate,
+        activation_memory,
+        gradient_memory,
+        model_memory,
+        optimizer_memory,
+        num_gpus,
+    )
+
+
 def run(config, lookup_data_path=None, model_path=None):
     res = {}
 
     fm, ta, ia, da, la = parse(config)
 
-    if fm.technique == "lora":
+    if config.fm.technique == "lora":
         est = HybridLoraEstimator(fm, ta, ia, la, lookup_data_path, model_path)
     else:
         est = HybridEstimator(fm, ta, ia, lookup_data_path, model_path)

From 7c2c64ebb6b0e03840204f60c2f9dff2e503b40a Mon Sep 17 00:00:00 2001
From: Angel Luu
Date: Tue, 15 Oct 2024 15:46:54 -0600
Subject: [PATCH 3/7] feat: Implement the time estimate endpoint

Signed-off-by: Angel Luu
---
 fm_training_estimator/ui/core.py | 49 +++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/fm_training_estimator/ui/core.py b/fm_training_estimator/ui/core.py
index cdbbfee..72eb99d 100644
--- a/fm_training_estimator/ui/core.py
+++ b/fm_training_estimator/ui/core.py
@@ -1,3 +1,6 @@
+# Third Party
+from fastapi import HTTPException
+
 # First Party
 from fm_training_estimator.config.arguments import (
     JobConfig,
@@ -15,8 +18,36 @@
 
 def estimate_time(config: JobConfig, lookup_data_path=None, model_path=None):
+    token_est = None
+    if config.data.te_approach == 0:
+        token_est = TokenEstimator0(config.data)
+
+    speed_est = HybridSpeedEstimator(
+        config.fm, config.hf_training, config.infra, lookup_data_path, model_path
+    )
+    # res["tps"] = float(speed_est.get_tps())
+
+    time = ""
+    if token_est is not None:
+        tokens_per_sample = int(
+            token_est.get_estimated_batch_width(
+                config.hf_training.per_device_train_batch_size
+            )
+        )
+        total_tokens = int(token_est.get_total_tokens())
+
+        # get the updated tps for this estimated token width
+        tps = float(speed_est.get_tps(tokens_per_sample))
 
-    return TimeEstimateResponse(time="to be implemented")
+        time = total_tokens / tps
+
+    if not time:
+        raise HTTPException(
+            status_code=501,
+            detail="This te_approach is not implemented or has been disabled",
+        )
+
+    return TimeEstimateResponse(time)
 
 
 def estimate_memory(config: JobConfig, lookup_data_path=None, model_path=None):
@@ -41,6 +72,22 @@
     optimizer_memory = fmt_size(float(est.calculate_optimizer_memory()))
     num_gpus = config.infra.numGpusPerPod
 
+    if num_gpus == 0:
+        if config.fm.technique == TuningTechnique.FULL and is_fsdp(config.hf_training):
+            num_gpus = est.fsdp_est.get_number_of_gpus()
+        elif config.fm.technique == TuningTechnique.LORA:
+            num_gpus = est.num_gpus
+        else:
+            num_gpus = 1
+
+    config.infra.numGpusPerPod = num_gpus
+
+    # No suitable configuration found
+    if num_gpus == -1:
+        raise HTTPException(
+            status_code=422, detail="Input configuration is infeasible!"
+        )
+
     return MemoryEstimateResponse(
         total_mem_estimate,
         activation_memory,
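The core arithmetic in `estimate_time` above is a single division: estimated seconds = total tokens / tokens-per-second at the estimated batch width. A toy check with illustrative numbers (not produced by the estimator):

```python
# Toy illustration of estimate_time's final step (numbers are made up).
total_tokens = 2_000_000  # would come from TokenEstimator0.get_total_tokens()
tps = 5259.07             # would come from HybridSpeedEstimator.get_tps(...)

time_seconds = total_tokens / tps
print(f"{time_seconds:.1f}s")  # -> "380.3s"
```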
From f9a7d6b8eb46860447b36ea344b990539bef4625 Mon Sep 17 00:00:00 2001
From: Angel Luu
Date: Mon, 21 Oct 2024 15:20:11 -0600
Subject: [PATCH 4/7] Revert "feat: Implement the time estimate endpoint"

This reverts commit 046a295ffe676951025e42845b1fe356cdbe59d7.

Signed-off-by: Angel Luu
---
 fm_training_estimator/ui/core.py | 49 +-------------------------------
 1 file changed, 1 insertion(+), 48 deletions(-)

diff --git a/fm_training_estimator/ui/core.py b/fm_training_estimator/ui/core.py
index 72eb99d..cdbbfee 100644
--- a/fm_training_estimator/ui/core.py
+++ b/fm_training_estimator/ui/core.py
@@ -1,6 +1,3 @@
-# Third Party
-from fastapi import HTTPException
-
 # First Party
 from fm_training_estimator.config.arguments import (
     JobConfig,
@@ -18,36 +15,8 @@
 
 def estimate_time(config: JobConfig, lookup_data_path=None, model_path=None):
-    token_est = None
-    if config.data.te_approach == 0:
-        token_est = TokenEstimator0(config.data)
-
-    speed_est = HybridSpeedEstimator(
-        config.fm, config.hf_training, config.infra, lookup_data_path, model_path
-    )
-    # res["tps"] = float(speed_est.get_tps())
-
-    time = ""
-    if token_est is not None:
-        tokens_per_sample = int(
-            token_est.get_estimated_batch_width(
-                config.hf_training.per_device_train_batch_size
-            )
-        )
-        total_tokens = int(token_est.get_total_tokens())
-
-        # get the updated tps for this estimated token width
-        tps = float(speed_est.get_tps(tokens_per_sample))
 
-        time = total_tokens / tps
-
-    if not time:
-        raise HTTPException(
-            status_code=501,
-            detail="This te_approach is not implemented or has been disabled",
-        )
-
-    return TimeEstimateResponse(time)
+    return TimeEstimateResponse(time="to be implemented")
 
 
 def estimate_memory(config: JobConfig, lookup_data_path=None, model_path=None):
@@ -72,22 +41,6 @@
     optimizer_memory = fmt_size(float(est.calculate_optimizer_memory()))
     num_gpus = config.infra.numGpusPerPod
 
-    if num_gpus == 0:
-        if config.fm.technique == TuningTechnique.FULL and is_fsdp(config.hf_training):
-            num_gpus = est.fsdp_est.get_number_of_gpus()
-        elif config.fm.technique == TuningTechnique.LORA:
-            num_gpus = est.num_gpus
-        else:
-            num_gpus = 1
-
-    config.infra.numGpusPerPod = num_gpus
-
-    # No suitable configuration found
-    if num_gpus == -1:
-        raise HTTPException(
-            status_code=422, detail="Input configuration is infeasible!"
-        )
-
     return MemoryEstimateResponse(
         total_mem_estimate,
         activation_memory,

From 56187c6a1169af0e68bcff6a898c147fd72f78c9 Mon Sep 17 00:00:00 2001
From: Angel Luu
Date: Mon, 21 Oct 2024 15:21:22 -0600
Subject: [PATCH 5/7] docs: update adr to use a python library instead of REST server/docker image

Signed-off-by: Angel Luu
---
 adrs/001-estimator-api.md              | 172 -------------------------
 adrs/001-resource-estimator-library.md | 161 +++++++++++++++++++++++
 2 files changed, 161 insertions(+), 172 deletions(-)
 delete mode 100644 adrs/001-estimator-api.md
 create mode 100644 adrs/001-resource-estimator-library.md

diff --git a/adrs/001-estimator-api.md b/adrs/001-estimator-api.md
deleted file mode 100644
index dde8489..0000000
--- a/adrs/001-estimator-api.md
+++ /dev/null
@@ -1,172 +0,0 @@
---
title: Resource Estimator API
---

- **Author(s)**: Angel Luu (@aluu317)
- **Signer(s)**: Praveen Jayachandran, Ashok Pon Kumar Sree Prakash @ashokponkumar, Chander Govindarajan @ChanderG
- **Date (YYYY-MM-DD)**: 2024-10-01
- **Obsoletes ADRs**: N/A
- **Modified By ADRs**: N/A
- **Relevant Issues**: N/A

## Problem Context

Users of the tuning/training stack currently have no way of estimating how much memory, time, or cost it takes to run a training job before launching it. They often hit OOM errors due to lack of memory. Users don't have enough information to make trade-off decisions on time vs. cost. Platform admins do not have the information needed to better schedule and pack jobs onto GPUs.
In order to be useful, the capability of estimating resources must be exposed to tuning/training users. The primary users of this service include training users and platform admins.

This ADR defines an API for a Resource Estimator service that provides an estimate of resource requirements for training runs.

## Impact Table

| AI Functionality | Operational Functionality |
| ---------------- | ------------------------- |
| Tuning Stack     | APIs                      |

## Decision

- We will expose the API as REST using OpenAPI, ADR [ref](https://github.ibm.com/ai-foundation/ai-foundation/blob/44d1163689b1aa1ca8ab6b9c571b73e6d05b9a0b/docs/current/adr/003-service-layer.md#decision).

- The REST API definitions will be hosted as Open Source at the repo [fm-training-estimator](https://github.com/foundation-model-stack/fm-training-estimator).

NOTE 1: We use REST API to mean an HTTP protocol server that uses standard HTTP verbs and supports Content-Type: application/json at a minimum. Full RESTful practices may be more strict.

### REST API Alternatives
Allow Kubernetes Custom Resource Definitions as API definitions. The pros and cons are discussed [here](https://github.ibm.com/ai-foundation/ai-foundation/blob/44d1163689b1aa1ca8ab6b9c571b73e6d05b9a0b/docs/current/adr/003-service-layer.md#rest-api-alternatives).

It is noted that the Estimator service should support state (repeated calls for the same base config, but with slight tweaks). TODO: unsure?

## Consequences
-------- template ----------
Describe the resulting context, after applying the decision. All consequences should be listed here, not just the "positive" ones. A particular decision may have positive, negative, and neutral consequences, but all of them affect the team and project in the future. Be sure to include any impact on the platform's dependencies, technology choices, and Open Source community relationships.

Key things to include in this section:

- Impact on existing platform usage patterns, particularly any breaking changes
- Required changes in community relationships
- Expected changes in engineering workloads based on the decision (will this need a [research team and 5 years](https://xkcd.com/1425/)?)
- Changes to the supported ecosystems (introduction of new hardware, new runtime form-factor, etc.)
- Known risks of adopting this decision

-------- end template ----------


## High Level Design

- The REST API takes an input defined as the `EstimateInput` data class (not all fields are required).
This includes a list of instances of the `Config` data class, which in turn includes different types of configs (HF training args `HFArguments`, fms-hf-tuning additional args `FMArguments`, data args `DataArguments`, infrastructure args `InfraArguments`, and PEFT LoRA args `PeftLoraConfig`), and `EstimatorConfig` with metadata parameters:

Example of an `EstimateInput` with all fields defined:
```json
{
  "estimator": { // EstimatorMetadata
    "base_data_path": "data.csv",
    "method": "theory", // theory, learned, hybrid
    "token_estimation_version": 0
  },
  "configs": [{ // list of [Config]
    "hf_training": { // HFArguments
      "output_dir": "./output"
    },
    "fm": { // FMArguments
      "base_model_path": "ibm-granite/granite-3b-code-base",
      "flash_attention_v2": "false",
      "lora_config": null,
      "max_seq_length": 2048,
      "block_size": 2048,
      "data_config_file": "data_config.json",
      "prompt_tuning_config": null,
      "torch_dtype": "float32",
      "technique": "full"
    },
    "data": { // DataArguments
      "te_approach": 0,
      "dataset": null,
      "dataset_text_field": "text",
      "dataset_split": "test",
      "dataset_config_name": null
    },
    "infra": { // InfraArguments
      "numGpusPerPod": 1,
      "numPods": 1,
      "gpu_memory_in_gb": 80,
      "gpuModel": "A100"
    },
    "peft_lora": { // PeftLoraConfig
      "r": 4,
      "lora_alpha": 8,
      "lora_dropout": 0.1,
      "target_modules": "[q_proj, v_proj]"
    }
  }]
}
```

- The API exposes five endpoints:

Endpoint `/api/memory` returns a `MemoryEstimate` as a JSON response:
```json
{
  "memory": { // MemoryEstimate
    "total_mem_estimate": "44.6 GiB",
    "activation_memory": "34.7 GiB",
    "gradient_memory": "2.5 GiB",
    "model_memory": "2.5 GiB",
    "optimizer_memory": "4.9 GiB",
    "num_gpus": 2
  }
}
```

Endpoint `/api/time` returns a `TimeEstimate` as a JSON response:
```json
{
  "time": { // TimeEstimate
    "time": "40s"
  }
}
```

Endpoint `/api/tokens` returns a `TokensEstimate` as a JSON response:
```json
{
  "tokens": { // TokensEstimate
    "tps": "5259.07373046875"
  }
}
```

Endpoint `/api/cost` returns a `CostEstimate` as a JSON response:
```json
{
  "cost": { // CostEstimate
    "usd": "" // todo: what is unit of cost? USD?
  }
}
```

Endpoint `/api/estimate` returns an `Estimate` that includes all four types of estimates above as a JSON response:
```json
{
  "estimate": { // Estimate
    "memory_estimate": { // MemoryEstimate
      "total_mem_estimate": "44.6 GiB",
      "activation_memory": "34.7 GiB",
      "gradient_memory": "2.5 GiB",
      "model_memory": "2.5 GiB",
      "optimizer_memory": "4.9 GiB",
      "num_gpus": 2
    },
    "time": { // TimeEstimate
      "time": "40s"
    },
    "tokens": { // TokensEstimate
      "tps": "5259.07373046875"
    },
    "cost": { // CostEstimate
      "usd": "" // todo: what is unit of cost? USD?
    }
  }
}
```

- When more than one set of configs is passed into the `EstimateInput`, the resulting estimate is an aggregated estimate over the job configs. TODO: unsure — is this supposed to mean the total amount of time, memory, etc.? Should it give a suggestion on the order of jobs? How do we define a job and a job ID?
\ No newline at end of file

diff --git a/adrs/001-resource-estimator-library.md b/adrs/001-resource-estimator-library.md
new file mode 100644
index 0000000..fdf08c8
--- /dev/null
+++ b/adrs/001-resource-estimator-library.md
@@ -0,0 +1,161 @@
---
title: Resource Estimator Library
---

- **Author(s)**: Angel Luu (@aluu317)
- **Signer(s)**: Praveen Jayachandran, Ashok Pon Kumar Sree Prakash @ashokponkumar, Chander Govindarajan @ChanderG
- **Date (YYYY-MM-DD)**: 2024-10-31
- **Obsoletes ADRs**: N/A
- **Modified By ADRs**: N/A
- **Relevant Issues**: N/A

## Problem Context

Users of the tuning/training stack currently have no way of estimating how much memory, time, or cost it takes to run a training job. They often hit OOM errors due to lack of memory. Users don't have enough information to make trade-off decisions on time vs. cost. Platform admins do not have the information needed to better schedule and pack jobs onto GPUs.

In order to be useful, the capability of estimating resources must be exposed to tuning/training users. The primary user personas of this service include training users and platform admins.

This ADR defines a Resource Estimator Python library that provides an estimate of resource requirements for training runs.

## Impact Table

| AI Functionality | Operational Functionality |
| ---------------- | ------------------------- |
| Tuning Stack     | APIs                      |

## Decision

- We will expose the resource estimator service as a Python library `fm_training_estimator`, hosted as Open Source at the repo [fm-training-estimator](https://github.com/foundation-model-stack/fm-training-estimator) and published to [PyPI](https://pypi.org/).
- This Python library can be installed and plugged into any UI backend or Docker image by a product team.
- The `fm_training_estimator` exposes methods to calculate memory, time, tokens, and cost, plus a combined estimate. The method calls allow the user to pass training data as input for the "learned" or "hybrid" models. If training data is missing, the "theory" model is used.

### Alternatives to Python library deliverable
We have considered the following alternatives:
- Alternative 1: A new Docker image which has a FastAPI server with a REST interface defined. When a product team integrates the estimator as a service, they can run this Docker image; a server will run on localhost which can then be queried by GET/POST calls to do the estimates.

- Alternative 2: A new Docker image with a Python script, similar to fms-hf-tuning, which accepts a JSON config, calls the necessary Python scripts to compute the estimate, and saves the results to a file.

Both alternatives provide value to consumers. However, neither provides the flexibility of choosing how the library is integrated and consumed.

## Consequences

- By using this library, users need to supply their own dataset for the estimator to generate a learned model, and they assume responsibility for the security and privacy of that data. They can use the flight service plugin should that be applicable.
- The library can be used as the backend component of a larger UI effort, or as part of a Docker image. Product teams can consume the library however they see fit and create their own build/update process.

## High Level Design

- The `EstimateInput` data class (not all fields are required) defines the set of configs the library will use to calculate the results. This includes a list of instances of the `Config` data class, which in turn includes different types of configs (HF training args `HFArguments`, fms-hf-tuning additional args `FMArguments`, data args `DataArguments`, infrastructure args `InfraArguments`, and PEFT LoRA args `PeftLoraConfig`), and `EstimatorMetadata` with metadata parameters. The input can be read from a JSON file using `--input_file_path` or `-f`.

Example of an `EstimateInput` with all fields defined:
```json
{
  "estimator": { // EstimatorMetadata
    "base_data_path": "data.csv",
    "method": "theory", // theory, learned, hybrid
    "token_estimation_version": 0
  },
  "configs": [{ // list of [Config]
    "hf_training": { // HFArguments
      "output_dir": "./output"
    },
    "fm": { // FMArguments
      "base_model_path": "ibm-granite/granite-3b-code-base",
      "flash_attention_v2": "false",
      "lora_config": null,
      "max_seq_length": 2048,
      "block_size": 2048,
      "data_config_file": "data_config.json",
      "prompt_tuning_config": null,
      "torch_dtype": "float32",
      "technique": "full"
    },
    "data": { // DataArguments
      "te_approach": 0,
      "dataset": null,
      "dataset_text_field": "text",
      "dataset_split": "test",
      "dataset_config_name": null
    },
    "infra": { // InfraArguments
      "numGpusPerPod": 1,
      "numPods": 1,
      "gpu_memory_in_gb": 80,
      "gpuModel": "A100"
    },
    "peft_lora": { // PeftLoraConfig
      "r": 4,
      "lora_alpha": 8,
      "lora_dropout": 0.1,
      "target_modules": "[q_proj, v_proj]"
    }
  }]
}
```

- The API exposes five functions:

Function `estimate_memory` returns a `MemoryEstimate`:
```python
{
  "memory": { # MemoryEstimate
    "total_mem_estimate": "44.6 GiB",
    "activation_memory": "34.7 GiB",
    "gradient_memory": "2.5 GiB",
    "model_memory": "2.5 GiB",
    "optimizer_memory": "4.9 GiB",
    "num_gpus": 2
  }
}
```

Function `estimate_time` returns a `TimeEstimate`:
```python
{
  "time": { # TimeEstimate
    "time": "40s"
  }
}
```

Function `estimate_tokens` returns a `TokensEstimate`:
```python
{
  "tokens": { # TokensEstimate
    "tps": "5259.07373046875"
  }
}
```

Function `estimate_cost` returns a `CostEstimate`:
```python
{
  "cost": { # CostEstimate
    "usd": "1"
  }
}
```

Function `estimate` returns an `Estimate` that includes all four types of estimates above:
```python
{
  "estimate": { # Estimate
    "memory_estimate": { # MemoryEstimate
      "total_mem_estimate": "44.6 GiB",
      "activation_memory": "34.7 GiB",
      "gradient_memory": "2.5 GiB",
      "model_memory": "2.5 GiB",
      "optimizer_memory": "4.9 GiB",
      "num_gpus": 2
    },
    "time": { # TimeEstimate
      "time": "40s"
    },
    "tokens": { # TokensEstimate
      "tps": "5259.07373046875"
    },
    "cost": { # CostEstimate
      "usd": "1"
    }
  }
}
```
\ No newline at end of file
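For illustration, consuming the library described above might look like the following sketch. The dataclass and function names match the code added earlier in this patch series; the exact import paths and the lookup-data path are assumptions, not part of the ADR.

```python
# Sketch of using fm_training_estimator as a library (import paths assumed).
from fm_training_estimator.config.arguments import FMArguments, InfraArguments, JobConfig
from fm_training_estimator.ui.core import estimate_memory  # assumed entry point

job = JobConfig(
    fm=FMArguments(base_model_path="ibm-granite/granite-3b-code-base"),
    infra=InfraArguments(numGpusPerPod=1),
)

# lookup_data_path feeds the "learned"/"hybrid" models; omitting it falls back to theory.
memory = estimate_memory(job, lookup_data_path="data.csv")
print(memory.total_mem_estimate, memory.num_gpus)
```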
From cc0fae94d1e3f1f92d8458c431c6ae2244d1caa7 Mon Sep 17 00:00:00 2001
From: Angel Luu
Date: Mon, 21 Oct 2024 15:28:10 -0600
Subject: [PATCH 6/7] chore: Revert code changes

Signed-off-by: Angel Luu
---
 fm_training_estimator/ui/api.py  | 19 +------------
 fm_training_estimator/ui/core.py | 47 +-------------------------------
 2 files changed, 2 insertions(+), 64 deletions(-)

diff --git a/fm_training_estimator/ui/api.py b/fm_training_estimator/ui/api.py
index 21cd528..1bdd962 100644
--- a/fm_training_estimator/ui/api.py
+++ b/fm_training_estimator/ui/api.py
@@ -8,15 +8,8 @@
 import fire
 import uvicorn
 
-# First Party
-from fm_training_estimator.config.arguments import (
-    EstimateRequest,
-    MemoryEstimateResponse,
-    TimeEstimateResponse,
-)
-
 # Local
-from .core import estimate_memory, estimate_time, run
+from .core import run
 
 
 def api(data_path, model_path):
@@ -30,16 +23,6 @@ def estimate(config: Any = Body()):
         # types present in the output json which don't serialize out of the box
         return json.dumps(output, default=float)
 
-    @app.post("/api/time", response_model=TimeEstimateResponse)
-    def time(request: EstimateRequest):
-        conf = request.job_configs[0]
-        return estimate_time(conf, data_path, model_path)
-
-    @app.post("/api/memory", response_model=MemoryEstimateResponse)
-    def memory(request: EstimateRequest):
-        conf = request.job_configs[0]
-        return estimate_memory(conf, data_path, model_path)
-
     return app

diff --git a/fm_training_estimator/ui/core.py b/fm_training_estimator/ui/core.py
index cdbbfee..51d1449 100644
--- a/fm_training_estimator/ui/core.py
+++ b/fm_training_estimator/ui/core.py
@@ -1,11 +1,3 @@
-# First Party
-from fm_training_estimator.config.arguments import (
-    JobConfig,
-    MemoryEstimateResponse,
-    TimeEstimateResponse,
-    TuningTechnique,
-)
-
 # Local
 from ..config import is_fsdp, parse
 from ..memory import HybridEstimator, HybridLoraEstimator
@@ -14,49 +6,12 @@
 from ..utils import fmt_size
 
 
-def estimate_time(config: JobConfig, lookup_data_path=None, model_path=None):
-
-    return TimeEstimateResponse(time="to be implemented")
-
-
-def estimate_memory(config: JobConfig, lookup_data_path=None, model_path=None):
-    if config.fm.technique == TuningTechnique.LORA:
-        est = HybridLoraEstimator(
-            config.fm,
-            config.hf_training,
-            config.infra,
-            config.peft_lora,
-            lookup_data_path,
-            model_path,
-        )
-    else:
-        est = HybridEstimator(
-            config.fm, config.hf_training, config.infra, lookup_data_path, model_path
-        )
-
-    total_mem_estimate = fmt_size(float(est.get_total_mem_estimate()))
-    activation_memory = fmt_size(float(est.calculate_activation_memory()))
-    gradient_memory = fmt_size(float(est.calculate_gradient_memory()))
-    model_memory = fmt_size(float(est.calculate_model_memory()))
-    optimizer_memory = fmt_size(float(est.calculate_optimizer_memory()))
-    num_gpus = config.infra.numGpusPerPod
-
-    return MemoryEstimateResponse(
-        total_mem_estimate,
-        activation_memory,
-        gradient_memory,
-        model_memory,
-        optimizer_memory,
-        num_gpus,
-    )
-
-
 def run(config, lookup_data_path=None, model_path=None):
     res = {}
 
     fm, ta, ia, da, la = parse(config)
 
-    if config.fm.technique == "lora":
+    if fm.technique == "lora":
         est = HybridLoraEstimator(fm, ta, ia, la, lookup_data_path, model_path)
     else:
         est = HybridEstimator(fm, ta, ia, lookup_data_path, model_path)

From fdb8608e3ada6886c565e16a7c1094f3c6f5d60b Mon Sep 17 00:00:00 2001
From: Angel Luu
Date: Mon, 21 Oct 2024 16:03:36 -0600
Subject: [PATCH 7/7] chore: define dataclasses based on ADR

Signed-off-by: Angel Luu
---
 adrs/001-resource-estimator-library.md    |  8 +--
 fm_training_estimator/config/arguments.py | 61 +++++++++++++++++++++--
 2 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/adrs/001-resource-estimator-library.md b/adrs/001-resource-estimator-library.md
index fdf08c8..19a8872 100644
--- a/adrs/001-resource-estimator-library.md
+++ b/adrs/001-resource-estimator-library.md
@@ -54,7 +54,7 @@
     "method": "theory", // theory, learned, hybrid
     "token_estimation_version": 0
   },
-  "configs": [{ // list of [Config]
+  "job_configs": [{ // list of [JobConfig]
     "hf_training": { // HFArguments
       "output_dir": "./output"
     },
@@ -130,7 +130,7 @@
 ```python
 {
   "cost": { # CostEstimate
-    "usd": "1"
+    "usd": "0.0"
   }
 }
 ```
@@ -139,7 +139,7 @@ Function `estimate` returns an `Estimate` that includes all four types of estimates above:
 ```python
 {
   "estimate": { # Estimate
-    "memory_estimate": { # MemoryEstimate
+    "memory": { # MemoryEstimate
       "total_mem_estimate": "44.6 GiB",
       "activation_memory": "34.7 GiB",
       "gradient_memory": "2.5 GiB",
       "model_memory": "2.5 GiB",
       "optimizer_memory": "4.9 GiB",
       "num_gpus": 2
@@ -154,7 +154,7 @@ Function `estimate` returns an `Estimate` that includes all four types of estimates above:
       "tps": "5259.07373046875"
     },
     "cost": { # CostEstimate
-      "usd": "1"
+      "usd": "0.0"
     }
   }
 }

diff --git a/fm_training_estimator/config/arguments.py b/fm_training_estimator/config/arguments.py
index 6d7d726..e903d94 100644
--- a/fm_training_estimator/config/arguments.py
+++ b/fm_training_estimator/config/arguments.py
@@ -11,7 +11,7 @@
 @dataclass
 class PeftPromptTuningConfig(PromptTuningConfig):
-    """dataclass for promptuning config
+    """dataclass for prompt tuning config
 
     Args:
         PromptTuningConfig (_type_): imported directly from peft library
@@ -20,7 +20,7 @@
 
 @dataclass
 class PeftLoraConfig:
-    """Dataclass for lora config
+    """Dataclass for LoRA tuning config
 
     Not directly imported from peft LoraConfig due to complexity.
     """
@@ -68,8 +68,13 @@
 
 class TuningTechnique(Enum):
+    """Enumerate the different tuning techniques the FM Training Estimator can perform estimation on."""
+
     LORA = "lora"
+    """LoRA tuning technique."""
+
     FULL = "full"
+    """Full fine-tuning technique."""
 
 
 @dataclass
@@ -131,6 +136,8 @@
 class DataArguments:
+    """dataclass to define args for handling training data as input for estimation."""
+
     te_approach: int = field(
         default=0, metadata={"help": ("Approach to use for Token Estimation")}
     )
@@ -155,13 +162,22 @@
 
 class EstimatorMethod(Enum):
+    """Enumerate the estimation models the FM Training Estimator can use to make an estimation."""
+
     THEORY = "theory"
+    """Theory model for estimation."""
+
     LEARNED = "learned"
+    """Learned model for estimation, based on user-provided training data."""
+
     HYBRID = "hybrid"
+    """Hybrid model for estimation, a combination of the theory and learned models."""
 
 
 @dataclass
 class EstimatorMetadata:
+    """Metadata for the FM Training Estimator."""
+
     base_data_path: str
     method: List[EstimatorMethod]
     token_estimation_version: str
@@ -169,6 +185,8 @@
 @dataclass
 class JobConfig:
+    """Dataclass that represents the set of configs for a tuning job to make an estimate on."""
+
     hf_training: HFTrainingArguments = field(default_factory=HFTrainingArguments)
     fm: FMArguments = field(default_factory=FMArguments)
     data: DataArguments = field(default_factory=DataArguments)
     infra: InfraArguments = field(default_factory=InfraArguments)
     peft_lora: PeftLoraConfig = field(default_factory=PeftLoraConfig)
@@ -177,21 +195,54 @@
 
 @dataclass
-class EstimateRequest:
+class EstimateInput:
+    """
+    The dataclass that is the input to an estimate function.
+    It includes a list of different training job configs and metadata about the estimator.
+    """
+
     job_configs: List[JobConfig]
     estimator_metadata: Optional[EstimatorMetadata] = None
 
 
 @dataclass
-class TimeEstimateResponse:
+class TimeEstimate:
+    """The estimated time response returned by the estimate_time function."""
+
     time: str
 
 
 @dataclass
-class MemoryEstimateResponse:
+class MemoryEstimate:
+    """The estimated memory response returned by the estimate_memory function."""
+
     total_mem_estimate: str
     activation_memory: str
     gradient_memory: str
     model_memory: str
     optimizer_memory: str
     num_gpus: int
+
+
+@dataclass
+class TokenEstimate:
+    """The estimated tokens response returned by the estimate_tokens function."""
+
+    tps: float
+
+
+@dataclass
+class CostEstimate:
+    """The estimated cost response returned by the estimate_cost function."""
+
+    usd: float
+
+
+@dataclass
+class Estimate:
+    """The combined response returned by the estimate function, including memory, time, tokens, and cost."""
+
+    memory: MemoryEstimate
+    time: TimeEstimate
+    tokens: TokenEstimate
+    cost: CostEstimate
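Putting the final dataclasses together, an `EstimateInput` for a single job can be constructed as below. This sketch uses only names defined in this patch series; the field values are illustrative.

```python
# Illustrative construction of an EstimateInput from the dataclasses above.
from fm_training_estimator.config.arguments import (
    EstimateInput,
    EstimatorMetadata,
    EstimatorMethod,
    FMArguments,
    InfraArguments,
    JobConfig,
    TuningTechnique,
)

# Estimator-level settings: where lookup data lives and which model to use.
metadata = EstimatorMetadata(
    base_data_path="data.csv",
    method=[EstimatorMethod.HYBRID],
    token_estimation_version="0",
)

# One tuning job; configs not passed here fall back to their default factories.
job = JobConfig(
    fm=FMArguments(technique=TuningTechnique.FULL),
    infra=InfraArguments(numGpusPerPod=1),
)

estimate_input = EstimateInput(job_configs=[job], estimator_metadata=metadata)
```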