Update multimodal dep #75

Merged: 12 commits, Mar 31, 2024
3 changes: 1 addition & 2 deletions pyproject.toml
@@ -41,8 +41,7 @@ dependencies = [
"constructs >=10.0.0,<10.1.289",
"fsspec >=2023.5.0,<=2023.6.0",
"matplotlib >=3.4,<3.8",
"pandas >=1.4.1,<2.2.0",
"pydantic>=1.10.4,<2.0", # https://github.com/ray-project/ray/issues/36990
"pandas >=2.0.0,<2.2.0",
"pyyaml >=5.4,<7",
"ray[default] >=2.6.3,<2.7",
"s3fs >=2023.5.0,<=2023.6.0",
2 changes: 1 addition & 1 deletion sample_configs/dataloaders/text_dataloader.py
@@ -10,7 +10,7 @@
logger = logging.getLogger(__name__)


class TextDataLoaer:
class TextDataLoader:
def __init__(
self,
dataset_name: str,
2 changes: 1 addition & 1 deletion sample_configs/dataloaders/vision_dataloader.py
@@ -16,7 +16,7 @@ def path_expander(path, base_folder):
logger = logging.getLogger(__name__)


class VisionDataLoaer:
class VisionDataLoader:
def __init__(self, dataset_name: str, dataset_config_file: str, split: str = "train"):
with open(dataset_config_file, "r") as f:
config = yaml.safe_load(f)
8 changes: 4 additions & 4 deletions sample_configs/multimodal_local_configs.yaml
@@ -4,14 +4,14 @@ mode: local # required
benchmark_name: ag_bench # required
root_dir: ag_bench_runs # optional, default = "ag_bench_runs"
# METRICS_BUCKET: autogluon-benchmark-metrics # optional, required only if you want to upload metrics to S3
constraint: 10m4x

# Multimodal specific
framework: AutoGluon_stable # required
dataset_name: # required
melbourne_airbnb
framework: AutoGluon_branch # required
dataset_name: clipart

#### Customizations ####
# custom_resource_dir: sample_configs/resources/ # path to custom multimodal_frameworks.yaml and multimodal_constraints.yaml
custom_resource_dir: sample_configs/resources/ # path to custom multimodal_frameworks.yaml and multimodal_constraints.yaml
# custom_dataloader:
# dataloader_file: sample_configs/dataloaders/vision_dataloader.py # relative path to WORKDIR
# class_name: VisionDataLoader
2 changes: 1 addition & 1 deletion sample_configs/resources/multimodal_constraints.yaml
@@ -1,5 +1,5 @@
10m4x:
TIME_LIMIT: 500
TIME_LIMIT: 100
INSTANCE: g4dn.4xlarge
# MAX_MACHINE_NUM: 20 # optional, default 20
# BLOCK_DEVICE_VOLUME: 100 # optional, default 100GB
6 changes: 3 additions & 3 deletions sample_configs/resources/multimodal_frameworks.yaml
@@ -1,9 +1,9 @@
AutoGluon_branch:
repo: https://github.com/autogluon/autogluon.git
version: stable_GA4_update
version: master
params: # MultimodalPredictor.fit(params)
presets: medium_quality
time_limit: 90
time_limit: 500
hyperparameters:
optimization.max_epochs: 1
optimization.max_epochs: 50
optimization.learning_rate: 0.005
@@ -2,6 +2,7 @@
import itertools
import logging
import os
import re
import zipfile

import requests
@@ -228,7 +229,12 @@ def generate_config_combinations(config, metrics_bucket, batch_job_queue, batch_
config_s3_path = upload_config(config_list=job_configs, bucket=metrics_bucket, benchmark_name=benchmark_name)
env = [{"name": "config_file", "value": config_s3_path}]
job_type = "array" if len(job_configs) > 1 else "single"
job_name = f"{benchmark_name}-{config['module']}-{config['framework']}-{config['constraint']}-{job_type}-job"
constraint = config.get("amlb_constraint") or config.get("constraint")
job_name = f"{benchmark_name}-{config['module']}-{config['framework']}-{constraint}-{job_type}-job"
job_name = re.sub(r"(?![-_])\W", "-", job_name)[
:128
] # AWS Batch job names can only contain letters, numbers, "-" and "_"

parent_job_id = submit_batch_job(
env=env,
job_name=job_name,
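For reference, the job-name change above first falls back from "amlb_constraint" to "constraint", then sanitizes the name so AWS Batch accepts it (letters, digits, "-" and "_" only, 128 characters max). A minimal sketch of that sanitization; build_job_name is a hypothetical helper used here for illustration, not a function in the codebase:

import re

def build_job_name(benchmark_name, module, framework, constraint, job_type):
    # AWS Batch job names may only contain letters, digits, "-" and "_", up to 128 characters.
    name = f"{benchmark_name}-{module}-{framework}-{constraint}-{job_type}-job"
    # Replace every character that is neither a word character nor "-" with "-".
    return re.sub(r"(?![-_])\W", "-", name)[:128]

build_job_name("ag_bench", "multimodal", "AutoGluon branch (master)", "10m4x", "array")
# -> "ag_bench-multimodal-AutoGluon-branch--master--10m4x-array-job"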
31 changes: 6 additions & 25 deletions src/autogluon/bench/datasets/multimodal_dataset.py
@@ -51,7 +51,7 @@ def __init__(self, split: str, dataset_name: str, data_info: dict):
try:
ext = os.path.splitext(data_info[split]["url"])[-1]
self._path = os.path.join(get_data_home_dir(), dataset_name, f"{split}{ext}")
download(data_info[split]["url"], path=self._path, sha1_hash=data_info[split]["sha1sum"])
download(data_info[split]["url"], path=self._path)
if ext == ".csv":
self._data = pd.read_csv(self._path)
elif ext == ".pq":
@@ -119,15 +119,14 @@ class Shopee(BaseImageDataset):
_INFO = {
"data": {
"url": get_repo_url() + "vision_datasets/shopee.zip",
"sha1sum": "59dffcfd0921cf0aa97215550dee3d1e3de656ca",
},
}
_registry_name = "shopee"

def __init__(self, split="train"):
self._split = split
self._path = os.path.join(get_data_home_dir(), "shopee")
load_zip.unzip(self._INFO["data"]["url"], unzip_dir=self._path, sha1sum=self._INFO["data"]["sha1sum"])
load_zip.unzip(self._INFO["data"]["url"], unzip_dir=self._path)
self._base_folder = os.path.join(self._path, "shopee")
try:
data_path = os.path.join(self._base_folder, f"{self._split}.csv")
@@ -175,15 +174,14 @@ class StanfordOnline(BaseMatcherDataset):
_INFO = {
"data": {
"url": get_repo_url() + "Stanford_Online_Products.zip",
"sha1sum": "4951af1dfcceb54b9b8f2126e995668e1b139cec",
},
}
_registry_name = "stanford_online"

def __init__(self, split="train"):
self._split = split
self._path = os.path.join(get_data_home_dir(), "Stanford_Online_Products")
load_zip.unzip(self._INFO["data"]["url"], unzip_dir=self._path, sha1sum=self._INFO["data"]["sha1sum"])
load_zip.unzip(self._INFO["data"]["url"], unzip_dir=self._path)
self._base_folder = os.path.join(self._path, "Stanford_Online_Products")
try:
self._data = pd.read_csv(os.path.join(self._base_folder, f"{self._split}.csv"), index_col=0)
@@ -229,7 +227,7 @@ def problem_type(self):
class Flickr30k(BaseMatcherDataset):
_SOURCE = "https://paperswithcode.com/dataset/flickr30k"
_INFO = {
"data": {"url": get_repo_url() + "flickr30k.zip", "sha1sum": "13d879429cff00022966324ab486d3317017d706"},
"data": {"url": get_repo_url() + "flickr30k.zip"},
}
_registry_name = "flickr30k"

@@ -293,17 +291,16 @@ class SNLI(BaseMatcherDataset):
_INFO = {
"train": {
"url": get_repo_url() + "snli/snli_train.csv",
"sha1sum": "2ebac97d99112f0817a0070dc48826f08ae2b42b",
},
"test": {"url": get_repo_url() + "snli/snli_test.csv", "sha1sum": "87d304ad75b3d64f0f58e316befc7aeba4729b8f"},
"test": {"url": get_repo_url() + "snli/snli_test.csv"},
}
_registry_name = "snli"

def __init__(self, split="train"):
self._split = split
self._path = os.path.join(get_data_home_dir(), "snli", f"{split}.csv")
try:
download(self._INFO[split]["url"], path=self._path, sha1_hash=self._INFO[split]["sha1sum"])
download(self._INFO[split]["url"], path=self._path)
self._data = pd.read_csv(self._path, delimiter="|")
except Exception:
logger.warn(f"The data split {self._split} is not available.")
@@ -344,11 +341,9 @@ class MitMovies(BaseMultiModalDataset):
_INFO = {
"train": {
"url": get_repo_url() + "ner/mit-movies/train_v2.csv",
"sha1sum": "6732ddd21040ab8cd14418f4970af280b4b38a7a",
},
"test": {
"url": get_repo_url() + "ner/mit-movies/test_v2.csv",
"sha1sum": "99040f5f9d4990f62498ad0deeebc472a97e7885",
},
}
_registry_name = "mit_movies"
@@ -382,11 +377,9 @@ class WomenClothingReview(BaseMultiModalDataset):
_INFO = {
"train": {
"url": get_repo_url() + "women_clothing_review/train.pq",
"sha1sum": "980023e4c063eae51adafc98482610a9a6a1878b",
},
"test": {
"url": get_repo_url() + "women_clothing_review/test.pq",
"sha1sum": "fbc84f757b8a08210a772613ca8342f3990eb1f7",
},
}
_registry_name = "women_clothing_review"
@@ -437,11 +430,9 @@ class MelBourneAirBnb(BaseMultiModalDataset):
_INFO = {
"train": {
"url": get_repo_url() + "airbnb_melbourne/train.pq",
"sha1sum": "49f7d95df663d1199e6d860102d5863e48765caf",
},
"test": {
"url": get_repo_url() + "airbnb_melbourne/test.pq",
"sha1sum": "c28611514b659295fe4b345c3995005719499946",
},
}
_registry_name = "melbourne_airbnb"
@@ -506,11 +497,9 @@ class AEPricePrediction(BaseMultiModalDataset):
_INFO = {
"train": {
"url": get_repo_url() + "ae_price_prediction/train.pq",
"sha1sum": "5b8a6327cc9429176d58af33ca3cc3480fe6c759",
},
"test": {
"url": get_repo_url() + "ae_price_prediction/test.pq",
"sha1sum": "7bebcaae48410386f610fd7a9c37ba0e89602858",
},
}
_registry_name = "ae_price_prediction"
@@ -556,11 +545,9 @@ class IMDBGenrePrediction(BaseMultiModalDataset):
_INFO = {
"train": {
"url": get_repo_url() + "imdb_genre_prediction/train.csv",
"sha1sum": "56d2d5e3b19663d033fdfb6e33e4eb9c79c67864",
},
"test": {
"url": get_repo_url() + "imdb_genre_prediction/test.csv",
"sha1sum": "0e435e917159542d725d21135cfa514ae936d2c1",
},
}
_registry_name = "imdb_genre_prediction"
@@ -606,11 +593,9 @@ class JCPennyCategory(BaseMultiModalDataset):
_INFO = {
"train": {
"url": get_repo_url() + "jc_penney_products/train.csv",
Collaborator:
Why did we need the SHAs earlier, and why are we removing them now?

Collaborator (Author):
The SHA is not a required field; it was only there as a sanity check. Since we sometimes have to update the datasets, the SHAs would have to be updated as well, which is too much hassle.
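For context, the dropped "sha1sum" fields fed an optional integrity check on the downloaded files. A rough sketch of what such a check amounts to; verify_sha1 is a hypothetical helper written here for illustration, not the project's actual download code:

import hashlib

def verify_sha1(path, expected_sha1=None, chunk_size=1024 * 1024):
    # If no hash is supplied (as after this PR), skip verification entirely.
    if expected_sha1 is None:
        return True
    sha1 = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            sha1.update(chunk)
    return sha1.hexdigest() == expected_sha1

Dropping the hashes trades this integrity check for not having to recompute and commit a new digest every time a dataset file is refreshed.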

"sha1sum": "b59ce843ad05073a3fccf5ebc4840b3b0649f059",
},
"test": {
"url": get_repo_url() + "jc_penney_products/test.csv",
"sha1sum": "23bca284354deec13a11ef7bd726d35a01eb1332",
},
}
_registry_name = "jc_penney_products"
@@ -656,11 +641,9 @@ class NewsPopularity(BaseMultiModalDataset):
_INFO = {
"train": {
"url": get_repo_url() + "news_popularity2/train.csv",
"sha1sum": "390b15e77fa77a2722ce2d459a977034a9565f46",
},
"test": {
"url": get_repo_url() + "news_popularity2/test.csv",
"sha1sum": "297253bdca18f6aafbaee0262be430126c1f9044",
},
}
_registry_name = "news_popularity"
@@ -706,11 +689,9 @@ class NewsChannel(BaseMultiModalDataset):
_INFO = {
"train": {
"url": get_repo_url() + "news_channel/train.csv",
"sha1sum": "ab226210b6a878b449d01f33a195014c65c22311",
},
"test": {
"url": get_repo_url() + "news_channel/test.csv",
"sha1sum": "a71516784ce6e168bd9933e9ec50080f65cb05fd",
},
}
_registry_name = "news_channel"
4 changes: 1 addition & 3 deletions src/autogluon/bench/datasets/object_detection_dataset.py
@@ -22,7 +22,7 @@ def __init__(self, split: str, dataset_name: str, data_info: dict):
split (str): Specifies the dataset split. It should be one of the following options: 'train', 'val', 'test'.
"""
self._path = os.path.join(get_data_home_dir(), dataset_name)
load_zip.unzip(data_info["data"]["url"], unzip_dir=self._path, sha1sum=data_info["data"]["sha1sum"])
load_zip.unzip(data_info["data"]["url"], unzip_dir=self._path)
self._base_folder = os.path.join(self._path, dataset_name)
self._data_path = os.path.join(self._base_folder, "Annotations", f"{split}_cocoformat.json")
if not os.path.exists(self._data_path):
@@ -54,7 +54,6 @@ class TinyMotorbike(BaseObjectDetectionDataset):
_INFO = {
"data": {
"url": get_repo_url() + "object_detection_dataset/tiny_motorbike_coco.zip",
"sha1sum": "45c883b2feb0721d6eef29055fa28fb46b6e5346",
},
}
_registry_name = "tiny_motorbike"
@@ -81,7 +80,6 @@ class Clipart(BaseObjectDetectionDataset):
_INFO = {
"data": {
"url": get_repo_url() + "few_shot_object_detection/clipart.zip",
"sha1sum": "d25b2f905da597d7857297ac8e3efe4555e0bf32",
},
}
_registry_name = "clipart"
33 changes: 32 additions & 1 deletion src/autogluon/bench/frameworks/multimodal/exec.py
@@ -4,10 +4,14 @@
import json
import logging
import os
import random
import time
from datetime import datetime
from typing import Optional, Union

import numpy as np
from sklearn.model_selection import train_test_split

from autogluon.bench.datasets.dataset_registry import multimodal_dataset_registry
from autogluon.core.metrics import make_scorer
from autogluon.multimodal import MultiModalPredictor
@@ -28,6 +32,11 @@ def _flatten_dict(data):
return flattened


def set_seed(seed):
np.random.seed(seed)
random.seed(seed)


def get_args():
parser = argparse.ArgumentParser()

@@ -45,6 +54,7 @@ def get_args():
"--custom_dataloader", type=str, default=None, help="Custom dataloader to use in the benchmark."
)
parser.add_argument("--custom_metrics", type=str, default=None, help="Custom metrics to use in the benchmark.")
parser.add_argument("--time_limit", type=int, default=None, help="Time limit used to fit the predictor.")

args = parser.parse_args()
return args
@@ -149,6 +159,7 @@ def run(
params: Optional[dict] = None,
custom_dataloader: Optional[dict] = None,
custom_metrics: Optional[dict] = None,
time_limit: Optional[int] = None,
):
"""Runs the AutoGluon multimodal benchmark on a given dataset.

@@ -181,7 +192,16 @@
Returns:
None
"""
seed = params.get("seed", 42)
set_seed(seed)

train_data, val_data, test_data = load_dataset(dataset_name=dataset_name, custom_dataloader=custom_dataloader)
if test_data.data is None:
logger.warning("No test data found, splitting test data from train data")
train_set, test_set = train_test_split(train_data.data, test_size=0.2, random_state=seed)
train_data.data = train_set
test_data.data = test_set

try:
label_column = train_data.label_columns[0]
except (AttributeError, IndexError): # Object Detection does not have label columns
@@ -219,6 +239,12 @@

predictor = MultiModalPredictor(**predictor_args)

if time_limit is not None:
params["time_limit"] = time_limit
logger.warning(
f'params["time_limit"] is being overridden by time_limit specified in constraints.yaml. params["time_limit"] = {time_limit}'
)

fit_args = {"train_data": train_data.data, "tuning_data": val_data.data, **params}

utc_time = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S")
@@ -256,6 +282,9 @@ def run(
framework, version = framework, ag_version

metric_name = test_data.metric if metrics_func is None else metrics_func.name
primary_metric = metric_name[0] if isinstance(metric_name, list) else metric_name
result = scores[primary_metric]

if hasattr(train_data, "id"):
id = f"id/{train_data.id}"
else:
@@ -268,7 +297,8 @@
"version": version,
"fold": 0,
"type": predictor.problem_type,
"metric": metric_name,
"metric": primary_metric,
"result": result,
"utc": utc_time,
"training_duration": training_duration,
"predict_duration": predict_duration,
@@ -296,4 +326,5 @@
params=args.params,
custom_dataloader=args.custom_dataloader,
custom_metrics=args.custom_metrics,
time_limit=args.time_limit,
)
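To summarize the new precedence: a time limit coming from the constraint (via the new --time_limit argument) overrides whatever time_limit is set in the framework params. A small illustrative sketch using the values from the sample configs above; the wiring is assumed, not the exact call chain:

params = {"presets": "medium_quality", "time_limit": 500}  # from multimodal_frameworks.yaml
time_limit = 100  # from multimodal_constraints.yaml (10m4x: TIME_LIMIT)

if time_limit is not None:
    # The constraint-level limit wins over the framework-level one.
    params["time_limit"] = time_limit

assert params["time_limit"] == 100  # fit() is then called with **params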