Skip to content
New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Predefined instances for AWS EC2 #14

Merged
merged 7 commits into from
Jul 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,22 @@ print(sco)
# spark.sql.shuffle.partitions: 600
```

### Predefined Instance

You can look up a predefined `Instance` by its instance type name.
Currently, only AWS EC2 instance types are supported.

```python
from scopt.instances.aws import AwsInstanceMap

mapping = AwsInstanceMap()

mapping['r5.4xlarge']
# Instance(num_cores=16, memory_size=120)
mapping['p3.8xlarge']
# Instance(num_cores=4, memory_size=236)
```

### Set properties to SparkConf

You can set properties to SparkConf directly via the `as_list` method.
Expand Down
4 changes: 4 additions & 0 deletions docs/update_aws_instance_mapping.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Update AWS instance mapping

Run the Python script `tools/scrape_ec2_config.py`; it prints the results to stdout as a dictionary literal.
Copy and paste the output into the `_instance_dict` property of `AwsInstanceMap`.
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ test =
build =
wheel
twine
beautifulsoup4

[options.packages.find]
where = src
Expand Down
229 changes: 229 additions & 0 deletions src/scopt/instances/aws.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
from typing import Dict

from scopt.instances import Instance

# EMR instance types.
# Before adding a new instance type, check the following URL to confirm how
# much memory a Spark executor can use on it.
# `yarn.nodemanager.resource.memory-mb` is the maximum for a single executor.
# https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html


class AwsInstanceMap:
    """Read-only lookup of predefined ``Instance`` objects for AWS EC2 types.

    Keys are EC2 instance type names such as ``'r5.4xlarge'``; each value is
    built as ``Instance(<cores>, <memory>)``.  Only ``__getitem__`` is
    defined, so item assignment is not supported.

    Example::

        >>> from scopt.instances.aws import AwsInstanceMap
        >>> mapping = AwsInstanceMap()
        >>> print(mapping['r5.4xlarge'])
        Instance(num_cores=16, memory_size=120)
    """

    def __getitem__(self, key: str) -> Instance:
        """Return the predefined ``Instance`` registered under *key*.

        Raises:
            KeyError: if *key* is not one of the predefined instance types.
        """
        return self._instance_dict[key]

    @property
    def _instance_dict(self) -> Dict[str, Instance]:
        """Build and return the full name -> ``Instance`` mapping.

        A new dict (with new ``Instance`` objects) is constructed on every
        access.  The literal below is the output of
        ``tools/scrape_ec2_config.py`` pasted as-is, so keep its formatting
        when regenerating it.
        """
        # NOTE(review): for the GPU families (g3, g3s, g4dn, p2, p3) the first
        # value looks like the GPU count rather than the vCPU count (e.g.
        # 'p3.8xlarge' -> 4) — confirm against the EC2 instance-type page.
        # NOTE(review): 'm5zn.2xlarge' carries the same memory value (11) as
        # 'm5zn.xlarge' — verify against the EMR configuration table.
        return {
            'c4.large': Instance(2, 1),
            'c4.xlarge': Instance(4, 5),
            'c4.2xlarge': Instance(8, 11),
            'c4.4xlarge': Instance(16, 22),
            'c4.8xlarge': Instance(36, 52),
            'c5.xlarge': Instance(4, 6),
            'c5.2xlarge': Instance(8, 12),
            'c5.4xlarge': Instance(16, 24),
            'c5.9xlarge': Instance(36, 64),
            'c5.12xlarge': Instance(48, 88),
            'c5.18xlarge': Instance(72, 136),
            'c5.24xlarge': Instance(96, 184),
            'c5a.xlarge': Instance(4, 5),
            'c5a.2xlarge': Instance(8, 11),
            'c5a.4xlarge': Instance(16, 22),
            'c5a.8xlarge': Instance(32, 53),
            'c5a.12xlarge': Instance(48, 88),
            'c5a.16xlarge': Instance(64, 114),
            'c5a.24xlarge': Instance(96, 175),
            'c5ad.xlarge': Instance(4, 5),
            'c5ad.2xlarge': Instance(8, 11),
            'c5ad.4xlarge': Instance(16, 22),
            'c5ad.8xlarge': Instance(32, 53),
            'c5ad.12xlarge': Instance(48, 83),
            'c5ad.16xlarge': Instance(64, 114),
            'c5ad.24xlarge': Instance(96, 175),
            'c5d.xlarge': Instance(4, 6),
            'c5d.2xlarge': Instance(8, 12),
            'c5d.4xlarge': Instance(16, 24),
            'c5d.9xlarge': Instance(36, 64),
            'c5d.18xlarge': Instance(72, 136),
            'c5n.xlarge': Instance(4, 7),
            'c5n.2xlarge': Instance(8, 15),
            'c5n.4xlarge': Instance(16, 34),
            'c5n.9xlarge': Instance(36, 88),
            'c5n.18xlarge': Instance(72, 184),
            'c6g.xlarge': Instance(4, 5),
            'c6g.2xlarge': Instance(8, 11),
            'c6g.4xlarge': Instance(16, 22),
            'c6g.8xlarge': Instance(32, 53),
            'c6g.12xlarge': Instance(48, 83),
            'c6g.16xlarge': Instance(64, 114),
            'c6gd.xlarge': Instance(4, 5),
            'c6gd.2xlarge': Instance(8, 11),
            'c6gd.4xlarge': Instance(16, 22),
            'c6gd.8xlarge': Instance(32, 53),
            'c6gd.12xlarge': Instance(48, 83),
            'c6gd.16xlarge': Instance(64, 114),
            'c6gn.xlarge': Instance(4, 5),
            'c6gn.2xlarge': Instance(8, 11),
            'c6gn.4xlarge': Instance(16, 22),
            'c6gn.8xlarge': Instance(32, 53),
            'c6gn.12xlarge': Instance(48, 83),
            'c6gn.16xlarge': Instance(64, 114),
            'd2.xlarge': Instance(4, 22),
            'd2.2xlarge': Instance(8, 53),
            'd2.4xlarge': Instance(16, 114),
            'd2.8xlarge': Instance(36, 236),
            'd3.xlarge': Instance(4, 22),
            'd3.2xlarge': Instance(8, 53),
            'd3.4xlarge': Instance(16, 114),
            'd3.8xlarge': Instance(32, 236),
            'd3en.xlarge': Instance(4, 11),
            'd3en.2xlarge': Instance(8, 22),
            'd3en.4xlarge': Instance(16, 53),
            'd3en.6xlarge': Instance(24, 83),
            'd3en.8xlarge': Instance(32, 114),
            'd3en.12xlarge': Instance(48, 175),
            'g3.4xlarge': Instance(1, 114),
            'g3.8xlarge': Instance(2, 236),
            'g3.16xlarge': Instance(4, 480),
            'g3s.xlarge': Instance(1, 22),
            'g4dn.xlarge': Instance(1, 12),
            'g4dn.2xlarge': Instance(1, 24),
            'g4dn.4xlarge': Instance(1, 56),
            'g4dn.8xlarge': Instance(1, 120),
            'g4dn.12xlarge': Instance(4, 184),
            'g4dn.16xlarge': Instance(1, 248),
            'i3.xlarge': Instance(4, 22),
            'i3.2xlarge': Instance(8, 53),
            'i3.4xlarge': Instance(16, 114),
            'i3.8xlarge': Instance(32, 236),
            'i3.16xlarge': Instance(64, 480),
            'i3en.xlarge': Instance(4, 24),
            'i3en.2xlarge': Instance(8, 56),
            'i3en.3xlarge': Instance(12, 88),
            'i3en.6xlarge': Instance(24, 184),
            'i3en.12xlarge': Instance(48, 376),
            'i3en.24xlarge': Instance(96, 760),
            'm4.large': Instance(2, 6),
            'm4.xlarge': Instance(4, 12),
            'm4.2xlarge': Instance(8, 24),
            'm4.4xlarge': Instance(16, 56),
            'm4.10xlarge': Instance(40, 152),
            'm4.16xlarge': Instance(64, 248),
            'm5.xlarge': Instance(4, 12),
            'm5.2xlarge': Instance(8, 24),
            'm5.4xlarge': Instance(16, 56),
            'm5.8xlarge': Instance(32, 120),
            'm5.12xlarge': Instance(48, 184),
            'm5.16xlarge': Instance(64, 248),
            'm5.24xlarge': Instance(96, 376),
            'm5a.xlarge': Instance(4, 12),
            'm5a.2xlarge': Instance(8, 24),
            'm5a.4xlarge': Instance(16, 56),
            'm5a.8xlarge': Instance(32, 120),
            'm5a.12xlarge': Instance(48, 184),
            'm5a.16xlarge': Instance(64, 248),
            'm5a.24xlarge': Instance(96, 376),
            'm5d.xlarge': Instance(4, 12),
            'm5d.2xlarge': Instance(8, 24),
            'm5d.4xlarge': Instance(16, 56),
            'm5d.8xlarge': Instance(32, 120),
            'm5d.12xlarge': Instance(48, 184),
            'm5d.16xlarge': Instance(64, 248),
            'm5d.24xlarge': Instance(96, 376),
            'm5zn.xlarge': Instance(4, 11),
            'm5zn.2xlarge': Instance(8, 11),
            'm5zn.3xlarge': Instance(12, 37),
            'm5zn.6xlarge': Instance(24, 83),
            'm5zn.12xlarge': Instance(48, 175),
            'm6g.xlarge': Instance(4, 11),
            'm6g.2xlarge': Instance(8, 22),
            'm6g.4xlarge': Instance(16, 53),
            'm6g.8xlarge': Instance(32, 114),
            'm6g.12xlarge': Instance(48, 177),
            'm6g.16xlarge': Instance(64, 236),
            'm6gd.xlarge': Instance(4, 11),
            'm6gd.2xlarge': Instance(8, 22),
            'm6gd.4xlarge': Instance(16, 53),
            'm6gd.8xlarge': Instance(32, 114),
            'm6gd.12xlarge': Instance(48, 177),
            'm6gd.16xlarge': Instance(64, 236),
            'p2.xlarge': Instance(1, 53),
            'p2.8xlarge': Instance(8, 480),
            'p2.16xlarge': Instance(16, 724),
            'p3.2xlarge': Instance(1, 53),
            'p3.8xlarge': Instance(4, 236),
            'p3.16xlarge': Instance(8, 480),
            'r4.xlarge': Instance(4, 22),
            'r4.2xlarge': Instance(8, 53),
            'r4.4xlarge': Instance(16, 114),
            'r4.8xlarge': Instance(32, 236),
            'r4.16xlarge': Instance(64, 480),
            'r5.xlarge': Instance(4, 24),
            'r5.2xlarge': Instance(8, 56),
            'r5.4xlarge': Instance(16, 120),
            'r5.8xlarge': Instance(32, 248),
            'r5.12xlarge': Instance(48, 376),
            'r5.16xlarge': Instance(64, 504),
            'r5.24xlarge': Instance(96, 760),
            'r5a.xlarge': Instance(4, 24),
            'r5a.2xlarge': Instance(8, 56),
            'r5a.4xlarge': Instance(16, 120),
            'r5a.8xlarge': Instance(32, 248),
            'r5a.12xlarge': Instance(48, 376),
            'r5a.16xlarge': Instance(64, 504),
            'r5a.24xlarge': Instance(96, 760),
            'r5b.xlarge': Instance(4, 22),
            'r5b.2xlarge': Instance(8, 53),
            'r5b.4xlarge': Instance(16, 114),
            'r5b.8xlarge': Instance(32, 236),
            'r5b.12xlarge': Instance(48, 358),
            'r5b.16xlarge': Instance(64, 480),
            'r5b.24xlarge': Instance(96, 724),
            'r5d.xlarge': Instance(4, 24),
            'r5d.2xlarge': Instance(8, 56),
            'r5d.4xlarge': Instance(16, 120),
            'r5d.8xlarge': Instance(32, 248),
            'r5d.12xlarge': Instance(48, 376),
            'r5d.16xlarge': Instance(64, 504),
            'r5d.24xlarge': Instance(96, 760),
            'r5dn.xlarge': Instance(4, 22),
            'r5dn.2xlarge': Instance(8, 53),
            'r5dn.4xlarge': Instance(16, 114),
            'r5dn.8xlarge': Instance(32, 236),
            'r5dn.12xlarge': Instance(48, 358),
            'r5dn.16xlarge': Instance(64, 480),
            'r5dn.24xlarge': Instance(96, 724),
            'r6g.xlarge': Instance(4, 22),
            'r6g.2xlarge': Instance(8, 53),
            'r6g.4xlarge': Instance(16, 114),
            'r6g.8xlarge': Instance(32, 236),
            'r6g.12xlarge': Instance(48, 358),
            'r6g.16xlarge': Instance(64, 480),
            'r6gd.xlarge': Instance(4, 22),
            'r6gd.2xlarge': Instance(8, 53),
            'r6gd.4xlarge': Instance(16, 114),
            'r6gd.8xlarge': Instance(32, 236),
            'r6gd.12xlarge': Instance(48, 358),
            'r6gd.16xlarge': Instance(64, 480),
            'z1d.xlarge': Instance(4, 24),
            'z1d.2xlarge': Instance(8, 56),
            'z1d.3xlarge': Instance(12, 88),
            'z1d.6xlarge': Instance(24, 184),
            'z1d.12xlarge': Instance(48, 376),
        }
25 changes: 25 additions & 0 deletions tests/instances/test_aws.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pytest

from scopt.instances import Instance
from scopt.instances.aws import AwsInstanceMap


class TestAwsInstanceMap:
    """Behavioural checks for the predefined AWS instance lookup."""

    def test_getitem(self) -> None:
        """Subscripting with a known name yields the stored ``Instance``."""
        aws_map = AwsInstanceMap()
        entries = list(aws_map._instance_dict.items())
        name, expected = entries[0]
        assert aws_map[name] == expected

    def test_invalid_item_key(self) -> None:
        """An unknown instance type name raises ``KeyError``."""
        aws_map = AwsInstanceMap()
        with pytest.raises(KeyError):
            aws_map['not_exist']

    def test_immutability(self) -> None:
        """Item assignment is rejected with ``TypeError``."""
        aws_map = AwsInstanceMap()
        with pytest.raises(TypeError):
            aws_map['dummy'] = Instance(1, 1)

    def test_num_support_instances(self) -> None:
        """The number of predefined instance types stays at the known total."""
        expected_total = 196
        aws_map = AwsInstanceMap()
        assert len(aws_map._instance_dict) == expected_total
83 changes: 83 additions & 0 deletions tools/scrape_ec2_config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import math
from dataclasses import dataclass
from typing import List, Optional

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag


@dataclass
class InstanceMemoryInfo:
    """Memory size scraped for a single instance type."""

    # Instance type name as shown in the table title.
    name: str
    # Memory size; the scraper stores whole gigabytes here.
    memory: int


@dataclass
class InstanceInfo:
    """Core count and memory size scraped for a single instance type."""

    # Instance type name.
    name: str
    # Number of cores read from the core table.
    core: int
    # Memory size copied from the matching ``InstanceMemoryInfo``.
    memory: int

    def as_dict_element(self) -> str:
        """Render the record as one ``'name': Instance(core, memory)`` entry.

        The result has no trailing comma; the caller adds it when printing
        the dictionary literal.
        """
        constructor_call = f'Instance({self.core}, {self.memory})'
        return f"'{self.name}': {constructor_call}"


def scrape(core: BeautifulSoup, memory: BeautifulSoup) -> List[InstanceInfo]:
    """Combine per-instance memory tables with core counts.

    Args:
        core: Parsed page whose ``<tr>`` rows mention an instance type name
            and hold the core count in their second ``<td>`` cell.
        memory: Parsed page containing one ``div.table-container`` per
            instance type.

    Returns:
        One ``InstanceInfo`` per memory table for which a row mentioning the
        instance name exists in ``core``, in memory-table order.  Instance
        types without a matching row are dropped.

    Raises:
        AttributeError: propagated from ``parse_memory_table`` when a table
            lacks the memory setting.
        IndexError: if a matching row has fewer than two ``<td>`` cells.
        ValueError: if the second cell of a matching row is not an integer.
    """
    memory_tables = memory.find_all('div', attrs={'class': 'table-container'})
    memory_info_list = [parse_memory_table(t) for t in memory_tables]

    # Collect and serialise the core-table rows once.  Doing this inside
    # ``parse_core`` repeated the whole-document search and the HTML
    # serialisation for every instance type.
    core_rows = [(str(row), row) for row in core.find_all('tr')]

    def parse_core(memory_info: InstanceMemoryInfo) -> Optional[InstanceInfo]:
        name = memory_info.name
        for row_html, row in core_rows:
            # Substring match on the row's HTML; the first hit wins.
            if name in row_html:
                # NOTE(review): assumes the second cell is the vCPU count —
                # for GPU instance tables it may be the GPU count; confirm
                # against the page layout.
                num_cores = int(row.find_all('td')[1].text)
                return InstanceInfo(name, num_cores, memory_info.memory)
        return None

    instance_info_list = [parse_core(m) for m in memory_info_list]
    return [i for i in instance_info_list if i is not None]


def parse_memory_table(table: Tag) -> InstanceMemoryInfo:
    """Turn one memory table container into an ``InstanceMemoryInfo``.

    The instance name is the text of the container's ``div.title``; the
    memory is the ``yarn.nodemanager.resource.memory-mb`` value converted
    from megabytes to whole gigabytes (rounded down).

    Raises:
        AttributeError: if the table has no row for
            ``yarn.nodemanager.resource.memory-mb``.
    """
    title_div = table.find('div', attrs={'class': 'title'})
    instance_name = title_div.text

    row = extract_memory_row(table)
    if row is None:
        raise AttributeError(
            'Can not find `yarn.nodemanager.resource.memory-mb`'
        )

    # 1024 MB per GB; any fractional gigabyte is discarded.
    whole_gb = math.floor(extract_memory_mb(row) / 1024)
    return InstanceMemoryInfo(instance_name, whole_gb)


def extract_memory_row(table: Tag) -> Optional[Tag]:
    """Return the first ``<tr>`` of *table* that mentions the memory setting.

    A row matches when its string form contains
    ``yarn.nodemanager.resource.memory-mb``.  Returns ``None`` when no row
    matches.
    """
    wanted_key = 'yarn.nodemanager.resource.memory-mb'
    matching_rows = (
        tr for tr in table.find_all('tr') if wanted_key in str(tr)
    )
    return next(matching_rows, None)


def extract_memory_mb(row: Tag) -> int:
    """Return the integer held in the second ``<td>`` cell of *row*.

    Raises:
        IndexError: if the row has fewer than two ``<td>`` cells.
        ValueError: if the cell text is not an integer.
    """
    cells = row.find_all('td')
    value_cell = cells[1]
    return int(value_cell.text)


def main() -> None:
    """Scrape the AWS pages and print the instance mapping to stdout.

    Downloads the EMR task-configuration page (memory per instance type) and
    the EC2 instance-types page (core counts), combines them with ``scrape``
    and prints a dictionary literal with one
    ``'name': Instance(core, memory),`` line per instance type.

    Raises:
        requests.exceptions.HTTPError: if either page responds with an error
            status, so an error page is never parsed as if it were the data.
        requests.exceptions.Timeout: if a server does not respond in time.
    """
    # Without a timeout ``requests`` may wait indefinitely on a stalled server.
    timeout_sec = 30

    memory_page = requests.get(
        'https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html',  # noqa: E501
        timeout=timeout_sec,
    )
    memory_page.raise_for_status()
    memory_soup = BeautifulSoup(memory_page.text, 'html.parser')

    core_page = requests.get(
        'https://aws.amazon.com/ec2/instance-types/',
        timeout=timeout_sec,
    )
    core_page.raise_for_status()
    core_soup = BeautifulSoup(core_page.text, 'html.parser')

    instance_info_list = scrape(core_soup, memory_soup)

    # ``scrape`` already drops missing entries, so every item is printable.
    print('{')
    for info in instance_info_list:
        print(f'{info.as_dict_element()},')
    print('}')


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()