From ed5c490eb82052c340c91292760c9b3355d6b174 Mon Sep 17 00:00:00 2001 From: Gabriele Castellano Date: Thu, 16 Jan 2025 03:33:53 +0100 Subject: [PATCH] describe behavior of -psn parameter for single plane topology generation (#66) --- .../topo/gen_HPN_7.0_topo_mulgpus_one_link.py | 2 +- docs/Tutorial.md | 21 ++++++++++++------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/astra-sim-alibabacloud/inputs/topo/gen_HPN_7.0_topo_mulgpus_one_link.py b/astra-sim-alibabacloud/inputs/topo/gen_HPN_7.0_topo_mulgpus_one_link.py index e750e3b..34aa041 100755 --- a/astra-sim-alibabacloud/inputs/topo/gen_HPN_7.0_topo_mulgpus_one_link.py +++ b/astra-sim-alibabacloud/inputs/topo/gen_HPN_7.0_topo_mulgpus_one_link.py @@ -374,7 +374,7 @@ def main(): parser.add_argument('-g','--gpu',type=int,default=32,help='gpus num,default 32') parser.add_argument('-gt','--gpu_type',type=str,default='H800',help='gpu_type,default H800') parser.add_argument('-gps','--gpu_per_server',type=int,default=8,help='gpu_per_server,default 8') - parser.add_argument('-psn','--psw_switch_num',type=int,default=120,help='psw_switch_num,default 120') + parser.add_argument('-psn','--psw_switch_num',type=int,default=120,help='psw_switch_num (halved in single plane case),default 120') parser.add_argument('-asn','--asw_switch_num',type=int,default=8,help='asw_switch_num,default 8') parser.add_argument('-nsps','--nv_switch_per_server',type=int,default=1,help='nv_switch_per_server,default 1') parser.add_argument('-npa','--nics_per_aswitch',type=int,default=128,help='nnics per asw,default 128') diff --git a/docs/Tutorial.md b/docs/Tutorial.md index ae9b084..9e8996a 100755 --- a/docs/Tutorial.md +++ b/docs/Tutorial.md @@ -156,7 +156,7 @@ Using the same workload as SimAI-Analytical, generated by [SimAI-WorkloadGenerat Before running SimAI-Simulator, you need to generate a `topo` file that can be recognized by `ns-3-alibabacloud`. 
-As shown in the figure below, the first row represents various parameters: `node_num` is the total number of nodes, `gpus_per_server` refers to the number of GPUs per server (currently, we bind each NIC to a GPU as a single node), `nvswitch_num` indicates the number of NVSwitch nodes (specifically used to implement the NVLS algorithm), `switch_num` is the number of switches, `link_num` is the total number of connections, and `gpu_type_str` describes the type of GPU. +The figure below shows an example of a topology file; the first row represents various parameters: `node_num` is the total number of nodes, `gpus_per_server` refers to the number of GPUs per server (currently, we bind each NIC to a GPU as a single node), `nvswitch_num` indicates the number of NVSwitch nodes (specifically used to implement the NVLS algorithm), `switch_num` is the number of switches, `link_num` is the total number of connections, and `gpu_type_str` describes the type of GPU. | Abbreviation | Description | |--------------------|-------------------------------------------------| @@ -173,13 +173,18 @@ python3 ./astra-sim-alibabacloud/inputs/topo/gen_HPN_7.0_topo_mulgpus_one_link.p ``` simai_topo -The `link_num` is `20` because each ASW (Aggregation Switch) is connected to a single PSW (Pod Switch, node 17). Since the topology uses `-psn 1` (single-plane topology), only one plane of PSWs is utilized, limiting the connections to `4` between ASWs and PSWs. - -To increase the `link_num` to `24`, you need to enable a dual-plane topology by setting `-ps`n to `2` in the command. This will activate both planes of PSWs, doubling the connections between ASWs and PSWs. 
For example: - -```bash -python3 ./astra-sim-alibabacloud/inputs/topo/gen_HPN_7.0_topo_mulgpus_one_link.py -g 8 -gt A100 -bw 400Gbps -nvbw 2400Gbps -psn 2 -``` +> 💡 **Important Note**: the generated file features a mismatch between `link_num` in the first line (i.e., `20`) and the number +> of links listed below it, which is `24`. Hence, the last `4` links will be ignored by the simulator. +> In particular, only half of the links between ASWs and the PSW are actually used. This happens because the topology +> generator script assumes that the `-psn` parameter refers to the total number of PSWs in a dual-plane +> topology; however, by default the script generates a single-plane topology, halving the links to the PSWs. +> +> Therefore, when generating single-plane topologies, please set the `-psn` parameter to twice the desired number of PSWs (e.g., to create one PSW, set it to `2`). This will enable +> all the generated links (in the example, it will increase `link_num` from `20` to `24`). +> +>```bash +>python3 ./astra-sim-alibabacloud/inputs/topo/gen_HPN_7.0_topo_mulgpus_one_link.py -g 8 -gt A100 -bw 400Gbps -nvbw 2400Gbps -psn 2 +>``` You can choose to customize any `topo` following the format shown above. Of course, we also provide a script to directly generate a `topo` for the HPN architecture.