
Commit

update
radekosmulski committed Dec 8, 2022
1 parent 9df1677 commit 62d4ee0
Showing 1 changed file with 41 additions and 43 deletions.
84 changes: 41 additions & 43 deletions examples/02-Multi-GPU-Tensorflow-with-Horovod.ipynb
@@ -76,8 +76,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"downloading ml-25m.zip: 262MB [00:06, 43.0MB/s] \n",
"unzipping files: 100%|██████████| 8/8 [00:08<00:00, 1.12s/files]\n"
"downloading ml-25m.zip: 262MB [00:07, 36.0MB/s] \n",
"unzipping files: 100%|██████████| 8/8 [00:08<00:00, 1.08s/files]\n"
]
}
],
@@ -108,7 +108,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 4,
"id": "c65e5ef6",
"metadata": {},
"outputs": [],
@@ -144,7 +144,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 5,
"id": "9fbe17a7",
"metadata": {},
"outputs": [
@@ -186,8 +186,6 @@
"\n",
"DATA_PATH = '/workspace'\n",
"\n",
"\n",
"\n",
"dataset = Dataset(glob(DATA_PATH + f'/train_{hvd.local_rank()}.parquet'), part_size=\"100MB\")\n",
"loader = Loader(dataset, batch_size=64*1024)\n",
"\n",
@@ -233,7 +231,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 6,
"id": "ec5e9b7f",
"metadata": {
"scrolled": true
@@ -243,39 +241,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[1,1]<stderr>:2022-12-08 06:00:40.084035: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
"[1,1]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"[1,1]<stderr>:2022-12-08 06:00:40.160948: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
"[1,1]<stderr>:2022-12-08 06:00:40.161277: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory: -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:89:00.0, compute capability: 7.0\n",
"[1,0]<stderr>:2022-12-08 06:00:40.355181: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
"[1,0]<stderr>:2022-12-08 06:58:30.501381: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
"[1,0]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"[1,0]<stderr>:2022-12-08 06:00:40.461020: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
"[1,0]<stderr>:2022-12-08 06:00:40.461327: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory: -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:86:00.0, compute capability: 7.0\n",
"[1,1]<stderr>:2022-12-08 06:00:45.524363: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:\n",
"[1,1]<stderr>:type_id: TFT_OPTIONAL\n",
"[1,1]<stderr>:args {\n",
"[1,1]<stderr>: type_id: TFT_PRODUCT\n",
"[1,1]<stderr>: args {\n",
"[1,1]<stderr>: type_id: TFT_TENSOR\n",
"[1,1]<stderr>: args {\n",
"[1,1]<stderr>: type_id: TFT_BOOL\n",
"[1,1]<stderr>: }\n",
"[1,1]<stderr>: }\n",
"[1,1]<stderr>:}\n",
"[1,1]<stderr>: is neither a subtype nor a supertype of the combined inputs preceding it:\n",
"[1,1]<stderr>:type_id: TFT_OPTIONAL\n",
"[1,1]<stderr>:args {\n",
"[1,1]<stderr>: type_id: TFT_PRODUCT\n",
"[1,1]<stderr>: args {\n",
"[1,1]<stderr>: type_id: TFT_TENSOR\n",
"[1,1]<stderr>: args {\n",
"[1,1]<stderr>: type_id: TFT_LEGACY_VARIANT\n",
"[1,1]<stderr>: }\n",
"[1,1]<stderr>: }\n",
"[1,1]<stderr>:}\n",
"[1,1]<stderr>:\n",
"[1,1]<stderr>:\twhile inferring type of node 'mean_squared_error/cond/output/_11'\n",
"[1,0]<stderr>:2022-12-08 06:00:45.837188: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:\n",
"[1,0]<stderr>:2022-12-08 06:58:30.555187: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
"[1,0]<stderr>:2022-12-08 06:58:30.555454: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory: -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:85:00.0, compute capability: 7.0\n",
"[1,1]<stderr>:2022-12-08 06:58:30.575717: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n",
"[1,1]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n",
"[1,1]<stderr>:2022-12-08 06:58:30.632564: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n",
"[1,1]<stderr>:2022-12-08 06:58:30.632832: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 16255 MB memory: -> device: 0, name: Tesla V100-SXM2-32GB-LS, pci bus id: 0000:86:00.0, compute capability: 7.0\n",
"[1,0]<stderr>:2022-12-08 06:58:35.010671: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:\n",
"[1,0]<stderr>:type_id: TFT_OPTIONAL\n",
"[1,0]<stderr>:args {\n",
"[1,0]<stderr>: type_id: TFT_PRODUCT\n",
Expand All @@ -299,13 +273,37 @@
"[1,0]<stderr>:}\n",
"[1,0]<stderr>:\n",
"[1,0]<stderr>:\twhile inferring type of node 'mean_squared_error/cond/output/_11'\n",
" 6/191 [..............................] - ETA: 2s - loss: 13.6364 [1,0]<stderr>:/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!\n",
"[1,1]<stderr>:2022-12-08 06:58:35.218048: W tensorflow/core/common_runtime/forward_type_inference.cc:231] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:\n",
"[1,1]<stderr>:type_id: TFT_OPTIONAL\n",
"[1,1]<stderr>:args {\n",
"[1,1]<stderr>: type_id: TFT_PRODUCT\n",
"[1,1]<stderr>: args {\n",
"[1,1]<stderr>: type_id: TFT_TENSOR\n",
"[1,1]<stderr>: args {\n",
"[1,1]<stderr>: type_id: TFT_BOOL\n",
"[1,1]<stderr>: }\n",
"[1,1]<stderr>: }\n",
"[1,1]<stderr>:}\n",
"[1,1]<stderr>: is neither a subtype nor a supertype of the combined inputs preceding it:\n",
"[1,1]<stderr>:type_id: TFT_OPTIONAL\n",
"[1,1]<stderr>:args {\n",
"[1,1]<stderr>: type_id: TFT_PRODUCT\n",
"[1,1]<stderr>: args {\n",
"[1,1]<stderr>: type_id: TFT_TENSOR\n",
"[1,1]<stderr>: args {\n",
"[1,1]<stderr>: type_id: TFT_LEGACY_VARIANT\n",
"[1,1]<stderr>: }\n",
"[1,1]<stderr>: }\n",
"[1,1]<stderr>:}\n",
"[1,1]<stderr>:\n",
"[1,1]<stderr>:\twhile inferring type of node 'mean_squared_error/cond/output/_11'\n",
" 6/191 [..............................] - ETA: 2s - loss: 13.6433 [1,0]<stderr>:/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!\n",
"[1,0]<stderr>: warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n",
"[1,0]<stderr>:WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0096s vs `on_train_batch_end` time: 0.1717s). Check your callbacks.\n",
"[1,0]<stderr>:WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0094s vs `on_train_batch_end` time: 0.1490s). Check your callbacks.\n",
"[1,1]<stderr>:/usr/lib/python3/dist-packages/requests/__init__.py:89: RequestsDependencyWarning: urllib3 (1.26.12) or chardet (3.0.4) doesn't match a supported version!\n",
"[1,1]<stderr>: warnings.warn(\"urllib3 ({}) or chardet ({}) doesn't match a supported \"\n",
"[1,1]<stderr>:WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0093s vs `on_train_batch_end` time: 0.1719s). Check your callbacks.\n",
"191/191 [==============================] - 10s 14ms/step - loss: 3.3301stdout>[1,0]<stdout[1,0]<stdout[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>\n"
"[1,1]<stderr>:WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.0093s vs `on_train_batch_end` time: 0.1489s). Check your callbacks.\n",
"191/191 [==============================] - 8s 12ms/step - loss: 3.3301<stdout>[1,0]<stdout[1,0]<stdout[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>[1,0]<stdout>\n"
]
}
],
