From 19fd3c242ccbed1f81cb896541c4519fcef7b66e Mon Sep 17 00:00:00 2001
From: QuantumJaeYoo <jhyoocoderusa@gmail.com>
Date: Fri, 16 Jun 2023 23:30:15 +0000
Subject: [PATCH] Add cuQuantum accelerated 4 sim C++ ops + 1 adj C++ op

---
 WORKSPACE                                     |  33 +-
 configure.sh                                  | 130 ++-
 scripts/test_all.sh                           |  16 +-
 tensorflow_quantum/core/ops/BUILD             | 107 +-
 .../core/ops/tfq_adj_grad_op_cuquantum.cu.cc  | 342 +++++++
 .../core/ops/tfq_adj_grad_op_cuquantum.py     |  48 +
 .../ops/tfq_adj_grad_op_cuquantum_test.py     | 490 ++++++++++
 ...fq_simulate_expectation_op_cuquantum.cu.cc |  46 +-
 .../ops/tfq_simulate_ops_cuquantum_test.py    | 918 ++++++++++++++++++
 ...ate_sampled_expectation_op_cuquantum.cu.cc | 256 +++++
 .../tfq_simulate_samples_op_cuquantum.cu.cc   | 232 +++++
 .../ops/tfq_simulate_state_op_cuquantum.cu.cc | 217 +++++
 third_party/cuquantum/BUILD                   |   0
 third_party/cuquantum/BUILD.tpl               |  23 +
 third_party/cuquantum/cuquantum_configure.bzl | 246 +++++
 15 files changed, 2968 insertions(+), 136 deletions(-)
 create mode 100644 tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.cu.cc
 create mode 100644 tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.py
 create mode 100644 tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum_test.py
 create mode 100644 tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum_test.py
 create mode 100644 tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op_cuquantum.cu.cc
 create mode 100644 tensorflow_quantum/core/ops/tfq_simulate_samples_op_cuquantum.cu.cc
 create mode 100644 tensorflow_quantum/core/ops/tfq_simulate_state_op_cuquantum.cu.cc
 create mode 100644 third_party/cuquantum/BUILD
 create mode 100644 third_party/cuquantum/BUILD.tpl
 create mode 100644 third_party/cuquantum/cuquantum_configure.bzl

diff --git a/WORKSPACE b/WORKSPACE
index 6a29d598b..e28452a8d 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -24,19 +24,11 @@ cc_library(
         ],
 )
 
-# http_archive(
-#     name = "qsim",
-#     sha256 = "b9c1eba09a885a938b5e73dfc2e02f5231cf3b01d899415caa24769346a731d5",
-#     strip_prefix = "qsim-0.13.3",
-#     urls = ["https://github.com/quantumlib/qsim/archive/refs/tags/v0.13.3.zip"],
-# )
-
-# TODO: After merging this patch later into qsim mainstream, remove this and uncomment the above.
 http_archive(
     name = "qsim",
-    sha256 = "",
-    strip_prefix = "qsim-0.15.0-dev20230327_v3",
-    urls = ["https://github.com/jaeyoo/qsim/archive/refs/tags/v0.15.0+dev20230327_v3.tar.gz"],
+    sha256 = "f7f410a07543a51b254f7a5810b5153e196a4c7b4ec89dc8faf86f9c77eec97b",
+    strip_prefix = "qsim-0.16.1",
+    urls = ["https://github.com/quantumlib/qsim/archive/refs/tags/v0.16.1.zip"],
 )
 
 http_archive(
@@ -81,21 +73,6 @@ bind(
     actual = "@six_archive//:six",
 )
 
-new_local_repository(
-    name = "cuquantum_libs",
-    path = "/usr/local/google/home/jaeyoo/workspace/cuquantum-linux-x86_64-22.11.0.13-archive",
-    build_file_content = """
-cc_library(
-    name = "custatevec_headers",
-    srcs = ["include/custatevec.h"],
-    visibility = ["//visibility:public"],
-)
-
-cc_library(
-    name = "custatevec",
-    srcs = ["lib/libcustatevec.so"],
-    visibility = ["//visibility:public"],
-)
-""",
-)
+load("//third_party/cuquantum:cuquantum_configure.bzl", "cuquantum_configure")
 
+cuquantum_configure(name = "local_config_cuquantum")
diff --git a/configure.sh b/configure.sh
index 0ca4a0ae4..0ba9c9f63 100755
--- a/configure.sh
+++ b/configure.sh
@@ -20,11 +20,11 @@ function write_to_bazelrc() {
 }
 
 function write_action_env_to_bazelrc() {
-  write_to_bazelrc "build --action_env $1=\"$2\""
+  write_to_bazelrc "$1 --action_env $2=\"$3\""
 }
 
 function write_linkopt_dir_to_bazelrc() {
-  write_to_bazelrc "build --linkopt -Wl,-rpath,$1" >> .bazelrc
+  write_to_bazelrc "$1 --linkopt -Wl,-rpath,$2" >> .bazelrc
 }
 
 
@@ -49,48 +49,81 @@ function is_ppc64le() {
 # Remove .bazelrc if it already exist
 [ -e .bazelrc ] && rm .bazelrc
 
-# Check if we are building GPU or CPU ops, default CPU
-while [[ "$TF_NEED_CUDA" == "" ]]; do
-  read -p "Do you want to build ops again TensorFlow CPU pip package?"\
-" Y or enter for CPU (tensorflow-cpu), N for GPU (tensorflow). [Y/n] " INPUT
+# Ask whether to build TFQ for CPU only or with GPU (cuQuantum) support.
+while [[ "$TFQ_NEED_CUDA" == "" ]]; do
+  read -p "Do you want to build TFQ against CPU?"\
+" Y or enter for CPU, N for GPU. [Y/n] " INPUT
   case $INPUT in
-    [Yy]* ) echo "Build with CPU pip package."; TF_NEED_CUDA=0;;
-    [Nn]* ) echo "Build with GPU pip package."; TF_NEED_CUDA=1;;
-    "" ) echo "Build with CPU pip package."; TF_NEED_CUDA=0;;
+    [Yy]* ) echo "Build with CPU ops only."; TFQ_NEED_CUDA=0;;
+    [Nn]* ) echo "Build with cuQuantum support."; TFQ_NEED_CUDA=1;;
+    "" ) echo "Build with CPU ops only."; TFQ_NEED_CUDA=0;;
     * ) echo "Invalid selection: " $INPUT;;
   esac
 done
 
-while [[ "$TF_CUDA_VERSION" == "" ]]; do
-  read -p "Are you building against TensorFlow 2.11(including RCs) or newer?[Y/n] " INPUT
-  case $INPUT in
-    [Yy]* ) echo "Build against TensorFlow 2.11 or newer."; TF_CUDA_VERSION=11;;
-    [Nn]* ) echo "Build against TensorFlow <2.11."; TF_CUDA_VERSION=10.0;;
-    "" ) echo "Build against TensorFlow 2.11 or newer."; TF_CUDA_VERSION=11;;
-    * ) echo "Invalid selection: " $INPUT;;
-  esac
-done
+# Set the CUDA SDK version for TF
+if [[ "$TFQ_NEED_CUDA" == "1" ]]; then
+  _DEFAULT_CUDA_VERSION=11
+  while [[ "$TF_CUDA_VERSION" == "" ]]; do
+    read -p "Please specify the CUDA SDK major version you want to use. [Leave empty to default to CUDA $_DEFAULT_CUDA_VERSION]: " INPUT
+    case $INPUT in
+      "" ) echo "Build against CUDA $_DEFAULT_CUDA_VERSION."; TF_CUDA_VERSION=$_DEFAULT_CUDA_VERSION;;
+      # check if the input is a number
+      *[!0-9]* ) echo "Invalid selection: $INPUT";;
+      * ) echo "Build against CUDA $INPUT."; TF_CUDA_VERSION=$INPUT;;
+    esac
+  done
+fi
+
+# Enforce TensorFlow 2.11 or newer for every build; compare (major, minor) as integers, not as a float.
+TF_VERSION_TOO_OLD=$(python -c "import tensorflow as tf; print(tuple(int(p) for p in tf.__version__.split('.')[:2]) < (2, 11))")
+TF_VERSION=$(python -c "import tensorflow as tf; print(tf.__version__)")
+if [[ $TF_VERSION_TOO_OLD == "True" ]]; then
+  echo "Building against TensorFlow 2.11 or newer is required."
+  echo "Please upgrade your TensorFlow version."
+  exit 1
+elif [[ $TF_VERSION_TOO_OLD == "False" ]]; then
+  echo "Using TensorFlow $TF_VERSION"
+else
+  echo "Unable to determine TensorFlow version."
+  exit 1
+fi
 
+# When building cuQuantum ops, resolve CUQUANTUM_ROOT and record it in .bazelrc.
+if [[ "$TFQ_NEED_CUDA" == "1" ]]; then
+  if [[ "$CUQUANTUM_ROOT" != "" ]]; then
+    echo "  [*] cuQuantum library is detected here: CUQUANTUM_ROOT=$CUQUANTUM_ROOT."
+  else
+    # Prompt until a non-empty absolute path is entered. Dots are accepted because
+    # versioned directory names such as ".../cuquantum-22.11.0.13-archive" contain them.
+    while true; do
+      read -p "Please specify the cuQuantum root directory: " INPUT
+      if [[ -z "$INPUT" ]]; then
+        echo "Input cannot be empty. Please enter a valid path."
+      elif [[ "$INPUT" =~ ^(/[A-Za-z0-9._-]+)+$ ]]; then
+        echo "Path pattern is valid: $INPUT"
+        CUQUANTUM_ROOT=$INPUT
+        break
+      else
+        echo "Invalid path pattern: $INPUT. Please enter a valid path."
+      fi
+    done
+  fi
+  write_action_env_to_bazelrc "build:cuda" "CUQUANTUM_ROOT" "${CUQUANTUM_ROOT}"
+  write_linkopt_dir_to_bazelrc "build:cuda" "${CUQUANTUM_ROOT}/lib"
+fi
 
 # Check if it's installed
 if [[ $(pip show tensorflow) == *tensorflow* ]] || [[ $(pip show tf-nightly) == *tf-nightly* ]]; then
-  echo 'Using installed tensorflow'
+  echo "Using installed tensorflow-($TF_VERSION)"
 else
-  # Uninstall CPU version if it is installed.
-  if [[ $(pip show tensorflow-cpu) == *tensorflow-cpu* ]]; then
-    echo 'Already have tensorflow non-gpu installed. Uninstalling......\n'
-    pip uninstall tensorflow
-  elif [[ $(pip show tf-nightly-cpu) == *tf-nightly-cpu* ]]; then
-    echo 'Already have tensorflow non-gpu installed. Uninstalling......\n'
-    pip uninstall tf-nightly
-  fi
-  # Install GPU version
-  echo 'Installing tensorflow .....\n'
-  pip install tensorflow
+  echo 'Installing tensorflow 2.11 .....\n'
+  pip install tensorflow==2.11.0
 fi
 
 
 
+
 TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
 TF_LFLAGS="$(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))')"
 
@@ -101,7 +134,8 @@ write_to_bazelrc "build --strategy=Genrule=standalone"
 write_to_bazelrc "build -c opt"
 write_to_bazelrc "build --cxxopt=\"-D_GLIBCXX_USE_CXX11_ABI=1\""
 write_to_bazelrc "build --cxxopt=\"-std=c++17\""
-
+write_to_bazelrc "build --cxxopt=\"-O3\""
+write_to_bazelrc "build --cxxopt=\"-march=native\""
 
 if is_windows; then
   # Use pywrap_tensorflow instead of tensorflow_framework on Windows
@@ -127,31 +161,39 @@ if is_windows; then
   SHARED_LIBRARY_NAME=${SHARED_LIBRARY_NAME//\\//}
   HEADER_DIR=${HEADER_DIR//\\//}
 fi
-write_action_env_to_bazelrc "TF_HEADER_DIR" ${HEADER_DIR}
-write_action_env_to_bazelrc "TF_SHARED_LIBRARY_DIR" ${SHARED_LIBRARY_DIR}
-write_action_env_to_bazelrc "TF_SHARED_LIBRARY_NAME" ${SHARED_LIBRARY_NAME}
-write_action_env_to_bazelrc "TF_NEED_CUDA" ${TF_NEED_CUDA}
+
+TF_NEED_CUDA=${TFQ_NEED_CUDA}
+write_action_env_to_bazelrc "build" "TF_HEADER_DIR" "${HEADER_DIR}"
+write_action_env_to_bazelrc "build" "TF_SHARED_LIBRARY_DIR" "${SHARED_LIBRARY_DIR}"
+write_action_env_to_bazelrc "build" "TF_SHARED_LIBRARY_NAME" "${SHARED_LIBRARY_NAME}"
+write_action_env_to_bazelrc "build" "TF_NEED_CUDA" "${TF_NEED_CUDA}"
 
 if ! is_windows; then
-  write_linkopt_dir_to_bazelrc ${SHARED_LIBRARY_DIR}
+  write_linkopt_dir_to_bazelrc "build" "${SHARED_LIBRARY_DIR}"
 fi
 
 # TODO(yifeif): do not hardcode path
 if [[ "$TF_NEED_CUDA" == "1" ]]; then
-  write_to_bazelrc "build:cuda --define=using_cuda=true --define=using_cuda_nvcc=true"
+  write_to_bazelrc "build:cuda --experimental_repo_remote_exec"
+  write_to_bazelrc "build:cuda --spawn_strategy=standalone"
+  write_to_bazelrc "build:cuda --strategy=Genrule=standalone"
+  write_to_bazelrc "build:cuda -c opt"
+  write_to_bazelrc "build:cuda --cxxopt=\"-D_GLIBCXX_USE_CXX11_ABI=1\""
+  write_to_bazelrc "build:cuda --cxxopt=\"-std=c++17\""
+  write_to_bazelrc "build:cuda --cxxopt=\"-O3\""
+  write_to_bazelrc "build:cuda --cxxopt=\"-march=native\""
   write_to_bazelrc "build:cuda --@local_config_cuda//:enable_cuda"
   write_to_bazelrc "build:cuda --crosstool_top=@local_config_cuda//crosstool:toolchain"
 
-  write_action_env_to_bazelrc "TF_CUDA_VERSION" ${TF_CUDA_VERSION}
-  write_action_env_to_bazelrc "TF_CUDNN_VERSION" "8"
+  write_action_env_to_bazelrc "build:cuda" "TF_CUDA_VERSION" "${TF_CUDA_VERSION}"
+  write_action_env_to_bazelrc "build:cuda" "TF_CUDNN_VERSION" "8"
   if is_windows; then
-    write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"
-    write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"
+    write_action_env_to_bazelrc "build:cuda" "CUDNN_INSTALL_PATH" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"
+    write_action_env_to_bazelrc "build:cuda" "CUDA_TOOLKIT_PATH" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${TF_CUDA_VERSION}"
   else
-    write_action_env_to_bazelrc "CUDNN_INSTALL_PATH" "/usr/lib/x86_64-linux-gnu"
-    write_action_env_to_bazelrc "CUDA_TOOLKIT_PATH" "/usr/local/cuda"
+    write_action_env_to_bazelrc "build:cuda" "CUDNN_INSTALL_PATH" "/usr/lib/x86_64-linux-gnu"
+    write_action_env_to_bazelrc "build:cuda" "CUDA_TOOLKIT_PATH" "/usr/local/cuda"
   fi
   write_to_bazelrc "build --config=cuda"
   write_to_bazelrc "test --config=cuda"
 fi
-
diff --git a/scripts/test_all.sh b/scripts/test_all.sh
index 2795e0429..ffb43d42d 100755
--- a/scripts/test_all.sh
+++ b/scripts/test_all.sh
@@ -14,7 +14,21 @@
 # limitations under the License.
 # =============================================================================
 echo "Testing All Bazel py_test and cc_tests.";
-test_outputs=$(bazel test -c opt --experimental_repo_remote_exec --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=1" --cxxopt="-std=c++17" --cxxopt="-msse2" --cxxopt="-msse3" --cxxopt="-msse4" --notest_keep_going --test_output=errors //tensorflow_quantum/...)
+ENABLE_CUDA=${1}
+
+if [[ ${ENABLE_CUDA} == "gpu" ]]; then
+  echo "GPU mode. CUDA config is set."
+  CUDA_CONFIG="--config=cuda"
+  # Tests all including cuquantum ops.
+  TAG_FILTER=""
+else
+  echo "CPU mode."
+  CUDA_CONFIG=""
+  # Tests cpu only excluding cuquantum ops.
+  TAG_FILTER="--test_tag_filters=-cuquantum --build_tag_filters=-cuquantum"
+fi
+
+test_outputs=$(bazel test -c opt ${CUDA_CONFIG} ${TAG_FILTER} --experimental_repo_remote_exec --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=1" --cxxopt="-std=c++17" --cxxopt="-msse2" --cxxopt="-msse3" --cxxopt="-msse4" --test_output=errors //tensorflow_quantum/...)
 exit_code=$?
 if [ "$exit_code" == "0" ]; then
 	echo "Testing Complete!";
diff --git a/tensorflow_quantum/core/ops/BUILD b/tensorflow_quantum/core/ops/BUILD
index 84361cef1..6ce3e28ae 100644
--- a/tensorflow_quantum/core/ops/BUILD
+++ b/tensorflow_quantum/core/ops/BUILD
@@ -1,6 +1,5 @@
 # load op_wrapper
-load("@org_tensorflow//tensorflow:tensorflow.bzl", "tf_gpu_kernel_library", "tf_gen_op_wrapper_py")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured", "if_cuda")
+load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 
 package(default_visibility = ["//visibility:public"])
 
@@ -50,7 +49,6 @@ py_library(
         "//tensorflow_quantum/core/ops/math_ops:fidelity_op_py",
         "//tensorflow_quantum/core/ops/noise:noisy_expectation_op_py",
     ] + if_cuda_is_configured([
-        ":tfq_simulate_ops_cuda_py",
         ":tfq_simulate_ops_cuquantum_py",
     ]),
 )
@@ -641,19 +639,6 @@ py_test(
     ],
 )
 
-py_library(
-    name = "tfq_simulate_ops_cuda_py",
-    srcs = ["tfq_simulate_ops_cuda.py"],
-    data = [
-        ":_tfq_simulate_ops_cuda.so",
-    ],
-    srcs_version = "PY3",
-    deps = [
-        # tensorflow framework for wrappers
-        ":load_module",
-    ],
-)
-
 py_library(
     name = "tfq_simulate_ops_cuquantum_py",
     srcs = ["tfq_simulate_ops_cuquantum.py"],
@@ -668,21 +653,25 @@ py_library(
 )
 
 py_test(
-    name = "tfq_simulate_ops_gpu_test",
-    srcs = ["tfq_simulate_ops_gpu_test.py"],
+    name = "tfq_simulate_ops_cuquantum_test",
+    timeout = "long",
+    srcs = ["tfq_simulate_ops_cuquantum_test.py"],
     deps = [
-        ":tfq_simulate_ops_cuda_py",
         ":tfq_simulate_ops_cuquantum_py",
         ":tfq_simulate_ops_py",
         "//tensorflow_quantum/python:util",
     ],
     srcs_version = "PY3",
+    tags = ["cuquantum"],
 )
 
 cc_binary(
-    name = "_tfq_simulate_ops_cuda.so",
+    name = "_tfq_adj_grad_cuquantum.so",
     srcs = [
-        "tfq_simulate_expectation_op_cuda.cu.cc",
+        # Adjoint-gradient kernel only; the simulate kernels belong to the
+        # other `_tfq_simulate_ops_cuquantum.so` rule (names must be unique).
+        # NOTE(review): confirm `deps` provides adj_util for this source.
+        "tfq_adj_grad_op_cuquantum.cu.cc",
     ],
     linkshared = 1,
     features = select({
@@ -717,23 +706,25 @@ cc_binary(
             "/wd4577",
             "/DNOGDI",
             "/UTF_COMPILE_LIBRARY",
+            "/D__CUSTATEVEC__",
         ],
         "//conditions:default": [
             "-Iexternal/local_cuda/cuda/include",
-            # "--cuda-gpu-arch=sm_86",
-            # "-L/usr/local/cuda/lib64",
-            # "-lcudart_static",
-            # "-ldl",
-            # "-lrt",
             "-pthread",
             "-std=c++17",
             "-D_GLIBCXX_USE_CXX11_ABI=1",
             "-O3",
             "-Iexternal/cuda_headers",
             "-DNV_CUDNN_DISABLE_EXCEPTION",
-            # "-fpermissive",
         ],
-    }) + if_cuda_is_configured(["-DTENSORFLOW_USE_NVCC=1", "-DGOOGLE_CUDA=1", "-x cuda", "-nvcc_options=relaxed-constexpr", "-nvcc_options=ftz=true"]),
+    }) + if_cuda_is_configured([
+        "-DTENSORFLOW_USE_NVCC=1",
+        "-DGOOGLE_CUDA=1",
+        "-x cuda",
+        "-nvcc_options=relaxed-constexpr",
+        "-nvcc_options=ftz=true",
+        "-D__CUSTATEVEC__",
+    ]),
     deps = [
         # cirq cc proto
         "//tensorflow_quantum/core/ops:parse_context",
@@ -749,16 +740,20 @@ cc_binary(
         # tensorflow core protos
     ] + if_cuda_is_configured([
         ":cuda",
-        "@local_config_cuda//cuda:cuda_headers",
-        "@qsim//lib:qsim_cuda_lib",
+        "@local_config_cuquantum//:cuquantum_headers",
+        "@local_config_cuquantum//:libcuquantum",
+        "@qsim//lib:qsim_cuquantum_lib",
     ]),
-    # alwayslink=1,
+    tags = ["cuquantum"],
 )
 
 cc_binary(
     name = "_tfq_simulate_ops_cuquantum.so",
     srcs = [
         "tfq_simulate_expectation_op_cuquantum.cu.cc",
+        "tfq_simulate_sampled_expectation_op_cuquantum.cu.cc",
+        "tfq_simulate_samples_op_cuquantum.cu.cc",
+        "tfq_simulate_state_op_cuquantum.cu.cc",
     ],
     linkshared = 1,
     features = select({
@@ -793,23 +788,25 @@ cc_binary(
             "/wd4577",
             "/DNOGDI",
             "/UTF_COMPILE_LIBRARY",
+            "/D__CUSTATEVEC__",
         ],
         "//conditions:default": [
             "-Iexternal/local_cuda/cuda/include",
-            # "--cuda-gpu-arch=sm_86",
-            # "-L/usr/local/cuda/lib64",
-            # "-lcudart_static",
-            # "-ldl",
-            # "-lrt",
             "-pthread",
             "-std=c++17",
             "-D_GLIBCXX_USE_CXX11_ABI=1",
             "-O3",
             "-Iexternal/cuda_headers",
             "-DNV_CUDNN_DISABLE_EXCEPTION",
-            # "-fpermissive",
         ],
-    }) + if_cuda_is_configured(["-DTENSORFLOW_USE_NVCC=1", "-DGOOGLE_CUDA=1", "-x cuda", "-nvcc_options=relaxed-constexpr", "-nvcc_options=ftz=true"]),
+    }) + if_cuda_is_configured([
+        "-DTENSORFLOW_USE_NVCC=1",
+        "-DGOOGLE_CUDA=1",
+        "-x cuda",
+        "-nvcc_options=relaxed-constexpr",
+        "-nvcc_options=ftz=true",
+        "-D__CUSTATEVEC__",
+    ]),
     deps = [
         # cirq cc proto
         "//tensorflow_quantum/core/ops:parse_context",
@@ -825,12 +822,38 @@ cc_binary(
         # tensorflow core protos
     ] + if_cuda_is_configured([
         ":cuda",
-        "@cuquantum_libs//:custatevec",
-        "@cuquantum_libs//:custatevec_headers",
-        "@local_config_cuda//cuda:cuda_headers",
+        "@local_config_cuquantum//:cuquantum_headers",
+        "@local_config_cuquantum//:libcuquantum",
         "@qsim//lib:qsim_cuquantum_lib",
     ]),
-    # alwayslink=1,
+    tags = ["cuquantum"],
+)
+
+py_library(
+    name = "tfq_adj_grad_op_cuquantum_py",
+    srcs = ["tfq_adj_grad_op_cuquantum.py"],
+    data = [":_tfq_adj_grad_cuquantum.so"],
+    srcs_version = "PY3",
+    deps = [
+        ":load_module",
+        # pauli sum cc proto
+        # projector sum cc proto
+        # tensorflow framework for wrappers
+    ],
+    tags = ["cuquantum"],
+)
+
+py_test(
+    name = "tfq_adj_grad_op_cuquantum_test",
+    srcs = ["tfq_adj_grad_op_cuquantum_test.py"],
+    python_version = "PY3",
+    deps = [
+        ":tfq_adj_grad_op_cuquantum_py",
+        ":tfq_adj_grad_op_py", # for testing cpu vs gpu diff
+        "//tensorflow_quantum/python:util",
+    ],
+    srcs_version = "PY3",
+    tags = ["cuquantum"],
 )
 
 py_library(
diff --git a/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.cu.cc
new file mode 100644
index 000000000..55213c78b
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.cu.cc
@@ -0,0 +1,342 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <custatevec.h>
+
+#include <memory>
+#include <vector>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/seqfor.h"
+#include "../qsim/lib/simmux_gpu.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/pauli_sum.pb.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/adj_util.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+namespace {
+// TODO(jaeyoo): Temporary hack for BulkSetAmpl with cuda ops.
+// Update qsim's custatevec-side BulkSetAmpl ops, then remove these utilities.
+template <typename FP>
+__global__ void BulkSetAmplKernel(uint64_t mask, uint64_t bits, FP re, FP im,
+                                  bool exclude, FP* state) {
+  uint64_t k1 = uint64_t{blockIdx.x} * blockDim.x + threadIdx.x;
+
+  bool set = ((k1 & mask) == bits) ^ exclude;
+
+  if (set) {
+    state[2 * k1] = re;
+    state[2 * k1 + 1] = im;
+  }
+}
+
+// Sets state[i] = complex(re, im) where (i & mask) == bits.
+// if `exclude` is true then the criteria becomes (i & mask) != bits.
+template <typename fp_type>
+void BulkSetAmpl(qsim::SimulatorCuStateVec<float>::StateSpace::State& state,
+                 uint64_t mask, uint64_t bits, fp_type re, fp_type im,
+                 bool exclude = false) {
+  uint64_t size = uint64_t{1} << state.num_qubits();
+
+  unsigned threads = std::min(size, uint64_t{512});
+  unsigned blocks = size / threads;
+
+  BulkSetAmplKernel<<<blocks, threads>>>(mask, bits, re, im, exclude,
+                                         state.get());
+  cudaPeekAtLastError();
+  cudaDeviceSynchronize();
+}
+}  // namespace
+
+using ::tensorflow::Status;
+using ::tfq::proto::PauliSum;
+using ::tfq::proto::Program;
+
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+
+class TfqAdjointGradientCuquantumOp : public tensorflow::OpKernel {
+ public:
+  explicit TfqAdjointGradientCuquantumOp(
+      tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    // create handles for simulator
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqAdjointGradientCuquantumOp() {
+    // destroy handles in sync with simulator lifetime
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    const int num_inputs = context->num_inputs();
+    OP_REQUIRES(context, num_inputs == 5,
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Expected 5 inputs, got ", num_inputs, " inputs.")));
+
+    // Create the output Tensor.
+    const int output_dim_batch_size = context->input(0).dim_size(0);
+    const int output_dim_param_size = context->input(2).dim_size(1);
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_batch_size);
+    output_shape.AddDim(output_dim_param_size);
+
+    tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    auto output_tensor = output->matrix<float>();
+
+    // Parse program protos.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    std::vector<std::vector<PauliSum>> pauli_sums;
+    OP_REQUIRES_OK(context, GetProgramsAndNumQubits(context, &programs,
+                                                    &num_qubits, &pauli_sums));
+
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+
+    OP_REQUIRES(context, programs.size() == maps.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of circuits and symbol_values do not match. Got ",
+                    programs.size(), " circuits and ", maps.size(),
+                    " symbol values.")));
+
+    // Construct qsim circuits.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> full_fuse(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+    std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>
+        partial_fused_circuits(
+            programs.size(),
+            std::vector<std::vector<qsim::GateFused<QsimGate>>>({}));
+
+    // track metadata.
+    std::vector<std::vector<tfq::GateMetaData>> gate_meta(
+        programs.size(), std::vector<tfq::GateMetaData>({}));
+
+    // track gradients
+    std::vector<std::vector<GradientOfGate>> gradient_gates(
+        programs.size(), std::vector<GradientOfGate>({}));
+
+    Status parse_status = ::tensorflow::Status();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local = QsimCircuitFromProgram(programs[i], maps[i],
+                                              num_qubits[i], &qsim_circuits[i],
+                                              &full_fuse[i], &gate_meta[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+        CreateGradientCircuit(qsim_circuits[i], gate_meta[i],
+                              &partial_fused_circuits[i], &gradient_gates[i]);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    // Get downstream gradients.
+    std::vector<std::vector<float>> downstream_grads;
+    OP_REQUIRES_OK(context, GetPrevGrads(context, &downstream_grads));
+
+    OP_REQUIRES(context, downstream_grads.size() == programs.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of gradients and circuits do not match. Got ",
+                    downstream_grads.size(), " gradients and ", programs.size(),
+                    " circuits.")));
+
+    OP_REQUIRES(
+        context, context->input(4).dim_size(1) == context->input(3).dim_size(1),
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Number of gradients and pauli sum dimension do not match. Got ",
+            context->input(4).dim_size(1), " gradient entries and ",
+            context->input(3).dim_size(1), " paulis per circuit.")));
+
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    output_tensor.setZero();
+
+    ComputeLarge(num_qubits, qsim_circuits, maps, full_fuse,
+                 partial_fused_circuits, pauli_sums, gradient_gates,
+                 downstream_grads, context, &output_tensor);
+  }
+
+ private:
+  cublasHandle_t cublas_handle_;
+  custatevecHandle_t custatevec_handle_;
+
+  void ComputeLarge(
+      const std::vector<int>& num_qubits,
+      const std::vector<QsimCircuit>& qsim_circuits,
+      const std::vector<SymbolMap>& maps,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& full_fuse,
+      const std::vector<std::vector<std::vector<qsim::GateFused<QsimGate>>>>&
+          partial_fused_circuits,
+      const std::vector<std::vector<PauliSum>>& pauli_sums,
+      const std::vector<std::vector<tfq::GradientOfGate>>& gradient_gates,
+      const std::vector<std::vector<float>>& downstream_grads,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<float, 1>::Matrix* output_tensor) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCuStateVec<float>;
+    using StateSpace = Simulator::StateSpace;
+
+    // Begin simulation.
+    int largest_nq = 1;
+    Simulator sim = Simulator(cublas_handle_, custatevec_handle_);
+    StateSpace ss = StateSpace(cublas_handle_, custatevec_handle_);
+    auto sv = ss.Create(largest_nq);
+    auto scratch = ss.Create(largest_nq);
+    auto scratch2 = ss.Create(largest_nq);
+
+    for (size_t i = 0; i < partial_fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        scratch = ss.Create(largest_nq);
+        scratch2 = ss.Create(largest_nq);
+      }
+
+      // (#679) Just ignore empty program
+      if (qsim_circuits[i].gates.size() == 0) {
+        continue;
+      }
+
+      ss.SetStateZero(sv);
+      for (size_t j = 0; j < full_fuse[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, full_fuse[i][j], sv);
+      }
+
+      // sv now contains psi
+      // scratch contains (sum_j paulis_sums[i][j] * downstream_grads[j])|psi>
+      // scratch2 now contains psi as well.
+      [[maybe_unused]] Status unused = AccumulateOperators(
+          pauli_sums[i], downstream_grads[i], sim, ss, sv, scratch2, scratch);
+
+      for (int j = partial_fused_circuits[i].size() - 1; j >= 0; j--) {
+        for (int k = partial_fused_circuits[i][j].size() - 1; k >= 0; k--) {
+          ApplyFusedGateDagger(sim, partial_fused_circuits[i][j][k], sv);
+          ApplyFusedGateDagger(sim, partial_fused_circuits[i][j][k], scratch);
+        }
+        if (j == 0) {
+          // last layer will have no parametrized gates so can break.
+          break;
+        }
+
+        // Hit a parameterized gate.
+        // todo fix this copy.
+        auto cur_gate = qsim_circuits[i].gates[gradient_gates[i][j - 1].index];
+        ApplyGateDagger(sim, cur_gate, sv);
+
+        // if applicable compute control qubit mask and control value bits.
+        uint64_t mask = 0;
+        uint64_t cbits = 0;
+        for (size_t k = 0; k < cur_gate.controlled_by.size(); k++) {
+          uint64_t control_loc = cur_gate.controlled_by[k];
+          mask |= uint64_t{1} << control_loc;
+          cbits |= ((cur_gate.cmask >> k) & 1) << control_loc;
+        }
+
+        for (size_t k = 0; k < gradient_gates[i][j - 1].grad_gates.size();
+             k++) {
+          // Copy sv onto scratch2 in anticipation of non-unitary "gradient
+          // gate".
+          ss.Copy(sv, scratch2);
+          if (!cur_gate.controlled_by.empty()) {
+            // Gradient of controlled gates puts zeros on diagonal which is
+            // the same as collapsing the state and then applying the
+            // non-controlled version of the gradient gate.
+            BulkSetAmpl<float>(scratch2, mask, cbits, 0, 0, true);
+          }
+          qsim::ApplyGate(sim, gradient_gates[i][j - 1].grad_gates[k],
+                          scratch2);
+
+          // don't need not-found check since this is done upstream already.
+          const auto it = maps[i].find(gradient_gates[i][j - 1].params[k]);
+          const int loc = it->second.first;
+          // Apply finite differencing for adjoint gradients.
+          // Finite differencing enables applying multiple `gradient_gate`
+          // of a symbol at the same circuit. For analytic methods like
+          // parameter-shift we need to apply a single `gradient_gate`
+          // per a symbol.
+          (*output_tensor)(i, loc) += ss.RealInnerProduct(scratch2, scratch) +
+                                      ss.RealInnerProduct(scratch, scratch2);
+        }
+        ApplyGateDagger(sim, cur_gate, scratch);
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TfqAdjointGradientCuquantum").Device(tensorflow::DEVICE_CPU),
+    TfqAdjointGradientCuquantumOp);
+
+REGISTER_OP("TfqAdjointGradientCuquantum")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("pauli_sums: string")
+    .Input("downstream_grads: float")
+    .Output("grads: float")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      tensorflow::shape_inference::ShapeHandle pauli_sums_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &pauli_sums_shape));
+
+      tensorflow::shape_inference::ShapeHandle downstream_grads_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &downstream_grads_shape));
+
+      tensorflow::shape_inference::DimensionHandle output_rows =
+          c->Dim(programs_shape, 0);
+      tensorflow::shape_inference::DimensionHandle output_cols =
+          c->Dim(symbol_names_shape, 0);
+      c->set_output(0, c->Matrix(output_rows, output_cols));
+
+      return ::tensorflow::Status();
+    });
+
+}  // namespace tfq
diff --git a/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.py b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.py
new file mode 100644
index 000000000..e73775a45
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum.py
@@ -0,0 +1,48 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Module to register python op gradient."""
+import tensorflow as tf
+from tensorflow_quantum.core.ops.load_module import load_module
+
+SIM_OP_MODULE = load_module("_tfq_adj_grad_cuquantum.so")
+
+
+def tfq_adj_grad(programs, symbol_names, symbol_values, pauli_sums, prev_grad):
+    """Calculate adjoint gradients of circuit expectations with cuQuantum.
+
+    Args:
+        programs: `tf.Tensor` of strings with shape [batch_size] containing
+            the string representations of the circuits to be executed.
+        symbol_names: `tf.Tensor` of strings with shape [n_params], which
+            is used to specify the order in which the values in
+            `symbol_values` should be placed inside of the circuits in
+            `programs`.
+        symbol_values: `tf.Tensor` of real numbers with shape
+            [batch_size, n_params] specifying parameter values to resolve
+            into the circuits specified by programs, following the ordering
+            dictated by `symbol_names`. Cast to `tf.float32` before the call.
+        pauli_sums: `tf.Tensor` of strings with shape [batch_size, n_ops]
+            containing the string representation of the operators that will
+            be used on all of the circuits in the expectation calculations.
+        prev_grad: `tf.Tensor` of real numbers with shape [batch_size, n_ops]
+            holding the downstream gradients; cast to `tf.float32` as well.
+    Returns:
+        `tf.Tensor` with shape [batch_size, n_params] that holds the gradient of
+            expectation value for each circuit with each op applied to it
+            (after resolving the corresponding parameters in).
+    """
+    return SIM_OP_MODULE.tfq_adjoint_gradient_cuquantum(
+        programs, symbol_names, tf.cast(symbol_values, tf.float32), pauli_sums,
+        tf.cast(prev_grad, tf.float32))
diff --git a/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum_test.py b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum_test.py
new file mode 100644
index 000000000..262f81728
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_adj_grad_op_cuquantum_test.py
@@ -0,0 +1,490 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests that specifically target tfq_unitary_op."""
+# Remove PYTHONPATH collisions for protobuf.
+# pylint: disable=wrong-import-position
+import sys
+NEW_PATH = [x for x in sys.path if 'com_google_protobuf' not in x]
+sys.path = NEW_PATH
+# pylint: enable=wrong-import-position
+
+import time
+import numpy as np
+from absl.testing import parameterized
+import tensorflow as tf
+import cirq
+import sympy
+
+from tensorflow_quantum.python import util
+from tensorflow_quantum.core.ops import tfq_adj_grad_op
+from tensorflow_quantum.core.ops import tfq_adj_grad_op_cuquantum
+
+
+def measure_average_runtime(fn, tag, num_samples=10, result_avg=False):
+    """Measures the average wall-clock runtime of repeated calls to `fn`.
+
+    Args:
+        fn: Zero-argument callable to be timed.
+        tag: Label printed in front of the measured average time.
+        num_samples: Number of times `fn` is called; must be positive.
+        result_avg: If True, the results of all calls are averaged along
+            axis 0; otherwise the result of the last call is returned.
+
+    Returns:
+        A tuple of the average time in seconds and the (averaged) result.
+
+    Raises:
+        ValueError: If `num_samples` is smaller than 1.
+    """
+    if num_samples < 1:
+        raise ValueError("num_samples must be a positive integer.")
+    durations, results = [], []
+    for _ in range(num_samples):
+        # perf_counter is monotonic, unlike time.time().
+        begin_time = time.perf_counter()
+        result = fn()
+        durations.append(time.perf_counter() - begin_time)
+        if result_avg:
+            results.append(result)
+    avg_time = sum(durations) / num_samples
+    print(f"\n\t{tag} time: {avg_time}\n")
+    if result_avg:
+        result = np.average(results, axis=0)
+    return avg_time, result
+
+
+class ADJGradTest(tf.test.TestCase, parameterized.TestCase):
+    """Tests tfq_calculate_unitary."""
+
+    def test_calculate_adj_grad_cpu_vs_cuquantum(self):
+        """Make sure the cuQuantum op matches the CPU op and runs faster."""
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+        # Downstream gradients have shape [batch_size, n_ops], not n_params.
+        prev_grads = tf.ones([batch_size, pauli_sums_tensor.shape[1]])
+
+        cpu_avg_time, res_cpu = measure_average_runtime(
+            lambda: tfq_adj_grad_op.tfq_adj_grad(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor,
+                prev_grads),
+            "Adjoint CPU",
+            num_samples=10,
+            result_avg=True,
+        )
+
+        cuquantum_avg_time, res_cuquantum = measure_average_runtime(
+            lambda: tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor,
+                prev_grads),
+            "Adjoint cuQuantum",
+            num_samples=10,
+            result_avg=True,
+        )
+
+        # cuQuantum op should be faster than CPU op.
+        self.assertGreater(cpu_avg_time, cuquantum_avg_time)
+
+        # The results should agree within a tolerance.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=1e-4,
+                                   err_msg="""
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """)
+
+    def test_adj_grad_inputs(self):
+        """Make sure that the expectation op fails gracefully on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        upstream_grads = np.ones((batch_size, len(symbol_names)))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'programs must be rank 1'):
+            # Circuit tensor has too many dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor([circuit_batch]), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_names must be rank 1.'):
+            # symbol_names tensor has too many dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), np.array([symbol_names]),
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too many dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(np.array([symbol_values_array])),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too few dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array[0]),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'pauli_sums must be rank 2.'):
+            # pauli_sums tensor has too few dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor(list(pauli_sums)),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'pauli_sums must be rank 2.'):
+            # pauli_sums tensor has too many dimensions.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[[x]] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # circuit tensor has the right type but invalid values.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                ['junk'] * batch_size, symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type but invalid values.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), ['junk'],
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'qubits not found in circuit'):
+            # pauli_sums tensor has the right type but invalid values.
+            new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)]
+            new_pauli_sums = util.random_pauli_sums(new_qubits, 2, batch_size)
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in new_pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # pauli_sums tensor has the right type but invalid values 2.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                [['junk']] * batch_size, tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # circuits tensor has the wrong type.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                [1.0] * batch_size, symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), [0.1234],
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            # symbol_values tensor has the wrong type.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                [['junk']] * batch_size,
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # pauli_sums tensor has the wrong type.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array), [[1.0]] * batch_size,
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # we are missing an argument.
+            # pylint: disable=no-value-for-parameter
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                tf.convert_to_tensor(upstream_grads))
+            # pylint: enable=no-value-for-parameter
+
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads), [])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # wrong op size.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor([cirq.Circuit()]), symbol_names,
+                symbol_values_array.astype(np.float64),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='rank 2'):
+            # wrong grad shape.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor([upstream_grads]))
+
+        with self.assertRaisesRegex(
+                tf.errors.InvalidArgumentError,
+                expected_regex='gradients and circuits do not match'):
+            # wrong grad batch size.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor([[0 for i in range(len(symbol_names))]]))
+
+        with self.assertRaisesRegex(
+                tf.errors.InvalidArgumentError,
+                expected_regex='gradients and pauli sum dimension do not match'
+        ):
+            # wrong grad inner size.
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor([[0, 0] for _ in range(len(circuit_batch))
+                                     ]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='cirq.Channel'):
+            # attempting to use noisy circuit.
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+                util.convert_to_tensor([noisy_circuit for _ in circuit_batch]),
+                symbol_names, tf.convert_to_tensor(symbol_values_array),
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                tf.convert_to_tensor(upstream_grads))
+
+    def test_calculate_adj_grad_empty(self):
+        """Verify an empty circuit with no symbols gives a [1, 0] gradient."""
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor([cirq.Circuit()]),
+            tf.convert_to_tensor([], dtype=tf.dtypes.string),
+            tf.convert_to_tensor([[]]),
+            tf.convert_to_tensor([[]], dtype=tf.dtypes.string),
+            tf.convert_to_tensor([[]]))
+        self.assertShapeEqual(np.zeros((1, 0)), out)
+
+    def test_calculate_adj_grad_no_circuit(self):
+        """Verify that an empty batch of circuits gives a [0, 0] gradient."""
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            tf.raw_ops.Empty(shape=(0,), dtype=tf.string),
+            tf.raw_ops.Empty(shape=(0,), dtype=tf.string),
+            tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32),
+            tf.raw_ops.Empty(shape=(0, 0), dtype=tf.string),
+            tf.raw_ops.Empty(shape=(0, 0), dtype=tf.float32),
+        )
+        self.assertShapeEqual(np.zeros((0, 0)), out)
+
+    def test_calculate_adj_grad_simple_case(self):
+        """Make sure that adjoint gradient works on a simple input case."""
+        n_qubits = 2
+        batch_size = 1
+        symbol_names = ['alpha', 'beta']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+        [cirq.Circuit(cirq.X(qubits[0]) ** sympy.Symbol('alpha'),
+            cirq.Y(qubits[1]) ** sympy.Symbol('beta'),
+            cirq.CNOT(qubits[0], qubits[1]))], [{'alpha': 0.123, 'beta': 0.456}]
+
+        op_batch = [
+            [cirq.Z(qubits[0]), cirq.X(qubits[1])] for _ in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+        # Downstream gradients have shape [batch_size, n_ops], not n_params.
+        prev_grads = tf.ones([batch_size, len(op_batch[0])])
+
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor(circuit_batch),
+            tf.convert_to_tensor(symbol_names),
+            tf.convert_to_tensor(symbol_values_array),
+            util.convert_to_tensor(op_batch), prev_grads)
+
+        self.assertAllClose(out, np.array([[-1.18392, 0.43281]]), atol=1e-3)
+
+    def test_calculate_adj_grad_simple_case2(self):
+        """Make sure adjoint gradient works with a parameterized FSimGate."""
+        n_qubits = 2
+        batch_size = 1
+        symbol_names = ['alpha', 'beta', 'gamma']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+        [cirq.Circuit(cirq.X(qubits[0]) ** sympy.Symbol('alpha'),
+            cirq.Y(qubits[1]) ** sympy.Symbol('beta'),
+            cirq.CNOT(qubits[0], qubits[1]),
+            cirq.FSimGate(sympy.Symbol('gamma'), 0.5)(qubits[0], qubits[1]))
+        ], [{'alpha': 0.123, 'beta': 0.456, 'gamma': 0.789}]
+
+        op_batch = [
+            [cirq.Z(qubits[0]), cirq.X(qubits[1])] for _ in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+        # One downstream gradient per op: shape [batch_size, n_ops].
+        prev_grads = tf.ones([batch_size, len(op_batch[0])])
+
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor(circuit_batch),
+            tf.convert_to_tensor(symbol_names),
+            tf.convert_to_tensor(symbol_values_array),
+            util.convert_to_tensor(op_batch), prev_grads)
+
+        self.assertAllClose(out,
+                            np.array([[-2.100, -1.7412, -1.5120]]),
+                            atol=1e-3)
+
+    def test_calculate_adj_grad_simple_case_shared(self):
+        """Make sure adjoint gradient works when one gate reuses a symbol."""
+        n_qubits = 2
+        batch_size = 1
+        symbol_names = ['alpha', 'beta', 'gamma']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+        [cirq.Circuit(cirq.X(qubits[0]) ** sympy.Symbol('alpha'),
+            cirq.Y(qubits[1]) ** sympy.Symbol('beta'),
+            cirq.CNOT(qubits[0], qubits[1]),
+            cirq.FSimGate(
+                sympy.Symbol('gamma'),
+                sympy.Symbol('gamma'))(qubits[0], qubits[1]))
+        ], [{'alpha': 0.123, 'beta': 0.456, 'gamma': 0.789}]
+
+        op_batch = [
+            [cirq.Z(qubits[0]), cirq.X(qubits[1])] for _ in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+        # One downstream gradient per op: shape [batch_size, n_ops].
+        prev_grads = tf.ones([batch_size, len(op_batch[0])])
+
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor(circuit_batch),
+            tf.convert_to_tensor(symbol_names),
+            tf.convert_to_tensor(symbol_values_array),
+            util.convert_to_tensor(op_batch), prev_grads)
+
+        self.assertAllClose(out,
+                            np.array([[-2.3484, -1.7532, -1.64264]]),
+                            atol=1e-3)
+
+    def test_calculate_adj_grad_simple_case_single(self):
+        """Make sure adjoint gradient works when all gates share one symbol."""
+        n_qubits = 2
+        batch_size = 1
+        symbol_names = ['alpha', 'beta', 'gamma']
+        qubits = cirq.LineQubit.range(n_qubits)
+        circuit_batch, resolver_batch = \
+        [cirq.Circuit(cirq.X(qubits[0]) ** sympy.Symbol('alpha'),
+            cirq.Y(qubits[1]) ** sympy.Symbol('alpha'),
+            cirq.CNOT(qubits[0], qubits[1]),
+            cirq.FSimGate(
+                -0.56,
+                sympy.Symbol('alpha'))(qubits[0], qubits[1]))
+        ], [{'alpha': 0.123, 'beta': 0.456, 'gamma': 0.789}]
+
+        op_batch = [
+            [cirq.Z(qubits[0]), cirq.X(qubits[1])] for _ in range(batch_size)
+        ]
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        prev_grads = tf.ones([batch_size, len(op_batch[0])])
+
+        out = tfq_adj_grad_op_cuquantum.tfq_adj_grad(
+            util.convert_to_tensor(circuit_batch),
+            tf.convert_to_tensor(symbol_names),
+            tf.convert_to_tensor(symbol_values_array),
+            util.convert_to_tensor(op_batch), prev_grads)
+        # beta and gamma are absent from the circuit, so their gradients are 0.
+        self.assertAllClose(out, np.array([[1.2993, 0, 0]]), atol=1e-3)
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc
index 6c9f03eb7..82d6939fc 100644
--- a/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc
+++ b/tensorflow_quantum/core/ops/tfq_simulate_expectation_op_cuquantum.cu.cc
@@ -1,8 +1,11 @@
 /* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
     http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -10,19 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <memory>
-#include <vector>
+
+#include <custatevec.h>
 
 #include <chrono>
+#include <memory>
+#include <vector>
 
-#include "../cuquantum_libs/include/custatevec.h"
 #include "../qsim/lib/circuit.h"
 #include "../qsim/lib/gate_appl.h"
 #include "../qsim/lib/gates_cirq.h"
 #include "../qsim/lib/gates_qsim.h"
-#include "../qsim/lib/seqfor.h"
-#include "../qsim/lib/simulator_custatevec.h"
-#include "../qsim/lib/statespace_custatevec.h"
+#include "../qsim/lib/simmux_gpu.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -48,7 +50,17 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
  public:
   explicit TfqSimulateExpectationOpCuQuantum(
       tensorflow::OpKernelConstruction* context)
-      : OpKernel(context) {}
+      : OpKernel(context) {
+    // Creates the cuBLAS and cuStateVec handles once per kernel instance.
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqSimulateExpectationOpCuQuantum() {
+    // Destroys the handles together with the op kernel.
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
 
   void Compute(tensorflow::OpKernelContext* context) override {
     // TODO (mbbrough): add more dimension checks for other inputs here.
@@ -93,7 +105,7 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
     std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
         programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
 
-    Status parse_status = Status::OK();
+    Status parse_status = ::tensorflow::Status();
     auto p_lock = tensorflow::mutex();
     auto construct_f = [&](int start, int end) {
       for (int i = start; i < end; i++) {
@@ -114,16 +126,8 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
       max_num_qubits = std::max(max_num_qubits, num);
     }
 
-    // create handles for simulator
-    cublasCreate(&cublas_handle_);
-    custatevecCreate(&custatevec_handle_);
-
     ComputeLarge(num_qubits, fused_circuits, pauli_sums, context,
                  &output_tensor);
-
-    // destroy handles in sync with simulator lifetime
-    cublasDestroy(cublas_handle_);
-    custatevecDestroy(custatevec_handle_);
   }
 
  private:
@@ -153,7 +157,7 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
     // Simulate programs one by one. Parallelizing over state vectors
     // we no longer parallelize over circuits. Each time we encounter a
     // a larger circuit we will grow the Statevector as necessary.
-    for (int i = 0; i < fused_circuits.size(); i++) {
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
       int nq = num_qubits[i];
 
       if (nq > largest_nq) {
@@ -166,10 +170,10 @@ class TfqSimulateExpectationOpCuQuantum : public tensorflow::OpKernel {
       //  the state if there is a possibility that circuit[i] and
       //  circuit[i + 1] produce the same state.
       ss.SetStateZero(sv);
-      for (int j = 0; j < fused_circuits[i].size(); j++) {
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
         qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
       }
-      for (int j = 0; j < pauli_sums[i].size(); j++) {
+      for (size_t j = 0; j < pauli_sums[i].size(); j++) {
         // (#679) Just ignore empty program
         if (fused_circuits[i].size() == 0) {
           (*output_tensor)(i, j) = -2.0;
@@ -214,7 +218,7 @@ REGISTER_OP("TfqSimulateExpectationCuquantum")
           c->Dim(pauli_sums_shape, 1);
       c->set_output(0, c->Matrix(output_rows, output_cols));
 
-      return tensorflow::Status::OK();
+      return ::tensorflow::Status();
     });
 
-}  // namespace tfq
+}  // namespace tfq
\ No newline at end of file
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum_test.py b/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum_test.py
new file mode 100644
index 000000000..f31f7241c
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_simulate_ops_cuquantum_test.py
@@ -0,0 +1,918 @@
+# Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests that specifically target tfq_simulate_ops_cuquantum."""
+import time
+import numpy as np
+from absl.testing import parameterized
+import tensorflow as tf
+import cirq
+
+from tensorflow_quantum.core.ops import tfq_simulate_ops
+from tensorflow_quantum.core.ops import tfq_simulate_ops_cuquantum
+from tensorflow_quantum.python import util
+
+
+def measure_average_runtime(
+        fn,
+        tag,
+        num_samples=10,
+        result_avg=False,
+):
+    """Calls `fn` repeatedly and reports its mean wall-clock runtime.
+
+    Args:
+        fn: Zero-argument callable to time; it is invoked as `fn()`.
+        tag: Label printed in front of the measured average time.
+        num_samples: Number of timed calls; must be >= 1 (used as a divisor).
+        result_avg: If True, average all call results over axis 0.
+
+    Returns:
+        (avg_time, result); result is the last call's output unless averaged.
+    """
+    avg_time = []
+    avg_res = []
+    for _ in range(num_samples):
+        begin_time = time.time()
+        result = fn()
+        duration = time.time() - begin_time
+        avg_time.append(duration)
+        if result_avg:
+            avg_res.append(result)
+    avg_time = sum(avg_time) / float(num_samples)
+    print(f"\n\t{tag} time: {avg_time}\n")
+    if result_avg:
+        result = np.average(avg_res, axis=0)
+    return avg_time, result
+
+
+class SimulateExpectationCuquantumTest(tf.test.TestCase):
+    """Tests the cuQuantum-backed tfq_simulate_expectation op."""
+
+    def test_simulate_expectation_cpu_vs_cuquantum(self):
+        """Make sure that cpu & gpu(cuquantum) ops have the same results."""
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        _, res_cpu = measure_average_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_expectation(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor),
+            "Expectation CPU",
+            num_samples=100,
+        )
+
+        _, res_cuquantum = measure_average_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor),
+            "Expectation cuQuantum",
+            num_samples=100,
+        )
+
+        # The results should be similar within a tolerance.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=1e-4,
+                                   err_msg="""
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """)
+
+    def test_simulate_expectation_inputs(self):
+        """Make sure that the expectation op fails gracefully on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'programs must be rank 1'):
+            # Circuit tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor([circuit_batch]), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_names must be rank 1.'):
+            # symbol_names tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), np.array([symbol_names]),
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                np.array([symbol_values_array]),
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too few dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[0],
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'pauli_sums must be rank 2.'):
+            # pauli_sums tensor has too few dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, util.convert_to_tensor(list(pauli_sums)))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'pauli_sums must be rank 2.'):
+            # pauli_sums tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[[x]] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # circuit tensor has the right type but invalid values.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                ['junk'] * batch_size, symbol_names, symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type but invalid values.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), ['junk'],
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'qubits not found in circuit'):
+            # pauli_sums tensor has the right type but invalid values.
+            new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)]
+            new_pauli_sums = util.random_pauli_sums(new_qubits, 2, batch_size)
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in new_pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # pauli_sums tensor has the right type but invalid values 2.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [['junk']] * batch_size)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # circuits tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                [1.0] * batch_size, symbol_names, symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), [0.1234],
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            # symbol_values tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                [['junk']] * batch_size,
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # pauli_sums tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [[1.0]] * batch_size)
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # we are missing an argument.
+            # pylint: disable=no-value-for-parameter
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array)
+            # pylint: enable=no-value-for-parameter
+
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), [])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # pauli_sums batch size does not match the number of programs.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums
+                                       ][:int(batch_size * 0.5)]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # wrong symbol_values size.
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[:int(batch_size * 0.5)],
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='cirq.Channel'):
+            # attempting to use noisy circuit.
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+                util.convert_to_tensor([noisy_circuit for _ in pauli_sums]),
+                symbol_names, symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]))
+
+        res = tfq_simulate_ops_cuquantum.tfq_simulate_expectation(
+            util.convert_to_tensor([cirq.Circuit() for _ in pauli_sums]),
+            symbol_names, symbol_values_array.astype(np.float64),
+            util.convert_to_tensor([[x] for x in pauli_sums]))
+        self.assertDTypeEqual(res, np.float32)
+
+
+class SimulateSampledExpectationCuquantumTest(tf.test.TestCase):
+    """Tests the cuQuantum-backed tfq_simulate_sampled_expectation op."""
+
+    def test_simulate_sampled_expectation_cpu_vs_cuquantum(self):
+        """Make sure that cpu & gpu(cuquantum) ops have the same results."""
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        n_samples = [[10000]] * batch_size
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        pauli_sums_tensor = util.convert_to_tensor([[x] for x in pauli_sums])
+
+        _, res_cpu = measure_average_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_sampled_expectation(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor,
+                n_samples),
+            "SampledExpectation CPU",
+            num_samples=10,
+            result_avg=False,
+        )
+
+        _, res_cuquantum = measure_average_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), pauli_sums_tensor,
+                n_samples),
+            "SampledExpectation cuQuantum",
+            num_samples=10,
+            result_avg=False,
+        )
+
+        # NOTE: runtimes are only printed above; speed is not asserted here.
+
+        # The results should be similar within a tolerance.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=0.07,
+                                   err_msg="""
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """)
+
+    def test_simulate_sampled_expectation_inputs(self):
+        """Make sure sampled expectation op fails gracefully on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        pauli_sums = util.random_pauli_sums(qubits, 3, batch_size)
+        num_samples = [[10]] * batch_size
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'programs must be rank 1'):
+            # Circuit tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor([circuit_batch]), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_names must be rank 1.'):
+            # symbol_names tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), np.array([symbol_names]),
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                np.array([symbol_values_array]),
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2.'):
+            # symbol_values_array tensor has too few dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[0],
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'pauli_sums must be rank 2.'):
+            # pauli_sums tensor has too few dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch),
+                symbol_names, symbol_values_array,
+                util.convert_to_tensor(list(pauli_sums)), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'pauli_sums must be rank 2.'):
+            # pauli_sums tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                [util.convert_to_tensor([[x] for x in pauli_sums])],
+                num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'num_samples must be rank 2'):
+            # num_samples tensor has too many dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'num_samples must be rank 2'):
+            # num_samples tensor has too few dimensions.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                num_samples[0])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # circuit tensor has the right type but invalid values.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                ['junk'] * batch_size, symbol_names, symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type but invalid values.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), ['junk'],
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'qubits not found in circuit'):
+            # pauli_sums tensor has the right type but invalid values.
+            new_qubits = [cirq.GridQubit(5, 5), cirq.GridQubit(9, 9)]
+            new_pauli_sums = util.random_pauli_sums(new_qubits, 2, batch_size)
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in new_pauli_sums]),
+                num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # pauli_sums tensor has the right type but invalid values 2.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [['junk']] * batch_size, num_samples)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # circuits tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                [1.0] * batch_size, symbol_names, symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), [0.1234],
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            # symbol_values tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                [['junk']] * batch_size,
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # pauli_sums tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [[1.0]] * batch_size, num_samples)
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # we are missing an argument.
+            # pylint: disable=no-value-for-parameter
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, num_samples)
+            # pylint: enable=no-value-for-parameter
+
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), [],
+                num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # programs batch size does not match the other inputs.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor([cirq.Circuit()]), symbol_names,
+                symbol_values_array.astype(np.float64),
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'greater than 0'):
+            # num_samples entries are negative.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]),
+                [[-1]] * batch_size)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # wrong symbol_values size.
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[:int(batch_size * 0.5)],
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='cirq.Channel'):
+            # attempting to use noisy circuit.
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            tfq_simulate_ops_cuquantum.tfq_simulate_sampled_expectation(
+                util.convert_to_tensor([noisy_circuit for _ in pauli_sums]),
+                symbol_names, symbol_values_array,
+                util.convert_to_tensor([[x] for x in pauli_sums]), num_samples)
+
+
+class SimulateSamplesCuquantumTest(tf.test.TestCase, parameterized.TestCase):
+    """Tests tfq_simulate_samples."""
+
+    def test_simulate_samples_cpu_vs_cuquantum(self):
+        """Make sure that cpu & gpu(cuquantum) ops have the same results."""
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        n_samples = [100]
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        _, res_cpu = measure_average_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_samples(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), n_samples),
+            "Samples CPU",
+            num_samples=10,
+            result_avg=False,
+        )
+
+        _, res_cuquantum = measure_average_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64), n_samples),
+            "Samples cuQuantum",
+            num_samples=10,
+            result_avg=False,
+        )
+
+        # NOTE: runtimes are only printed above; speed is not asserted here.
+
+        res_cpu = np.average(res_cpu, axis=1)
+        res_cuquantum = np.average(res_cuquantum, axis=1)
+
+        # The results should be similar within a tolerance.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=0.3,
+                                   err_msg="""
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """)
+
+    def test_simulate_samples_inputs(self):
+        """Make sure the sample op fails gracefully on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        num_samples = 10
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 1. Got rank 2'):
+            # programs tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor([circuit_batch]), symbol_names,
+                symbol_values_array, [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 1. Got rank 2'):
+            # symbol_names tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), np.array([symbol_names]),
+                symbol_values_array, [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 2. Got rank 3'):
+            # symbol_values tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                np.array([symbol_values_array]), [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 2. Got rank 1'):
+            # symbol_values tensor has the wrong shape 2.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[0], [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'rank 1. Got rank 2'):
+            # num_samples tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [[num_samples]])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # programs tensor has the right type, but invalid value.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(\
+                                                  ['junk'] * batch_size,
+                                                  symbol_names,
+                                                  symbol_values_array,
+                                                  [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type, but invalid value.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), ['junk'],
+                symbol_values_array, [num_samples])
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # programs tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples([1] * batch_size,
+                                                            symbol_names,
+                                                            symbol_values_array,
+                                                            [num_samples])
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), [1], symbol_values_array,
+                [num_samples])
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError,
+                                    'Cast string to float is not supported'):
+            # symbol_values tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                [['junk']] * batch_size, [num_samples])
+
+        with self.assertRaisesRegex(Exception, 'junk'):
+            # num_samples tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, ['junk'])
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # too few tensors.
+            # pylint: disable=no-value-for-parameter
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array)
+            # pylint: enable=no-value-for-parameter
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # wrong symbol_values size.
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[:int(batch_size * 0.5)], num_samples)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='cirq.Channel'):
+            # attempting to use noisy circuit.
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+                util.convert_to_tensor([noisy_circuit for _ in circuit_batch]),
+                symbol_names, symbol_values_array, [num_samples])
+
+    @parameterized.parameters([
+        {
+            'all_n_qubits': [2, 3],
+            'n_samples': 10
+        },
+        {
+            'all_n_qubits': [1, 5, 8],
+            'n_samples': 10
+        },
+    ])
+    def test_sampling_output_padding(self, all_n_qubits, n_samples):
+        """Check that shorter circuits are left-padded with -2 in samples."""
+        widest = max(all_n_qubits)
+        circuits = []
+        expected_outputs = []
+        for n_qubits in all_n_qubits:
+            n_pad = widest - n_qubits
+            # Expected: 1 for each qubit (all get an X gate), -2 for padding.
+            expected = np.ones((n_samples, widest))
+            expected[:, :n_pad] = -2
+            expected_outputs.append(expected)
+            qubits = cirq.GridQubit.rect(1, n_qubits)
+            circuits.append(cirq.Circuit(*cirq.X.on_each(*qubits)))
+        results = tfq_simulate_ops_cuquantum.tfq_simulate_samples(
+            util.convert_to_tensor(circuits), [], [[]] * len(circuits),
+            [n_samples]).numpy()
+        self.assertAllClose(expected_outputs, results)
+
+
+class SimulateStateCuquantumTest(tf.test.TestCase, parameterized.TestCase):
+    """Tests tfq_simulate_state."""
+
+    def test_simulate_state_cpu_vs_cuquantum(self):
+        """Make sure that cpu & gpu(cuquantum) ops have the same results."""
+        n_qubits = 20
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        circuit_batch_tensor = util.convert_to_tensor(circuit_batch)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        _, res_cpu = measure_average_runtime(
+            lambda: tfq_simulate_ops.tfq_simulate_state(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64)),
+            "State CPU",
+            num_samples=10,
+        )
+
+        _, res_cuquantum = measure_average_runtime(
+            lambda: tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                circuit_batch_tensor, symbol_names,
+                symbol_values_array.astype(np.float64)),
+            "State cuQuantum",
+            num_samples=10,
+        )
+
+        # No runtime assertion is made here; only the results are compared.
+
+        # The results should be similar within a tolerance.
+        np.testing.assert_allclose(res_cpu,
+                                   res_cuquantum,
+                                   atol=1e-4,
+                                   err_msg="""
+        # If failed, the GPU architecture in this system may be unsupported.
+        # Please refer to the supported architectures here.
+        # https://docs.nvidia.com/cuda/cuquantum/getting_started.html#custatevec
+        """)
+
+    def test_simulate_state_inputs(self):
+        """Make sure the state op fails gracefully on bad inputs."""
+        n_qubits = 5
+        batch_size = 5
+        symbol_names = ['alpha']
+        qubits = cirq.GridQubit.rect(1, n_qubits)
+        circuit_batch, resolver_batch = \
+            util.random_symbol_circuit_resolver_batch(
+                qubits, symbol_names, batch_size)
+
+        symbol_values_array = np.array(
+            [[resolver[symbol]
+              for symbol in symbol_names]
+             for resolver in resolver_batch])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'programs must be rank 1'):
+            # programs tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor([circuit_batch]), symbol_names,
+                symbol_values_array)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_names must be rank 1'):
+            # symbol_names tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), np.array([symbol_names]),
+                symbol_values_array)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2'):
+            # symbol_values tensor has the wrong shape.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                np.array([symbol_values_array]))
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'symbol_values must be rank 2'):
+            # symbol_values tensor has the wrong shape 2.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[0])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Unparseable proto'):
+            # programs tensor has the right type, but invalid value.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(['junk'] * batch_size,
+                                                          symbol_names,
+                                                          symbol_values_array)
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    'Could not find symbol in parameter map'):
+            # symbol_names tensor has the right type, but invalid value.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), ['junk'],
+                symbol_values_array)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # programs tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state([1] * batch_size,
+                                                          symbol_names,
+                                                          symbol_values_array)
+
+        with self.assertRaisesRegex(TypeError, 'Cannot convert'):
+            # symbol_names tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), [1], symbol_values_array)
+
+        with self.assertRaisesRegex(tf.errors.UnimplementedError, ''):
+            # symbol_values tensor has the wrong type.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                [['junk']] * batch_size)
+
+        with self.assertRaisesRegex(TypeError, 'missing'):
+            # too few tensors.
+            # pylint: disable=no-value-for-parameter
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names)
+            # pylint: enable=no-value-for-parameter
+
+        # TODO (mbbrough): determine if we should allow extra arguments ?
+        with self.assertRaisesRegex(TypeError, 'positional arguments'):
+            # pylint: disable=too-many-function-args
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array, [])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='do not match'):
+            # wrong symbol_values size.
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor(circuit_batch), symbol_names,
+                symbol_values_array[:int(batch_size * 0.5)])
+
+        with self.assertRaisesRegex(tf.errors.InvalidArgumentError,
+                                    expected_regex='cirq.Channel'):
+            # attempting to use noisy circuit.
+            noisy_circuit = cirq.Circuit(cirq.depolarize(0.3).on_each(*qubits))
+            tfq_simulate_ops_cuquantum.tfq_simulate_state(
+                util.convert_to_tensor([noisy_circuit for _ in circuit_batch]),
+                symbol_names, symbol_values_array)
+
+    @parameterized.parameters([
+        {
+            'all_n_qubits': [2, 3]
+        },
+        {
+            'all_n_qubits': [1, 5, 8]
+        },
+    ])
+    def test_simulate_state_output_padding(self, all_n_qubits):
+        """If a tfq_simulate op is asked to simulate states given circuits
+        acting on different numbers of qubits, the op should return a tensor
+        padded up to the size of the largest circuit. The expected tensor
+        built below keeps each circuit's own amplitudes first and fills the
+        remaining entries with -2. """
+        circuit_batch = []
+        for n_qubits in all_n_qubits:
+            qubits = cirq.GridQubit.rect(1, n_qubits)
+            circuit_batch += util.random_circuit_resolver_batch(qubits, 1)[0]
+
+        tfq_results = tfq_simulate_ops_cuquantum.tfq_simulate_state(
+            util.convert_to_tensor(circuit_batch), [],
+            [[]] * len(circuit_batch))
+
+        # Don't use batch_util here to enforce consistent padding everywhere
+        # without extra tests.
+        sim = cirq.Simulator()
+        manual_padded_results = []
+        for circuit in circuit_batch:
+            result = sim.simulate(circuit)
+            wf = result.final_state_vector
+            blank_state = np.ones(
+                (2**max(all_n_qubits)), dtype=np.complex64) * -2
+            blank_state[:wf.shape[0]] = wf
+            manual_padded_results.append(blank_state)
+
+        self.assertAllClose(tfq_results, manual_padded_results, atol=1e-5)
+
+
+if __name__ == "__main__":
+    tf.test.main()
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op_cuquantum.cu.cc
new file mode 100644
index 000000000..5d4300fc5
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_simulate_sampled_expectation_op_cuquantum.cu.cc
@@ -0,0 +1,256 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <custatevec.h>
+
+#include <chrono>
+#include <memory>
+#include <vector>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/simmux_gpu.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/pauli_sum.pb.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::tensorflow::Status;
+using ::tfq::proto::PauliSum;
+using ::tfq::proto::Program;
+
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+
+// Kernel computing sampled expectation values of PauliSums with qsim's
+// cuStateVec simulator. The cuBLAS / cuStateVec handles are created once in
+// the constructor and released in the destructor.
+class TfqSimulateSampledExpectationOpCuQuantum : public tensorflow::OpKernel {
+ public:
+  explicit TfqSimulateSampledExpectationOpCuQuantum(
+      tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, random_gen_.Init(context));
+    // Creates the cuBLAS and cuStateVec handles (return codes are unchecked).
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqSimulateSampledExpectationOpCuQuantum() {
+    // Releases the handles created in the constructor.
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
+
+  // Validates the inputs, builds one fused qsim circuit per program and fills
+  // output 0 with one sampled expectation per (program, pauli_sum) pair.
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    const int num_inputs = context->num_inputs();
+    OP_REQUIRES(context, num_inputs == 5,
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Expected 5 inputs, got ", num_inputs, " inputs.")));
+
+    // Create the output Tensor.
+    const int output_dim_batch_size = context->input(0).dim_size(0);
+    const int output_dim_op_size = context->input(3).dim_size(1);
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_batch_size);
+    output_shape.AddDim(output_dim_op_size);
+
+    tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    auto output_tensor = output->matrix<float>();
+
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    std::vector<std::vector<PauliSum>> pauli_sums;
+    OP_REQUIRES_OK(context, GetProgramsAndNumQubits(context, &programs,
+                                                    &num_qubits, &pauli_sums));
+
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+
+    OP_REQUIRES(context, programs.size() == maps.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Number of circuits and symbol_values do not match. Got ",
+                    programs.size(), " circuits and ", maps.size(),
+                    " symbol values.")));
+
+    std::vector<std::vector<int>> num_samples;
+    OP_REQUIRES_OK(context, GetNumSamples(context, &num_samples));
+
+    OP_REQUIRES(context, num_samples.size() == pauli_sums.size(),
+                tensorflow::errors::InvalidArgument(absl::StrCat(
+                    "Dimension 0 of num_samples and pauli_sums do not match.",
+                    " Got ", num_samples.size(), " lists of sample sizes and ",
+                    pauli_sums.size(), " lists of pauli sums.")));
+
+    OP_REQUIRES(
+        context, context->input(4).dim_size(1) == context->input(3).dim_size(1),
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Dimension 1 of num_samples and pauli_sums do not match.", " Got ",
+            context->input(4).dim_size(1), " lists of sample sizes and ",
+            context->input(3).dim_size(1), " lists of pauli sums.")));
+
+    // Construct qsim circuits.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+
+    Status parse_status = ::tensorflow::Status();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local =
+            QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                   &qsim_circuits[i], &fused_circuits[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    ComputeLarge(num_qubits, fused_circuits, pauli_sums, num_samples, context,
+                 &output_tensor);
+  }
+
+ private:
+  cublasHandle_t cublas_handle_;
+  custatevecHandle_t custatevec_handle_;
+  tensorflow::GuardedPhiloxRandom random_gen_;
+
+  void ComputeLarge(
+      const std::vector<int>& num_qubits,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+      const std::vector<std::vector<PauliSum>>& pauli_sums,
+      const std::vector<std::vector<int>>& num_samples,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<float, 1>::Matrix* output_tensor) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCuStateVec<float>;
+    using StateSpace = Simulator::StateSpace;
+
+    // Begin simulation.
+    int largest_nq = 1;
+    Simulator sim = Simulator(cublas_handle_, custatevec_handle_);
+    StateSpace ss = StateSpace(cublas_handle_, custatevec_handle_);
+    auto sv = ss.Create(largest_nq);
+    auto scratch = ss.Create(largest_nq);
+
+    int largest_sum = 0;
+    for (const auto& sums : pauli_sums) {
+      for (const auto& sum : sums) {
+        largest_sum = std::max(largest_sum, sum.terms().size());
+      }
+    }
+    // If empty tensor is fed, just return.
+    if (fused_circuits.size() == 0) return;
+
+    auto local_gen = random_gen_.ReserveSamples32(
+        largest_sum * pauli_sums[0].size() * fused_circuits.size() + 1);
+    tensorflow::random::SimplePhilox rand_source(&local_gen);
+
+    // Simulate programs one by one. Parallelizing over state vectors
+    // we no longer parallelize over circuits. Each time we encounter
+    // a larger circuit we will grow the Statevector as necessary.
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        scratch = ss.Create(largest_nq);
+      }
+      // TODO: add heuristic here so that we do not always recompute
+      //  the state if there is a possibility that circuit[i] and
+      //  circuit[i + 1] produce the same state.
+      ss.SetStateZero(sv);
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+      for (size_t j = 0; j < pauli_sums[i].size(); j++) {
+        // (#679) An empty program gets -2 for every pauli sum.
+        if (fused_circuits[i].size() == 0) {
+          (*output_tensor)(i, j) = -2.0;
+          continue;
+        }
+        float exp_v = 0.0;
+        OP_REQUIRES_OK(context, ComputeSampledExpectationQsim(
+                                    pauli_sums[i][j], sim, ss, sv, scratch,
+                                    num_samples[i][j], rand_source, &exp_v));
+        (*output_tensor)(i, j) = exp_v;
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("TfqSimulateSampledExpectationCuquantum")
+                            .Device(tensorflow::DEVICE_CPU),
+                        TfqSimulateSampledExpectationOpCuQuantum);
+// Op signature; the shape function below enforces the input ranks.
+REGISTER_OP("TfqSimulateSampledExpectationCuquantum")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("pauli_sums: string")
+    .Input("num_samples: int32")
+    .SetIsStateful()
+    .Output("expectations: float")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      tensorflow::shape_inference::ShapeHandle pauli_sums_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 2, &pauli_sums_shape));
+
+      tensorflow::shape_inference::ShapeHandle num_samples_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 2, &num_samples_shape));
+      // Output shape: [programs dim 0, pauli_sums dim 1].
+      tensorflow::shape_inference::DimensionHandle output_rows =
+          c->Dim(programs_shape, 0);
+      tensorflow::shape_inference::DimensionHandle output_cols =
+          c->Dim(pauli_sums_shape, 1);
+      c->set_output(0, c->Matrix(output_rows, output_cols));
+
+      return ::tensorflow::Status();
+    });
+
+}  // namespace tfq
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_samples_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_simulate_samples_op_cuquantum.cu.cc
new file mode 100644
index 000000000..aea04e882
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_simulate_samples_op_cuquantum.cu.cc
@@ -0,0 +1,232 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <custatevec.h>
+#include <stdlib.h>
+
+#include <chrono>
+#include <string>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/simmux_gpu.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/random/simple_philox.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/util/guarded_philox_random.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/circuit_parser_qsim.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::tensorflow::Status;
+using ::tfq::proto::Program;
+
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+
+// Kernel that draws measurement samples from circuits simulated with qsim's
+// cuStateVec simulator; shorter circuits are padded with -2 in the output.
+class TfqSimulateSamplesOpCuQuantum : public tensorflow::OpKernel {
+ public:
+  explicit TfqSimulateSamplesOpCuQuantum(
+      tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context, random_gen_.Init(context));
+    // Creates the cuBLAS and cuStateVec handles (return codes are unchecked).
+    cublasCreate(&cublas_handle_);
+    custatevecCreate(&custatevec_handle_);
+  }
+
+  ~TfqSimulateSamplesOpCuQuantum() {
+    // Releases the handles created in the constructor.
+    cublasDestroy(cublas_handle_);
+    custatevecDestroy(custatevec_handle_);
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    DCHECK_EQ(4, context->num_inputs());
+
+    // Parse to Program Proto and num_qubits.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    OP_REQUIRES_OK(context,
+                   GetProgramsAndNumQubits(context, &programs, &num_qubits));
+
+    // Parse symbol maps for parameter resolution in the circuits.
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+    OP_REQUIRES(
+        context, maps.size() == programs.size(),
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Number of circuits and values do not match. Got ", programs.size(),
+            " circuits and ", maps.size(), " values.")));
+
+    int num_samples = 0;
+    OP_REQUIRES_OK(context, GetIndividualSample(context, &num_samples));
+
+    // Construct qsim circuits.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+
+    Status parse_status = ::tensorflow::Status();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local =
+            QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                   &qsim_circuits[i], &fused_circuits[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    // Find largest circuit for tensor size padding and allocate
+    // the output tensor.
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    const int output_dim_size = maps.size();
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_size);
+    output_shape.AddDim(num_samples);
+    output_shape.AddDim(max_num_qubits);
+
+    tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    auto output_tensor = output->tensor<int8_t, 3>();
+
+    if (num_samples == 0) {
+      return;  // bug in qsim dependency we can't control.
+    }
+
+    ComputeLarge(num_qubits, max_num_qubits, num_samples, fused_circuits,
+                 context, &output_tensor);
+  }
+
+ private:
+  cublasHandle_t cublas_handle_;
+  custatevecHandle_t custatevec_handle_;
+  tensorflow::GuardedPhiloxRandom random_gen_;
+
+  void ComputeLarge(
+      const std::vector<int>& num_qubits, const int max_num_qubits,
+      const int num_samples,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<int8_t, 3>::Tensor* output_tensor) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCuStateVec<float>;
+    using StateSpace = Simulator::StateSpace;
+    // Begin simulation.
+    int largest_nq = 1;
+    Simulator sim = Simulator(cublas_handle_, custatevec_handle_);
+    StateSpace ss = StateSpace(cublas_handle_, custatevec_handle_);
+    auto sv = ss.Create(largest_nq);
+
+    auto local_gen = random_gen_.ReserveSamples32(fused_circuits.size() + 1);
+    tensorflow::random::SimplePhilox rand_source(&local_gen);
+
+    // Simulate programs one by one. Parallelizing over state vectors
+    // we no longer parallelize over circuits. Each time we encounter
+    // a larger circuit we will grow the Statevector as necessary.
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+      }
+      ss.SetStateZero(sv);
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+      // Sample bit q_ind -> column (max_num_qubits - q_ind - 1); rest is -2.
+      auto samples = ss.Sample(sv, num_samples, rand_source.Rand32());
+      for (int j = 0; j < num_samples; j++) {
+        uint64_t q_ind = 0;
+        uint64_t mask = 1;
+        bool val = 0;
+        while (q_ind < nq) {
+          val = samples[j] & mask;
+          (*output_tensor)(
+              i, j, static_cast<ptrdiff_t>(max_num_qubits - q_ind - 1)) = val;
+          q_ind++;
+          mask <<= 1;
+        }
+        while (q_ind < max_num_qubits) {
+          (*output_tensor)(
+              i, j, static_cast<ptrdiff_t>(max_num_qubits - q_ind - 1)) = -2;
+          q_ind++;
+        }
+      }
+    }
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("TfqSimulateSamplesCuquantum").Device(tensorflow::DEVICE_CPU),
+    TfqSimulateSamplesOpCuQuantum);
+// Op signature; the shape function below enforces the input ranks.
+REGISTER_OP("TfqSimulateSamplesCuquantum")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Input("num_samples: int32")
+    .SetIsStateful()
+    .Output("samples: int8")
+    .Attr("seed: int = 0")
+    .Attr("seed2: int = 0")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      tensorflow::shape_inference::ShapeHandle programs_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_names_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names_shape));
+
+      tensorflow::shape_inference::ShapeHandle symbol_values_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values_shape));
+
+      tensorflow::shape_inference::ShapeHandle num_samples_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &num_samples_shape));
+
+      // [batch_size, n_samples, largest_n_qubits]; dim 0 comes from programs.
+      c->set_output(
+          0, c->MakeShape(
+                 {c->Dim(programs_shape, 0),
+                  tensorflow::shape_inference::InferenceContext::kUnknownDim,
+                  tensorflow::shape_inference::InferenceContext::kUnknownDim}));
+
+      return ::tensorflow::Status();
+    });
+
+}  // namespace tfq
diff --git a/tensorflow_quantum/core/ops/tfq_simulate_state_op_cuquantum.cu.cc b/tensorflow_quantum/core/ops/tfq_simulate_state_op_cuquantum.cu.cc
new file mode 100644
index 000000000..abdbe0867
--- /dev/null
+++ b/tensorflow_quantum/core/ops/tfq_simulate_state_op_cuquantum.cu.cc
@@ -0,0 +1,217 @@
+/* Copyright 2020 The TensorFlow Quantum Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <custatevec.h>
+
+#include <chrono>
+#include <string>
+#include <vector>
+
+#include "../qsim/lib/circuit.h"
+#include "../qsim/lib/gate_appl.h"
+#include "../qsim/lib/gates_cirq.h"
+#include "../qsim/lib/simmux_gpu.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/core/error_codes.pb.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow_quantum/core/ops/parse_context.h"
+#include "tensorflow_quantum/core/proto/program.pb.h"
+#include "tensorflow_quantum/core/src/circuit_parser_qsim.h"
+#include "tensorflow_quantum/core/src/util_qsim.h"
+
+namespace tfq {
+
+using ::tensorflow::Status;
+using ::tfq::proto::Program;
+
+// Gate type used by this op: qsim's Cirq gate set in single precision.
+typedef qsim::Cirq::GateCirq<float> QsimGate;
+// Circuit whose gates are QsimGate instances.
+typedef qsim::Circuit<QsimGate> QsimCircuit;
+
+// Simulates a batch of circuits with qsim's cuStateVec simulator and returns
+// the final state vector of every circuit.
+//
+// The output is a [batch_size, 2^max_num_qubits] complex64 matrix, where
+// max_num_qubits is the largest qubit count in the batch. In row i, entries
+// at index >= 2^num_qubits[i] are filled with the padding value (-2, 0).
+class TfqSimulateStateOpCuQuantum : public tensorflow::OpKernel {
+ public:
+  explicit TfqSimulateStateOpCuQuantum(
+      tensorflow::OpKernelConstruction* context)
+      : OpKernel(context) {
+    // Allocates handlers for initialization. A failed creation is reported as
+    // a kernel construction error instead of being silently ignored.
+    OP_REQUIRES(
+        context, cublasCreate(&cublas_handle_) == CUBLAS_STATUS_SUCCESS,
+        tensorflow::errors::Internal("Failed to create cuBLAS handle."));
+    OP_REQUIRES(
+        context,
+        custatevecCreate(&custatevec_handle_) == CUSTATEVEC_STATUS_SUCCESS,
+        tensorflow::errors::Internal("Failed to create cuStateVec handle."));
+  }
+
+  ~TfqSimulateStateOpCuQuantum() {
+    // Destroys handlers in sync with simulator lifetime. A handle is still
+    // null if its creation failed (or was never attempted) in the constructor.
+    if (cublas_handle_ != nullptr) {
+      cublasDestroy(cublas_handle_);
+    }
+    if (custatevec_handle_ != nullptr) {
+      custatevecDestroy(custatevec_handle_);
+    }
+  }
+
+  // Parses the inputs, builds one fused qsim circuit per program, allocates
+  // the output matrix and delegates the simulation to ComputeLarge().
+  void Compute(tensorflow::OpKernelContext* context) override {
+    // TODO (mbbrough): add more dimension checks for other inputs here.
+    DCHECK_EQ(3, context->num_inputs());
+
+    // Parse to Program Proto and num_qubits.
+    std::vector<Program> programs;
+    std::vector<int> num_qubits;
+    OP_REQUIRES_OK(context,
+                   GetProgramsAndNumQubits(context, &programs, &num_qubits));
+
+    // Parse symbol maps for parameter resolution in the circuits.
+    std::vector<SymbolMap> maps;
+    OP_REQUIRES_OK(context, GetSymbolMaps(context, &maps));
+    OP_REQUIRES(
+        context, maps.size() == programs.size(),
+        tensorflow::errors::InvalidArgument(absl::StrCat(
+            "Number of circuits and values do not match. Got ", programs.size(),
+            " circuits and ", maps.size(), " values.")));
+
+    // Construct qsim circuits.
+    std::vector<QsimCircuit> qsim_circuits(programs.size(), QsimCircuit());
+    std::vector<std::vector<qsim::GateFused<QsimGate>>> fused_circuits(
+        programs.size(), std::vector<qsim::GateFused<QsimGate>>({}));
+
+    // Shared status for the parallel parse below; updates to it are
+    // synchronized through p_lock by NESTED_FN_STATUS_SYNC.
+    Status parse_status = ::tensorflow::Status();
+    auto p_lock = tensorflow::mutex();
+    auto construct_f = [&](int start, int end) {
+      for (int i = start; i < end; i++) {
+        Status local =
+            QsimCircuitFromProgram(programs[i], maps[i], num_qubits[i],
+                                   &qsim_circuits[i], &fused_circuits[i]);
+        NESTED_FN_STATUS_SYNC(parse_status, local, p_lock);
+      }
+    };
+
+    const int num_cycles = 1000;
+    context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+        programs.size(), num_cycles, construct_f);
+    OP_REQUIRES_OK(context, parse_status);
+
+    // Find largest circuit for tensor size padding and allocate
+    // the output tensor.
+    int max_num_qubits = 0;
+    for (const int num : num_qubits) {
+      max_num_qubits = std::max(max_num_qubits, num);
+    }
+
+    const int output_dim_size = maps.size();
+    tensorflow::TensorShape output_shape;
+    output_shape.AddDim(output_dim_size);
+    // Shift in 64 bits: `1 << n` on a plain int overflows for n >= 31.
+    output_shape.AddDim(static_cast<int64_t>(1) << max_num_qubits);
+
+    tensorflow::Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));
+    tensorflow::TTypes<std::complex<float>, 1>::Matrix output_tensor =
+        output->matrix<std::complex<float>>();
+
+    ComputeLarge(num_qubits, max_num_qubits, fused_circuits, context,
+                 &output_tensor);
+  }
+
+ private:
+  // Library handles created in the constructor and released in the destructor.
+  cublasHandle_t cublas_handle_ = nullptr;
+  custatevecHandle_t custatevec_handle_ = nullptr;
+
+  // Simulates the circuits one after another and writes row i of
+  // `output_tensor` with the state of circuit i, padded with (-2, 0).
+  //
+  // Args:
+  //   num_qubits: number of qubits of each circuit.
+  //   max_num_qubits: largest entry of `num_qubits`; fixes the row width.
+  //   fused_circuits: fused gate list of each circuit.
+  //   context: kernel context, used for its CPU worker thread pool.
+  //   output_tensor: destination matrix, written in place.
+  void ComputeLarge(
+      const std::vector<int>& num_qubits, const int max_num_qubits,
+      const std::vector<std::vector<qsim::GateFused<QsimGate>>>& fused_circuits,
+      tensorflow::OpKernelContext* context,
+      tensorflow::TTypes<std::complex<float>, 1>::Matrix* output_tensor) {
+    // Instantiate qsim objects.
+    using Simulator = qsim::SimulatorCuStateVec<float>;
+    using StateSpace = Simulator::StateSpace;
+
+    // Begin simulation.
+    Simulator sim = Simulator(cublas_handle_, custatevec_handle_);
+    StateSpace ss = StateSpace(cublas_handle_, custatevec_handle_);
+    int largest_nq = 1;
+    auto sv = ss.Create(largest_nq);
+    // Host buffer receiving the state as interleaved (real, imag) floats.
+    std::vector<float> sv_host;
+    sv_host.resize(2 * (uint64_t(1) << largest_nq));
+
+    // Simulate programs one by one. Parallelizing over state vectors
+    // we no longer parallelize over circuits. Each time we encounter
+    // a larger circuit we will grow the Statevector as necessary.
+    for (size_t i = 0; i < fused_circuits.size(); i++) {
+      int nq = num_qubits[i];
+
+      if (nq > largest_nq) {
+        // need to switch to larger statespace.
+        largest_nq = nq;
+        sv = ss.Create(largest_nq);
+        sv_host.resize(2 * (uint64_t(1) << largest_nq));
+      }
+      ss.SetStateZero(sv);
+      for (size_t j = 0; j < fused_circuits[i].size(); j++) {
+        qsim::ApplyFusedGate(sim, fused_circuits[i][j], sv);
+      }
+
+      // Copy the whole GPU data to CPU memory once.
+      // Please don't use ss.GetAmpl(), because it copies amplitude
+      // one-by-one, which makes huge speed slowdown, even slower than CPU op.
+      ss.Copy(sv, sv_host.data());
+      // Parallel copy state vector information from qsim into tensorflow
+      // tensors. We need type conversions from 2 floats to std::complex.
+      // Each shard touches only its own [start, end) range so that no element
+      // is written by more than one worker and no work is repeated.
+      auto copy_f = [i, nq, output_tensor, &sv_host](uint64_t start,
+                                                     uint64_t end) {
+        // Indices below `crossover` hold amplitudes, the rest is padding.
+        const uint64_t crossover = uint64_t(1) << nq;
+        const uint64_t upper = std::min(end, crossover);
+
+        for (uint64_t j = start; j < upper; j++) {
+          (*output_tensor)(i, j) =
+              std::complex<float>(sv_host[2 * j], sv_host[2 * j + 1]);
+        }
+        for (uint64_t j = std::max(start, upper); j < end; j++) {
+          (*output_tensor)(i, j) = std::complex<float>(-2, 0);
+        }
+      };
+      const int num_cycles_copy = 50;
+      context->device()->tensorflow_cpu_worker_threads()->workers->ParallelFor(
+          uint64_t(1) << max_num_qubits, num_cycles_copy, copy_f);
+    }
+  }
+};
+
+// Registers TfqSimulateStateOpCuQuantum as the kernel implementing the
+// "TfqSimulateStateCuquantum" op. The kernel is registered for the CPU device
+// type; the simulation itself goes through the cuStateVec simulator.
+REGISTER_KERNEL_BUILDER(
+    Name("TfqSimulateStateCuquantum").Device(tensorflow::DEVICE_CPU),
+    TfqSimulateStateOpCuQuantum);
+
+// Op definition for "TfqSimulateStateCuquantum".
+//
+// Inputs:
+//   programs:      rank-1 string tensor.
+//   symbol_names:  rank-1 string tensor.
+//   symbol_values: rank-2 float tensor.
+// Output:
+//   state_vector: complex64 matrix whose first dimension is the batch size;
+//                 the second dimension is unknown at shape-inference time.
+REGISTER_OP("TfqSimulateStateCuquantum")
+    .Input("programs: string")
+    .Input("symbol_names: string")
+    .Input("symbol_values: float")
+    .Output("state_vector: complex64")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      using ::tensorflow::shape_inference::InferenceContext;
+      using ::tensorflow::shape_inference::ShapeHandle;
+
+      // Validate the rank of every input; only the programs shape is used
+      // afterwards, the other handles exist to receive the checked shapes.
+      ShapeHandle programs, symbol_names, symbol_values;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &programs));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &symbol_names));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 2, &symbol_values));
+
+      c->set_output(0, c->MakeShape({c->Dim(programs, 0),
+                                     InferenceContext::kUnknownDim}));
+
+      return ::tensorflow::Status();
+    });
+
+}  // namespace tfq
diff --git a/third_party/cuquantum/BUILD b/third_party/cuquantum/BUILD
new file mode 100644
index 000000000..e69de29bb
diff --git a/third_party/cuquantum/BUILD.tpl b/third_party/cuquantum/BUILD.tpl
new file mode 100644
index 000000000..ac6c5eb41
--- /dev/null
+++ b/third_party/cuquantum/BUILD.tpl
@@ -0,0 +1,23 @@
+package(default_visibility = ["//visibility:public"])
+
+# Header-only target. Its files come from the ":cuquantum_header_include"
+# genrule, which the repository rule writes into this file when the template
+# is instantiated. The "include" directory is added to the include path of
+# dependents.
+cc_library(
+    name = "cuquantum_headers",
+    linkstatic = 1,
+    srcs = [":cuquantum_header_include"],
+    includes = ["include"],
+    visibility = ["//visibility:public"],
+)
+
+# Shared-library target wrapping libcustatevec.so, which is produced by the
+# ":libcustatevec.so" genrule written by the repository rule. The rpath link
+# option points at the library directory substituted by the repository rule.
+cc_library(
+    name = "libcuquantum",
+    srcs = [
+        ":libcustatevec.so",
+    ],
+    linkopts = [
+        "-Wl,-rpath,%{CUQUANTUM_LIBRARY_PATH}",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+%{CUQUANTUM_HEADER_GENRULE}
+%{CUSTATEVEC_SHARED_LIBRARY_GENRULE}
\ No newline at end of file
diff --git a/third_party/cuquantum/cuquantum_configure.bzl b/third_party/cuquantum/cuquantum_configure.bzl
new file mode 100644
index 000000000..d48656939
--- /dev/null
+++ b/third_party/cuquantum/cuquantum_configure.bzl
@@ -0,0 +1,246 @@
+"""Setup cuQuantum as external dependency."""
+# Name of the environment variable that points at the cuQuantum install root.
+# The rule expects headers under <root>/include and libcustatevec.so under
+# <root>/lib when the variable is set.
+_CUQUANTUM_ROOT = "CUQUANTUM_ROOT"
+
+
+def _tpl(repository_ctx, tpl, substitutions = {}, out = None):
+    """Instantiates the template //third_party/cuquantum:<tpl>.tpl.
+    Args:
+      repository_ctx: the repository_ctx object
+      tpl: string, template name without the ".tpl" suffix
+      substitutions: dict of placeholder -> replacement text
+      out: string, output path; defaults to `tpl` when empty or None
+    """
+    destination = out or tpl
+    template_label = Label("//third_party/cuquantum:%s.tpl" % tpl)
+    repository_ctx.template(destination, template_label, substitutions)
+
+
+def _fail(msg):
+    """Aborts repository setup with a red-highlighted error message.
+    Args:
+      msg: string, description of the configuration problem
+    """
+    red = "\033[0;31m"
+    no_color = "\033[0m"
+
+    # This rule configures cuQuantum, so the prefix names cuQuantum rather
+    # than Python.
+    fail("%scuQuantum Configuration Error:%s %s\n" % (red, no_color, msg))
+
+
+def _warn(msg):
+    """Prints a highlighted auto-configuration warning without aborting.
+    Args:
+      msg: string, text of the warning
+    """
+    highlight = "\033[1;33m"
+    reset = "\033[0m"
+    text = "\n%sAuto-Configuration Warning:%s %s\n" % (highlight, reset, msg)
+    print(text)
+
+
+def _execute(
+        repository_ctx,
+        cmdline,
+        error_msg = None,
+        error_details = None,
+        empty_stdout_fine = False):
+    """Runs a command and fails the repository rule if it looks unsuccessful.
+    A run counts as failed when it wrote anything to stderr, or when stdout is
+    empty and `empty_stdout_fine` is False.
+    Args:
+      repository_ctx: the repository_ctx object
+      cmdline: list of strings, the command to execute
+      error_msg: string, a summary of the error if the command fails
+      error_details: string, details about the error or steps to fix it
+      empty_stdout_fine: bool, if True, an empty stdout result is fine, otherwise
+        it's an error
+    Return:
+      the result of repository_ctx.execute(cmdline)
+    """
+    result = repository_ctx.execute(cmdline)
+    stdout_acceptable = empty_stdout_fine or result.stdout
+    if not result.stderr and stdout_acceptable:
+        return result
+
+    summary = error_msg.strip() if error_msg else "Repository command failed"
+    details = error_details if error_details else ""
+    _fail("\n".join([summary, result.stderr.strip(), details]))
+    return result
+
+
+def _read_dir(repository_ctx, src_dir):
+    """Lists every regular file below `src_dir`.
+    Runs `find` over the directory, descending into subfolders and following
+    symlinks.
+    Args:
+      repository_ctx: the repository_ctx object
+      src_dir: string, directory to scan
+    Returns:
+      The stdout of `find`: one full file path per line. An empty listing is
+      not treated as an error.
+    """
+    find_cmd = ["find", src_dir, "-follow", "-type", "f"]
+    return _execute(repository_ctx, find_cmd, empty_stdout_fine = True).stdout
+
+
+def _find_file(repository_ctx, filename):
+    """Searches the whole filesystem for a file called `filename`.
+    The search is limited to 10 seconds and stops at the first match.
+    Args:
+      repository_ctx: the repository_ctx object
+      filename: string, base name of the file to look for
+    Returns:
+      The full path of the first match (including `filename`), or "" when
+      nothing was found before the timeout.
+    """
+
+    # NOTE: the original command carried `-not -path '*/.*' -quit` after the
+    # first `-quit`. find evaluates its expression left to right and exits at
+    # the first `-quit`, so those predicates never ran; they are dropped here
+    # without changing which file is found. Hidden directories are therefore
+    # still searched (e.g. user-level installs below ~/.local).
+    stdout = repository_ctx.execute(
+        ["timeout", "10", "find", "/", "-name", filename, "-print", "-quit"],
+    ).stdout
+
+    # Keep only the first printed path and drop the trailing newline. Cutting
+    # at the first occurrence of `filename` would truncate paths in which a
+    # directory name contains the file name.
+    lines = stdout.splitlines()
+    return lines[0].strip() if lines else ""
+
+
+def _genrule(genrule_name, command, outs):
+    """Renders the BUILD-file text of a genrule target.
+    Args:
+        genrule_name: A unique name for genrule target.
+        command: The shell command placed in the rule's `cmd` attribute.
+        outs: Pre-formatted text for the body of the `outs` list, i.e. quoted
+          file names separated by commas/newlines.
+    Returns:
+        A string holding the complete genrule definition.
+    """
+    rendered_lines = [
+        "genrule(",
+        '    name = "' + genrule_name + '",',
+        "    outs = [",
+        outs,
+        "    ],",
+        '    cmd = """',
+        command,
+        '   """,',
+        ")",
+    ]
+    return "\n".join(rendered_lines) + "\n"
+
+def _norm_path(path):
+    """Returns `path` with '/' separators and without one trailing slash.
+    Args:
+      path: string, a path that may use backslashes as separators
+    Returns:
+      The path with every backslash replaced by '/' and a single trailing '/'
+      removed. An empty string is returned unchanged.
+    """
+    path = path.replace("\\", "/")
+
+    # endswith() is safe on an empty string, unlike indexing with path[-1].
+    if path.endswith("/"):
+        path = path[:-1]
+    return path
+
+
+def _symlink_genrule_for_dir(
+        repository_ctx,
+        src_dir,
+        dest_dir,
+        genrule_name,
+        src_files = [],
+        dest_files = [],
+        is_empty_genrule = False):
+    """Returns the text of a genrule that copies a set of files into the repo.
+    When `src_dir` is given, the file list is read from that directory and the
+    destination names are the paths relative to it. Otherwise `src_files` and
+    `dest_files` are used as given. The files are copied with `cp -f`.
+    Examples of the generated rules:
+    ```
+    genrule(
+        name = "cuquantum_header_include",
+        outs = [
+            "include/custatevec.h",
+            "include/cutensornet.h",
+            "include/cutensornet/types.h",
+            "include/cutensornet/typesDistributed.h",
+        ],
+        cmd = [some copy command lines based on users' local environment],
+    )
+    genrule(
+        name = "libcustatevec.so",
+        outs = [
+            "libcustatevec.so",
+        ],
+        cmd = [some copy command lines based on users' local environment],
+    )
+    ```
+    Args:
+        repository_ctx: the repository_ctx object.
+        src_dir: source directory, or None to use src_files/dest_files.
+        dest_dir: directory (relative to the rule output dir) to copy into.
+        genrule_name: genrule name.
+        src_files: list of source files instead of src_dir.
+        dest_files: list of corresponding destination files.
+        is_empty_genrule: if True, emit a placeholder rule that only touches
+          a single output file instead of copying anything.
+    Returns:
+        genrule target text that performs the copies.
+    """
+    if is_empty_genrule:
+        # Placeholder rule: one output that is merely touched.
+        if dest_dir == "":
+            placeholder = genrule_name
+        else:
+            placeholder = "%s/%s.h" % (dest_dir, genrule_name)
+        return _genrule(
+            genrule_name,
+            "touch $(OUTS)",
+            "'%s'" % (placeholder),
+        )
+
+    if src_dir != None:
+        src_dir = _norm_path(src_dir)
+        dest_dir = _norm_path(dest_dir)
+        listing = "\n".join(sorted(_read_dir(repository_ctx, src_dir).splitlines()))
+
+        # Destination names are the listed paths with the source dir removed.
+        dest_files = listing.replace(src_dir, "").splitlines()
+        src_files = listing.splitlines()
+
+    # If we have only one file to copy we do not want to use the dest_dir, as
+    # $(@D) will include the full path to the file.
+    single_output = len(dest_files) == 1
+    copy_tool = "cp -f"
+    copy_commands = []
+    declared_outs = []
+
+    for index, relative_dest in enumerate(dest_files):
+        if relative_dest == "":
+            continue
+        if single_output:
+            target = "$(@D)/" + relative_dest
+        else:
+            target = "$(@D)/" + dest_dir + relative_dest
+
+        # Copy the headers to create a sandboxable setup.
+        copy_commands.append(copy_tool + ' "%s" "%s"' % (src_files[index], target))
+        declared_outs.append('        "' + dest_dir + relative_dest + '",')
+
+    return _genrule(
+        genrule_name,
+        " && ".join(copy_commands),
+        "\n".join(declared_outs),
+    )
+
+
+def _parent_dir(path):
+    """Returns `path` without its last '/'-separated component, or ""."""
+    cut = path.rfind("/")
+    return path[:cut] if cut > 0 else ""
+
+
+def _cuquantum_pip_impl(repository_ctx):
+    """Implementation of the cuquantum_configure repository rule.
+    Locates the cuQuantum headers and libcustatevec.so, either below the
+    directory named by the CUQUANTUM_ROOT environment variable or, when that
+    is unset or empty, by searching the filesystem. It then instantiates the
+    BUILD template with genrules that copy those files into the repository.
+    When the files cannot be located, placeholder genrules are generated.
+    Args:
+        repository_ctx: the repository_ctx object.
+    """
+
+    # Read the variable without writing to repository_ctx.os.environ.
+    cuquantum_root = repository_ctx.os.environ.get(_CUQUANTUM_ROOT, "")
+    if cuquantum_root == "":
+        # CUQUANTUM_ROOT is empty. Let's find the library root path lazily.
+        cuquantum_header_path = _parent_dir(
+            _find_file(repository_ctx, "custatevec.h"),
+        )
+        custatevec_shared_library_path = _find_file(repository_ctx, "libcustatevec.so")
+
+        # Derive the directories from the path that was actually found. A
+        # substring search for "/lib/lib" returns -1 for layouts such as
+        # ".../lib64/libcustatevec.so", which then yields a bogus root.
+        cuquantum_library_path = _parent_dir(custatevec_shared_library_path)
+        cuquantum_root = _parent_dir(cuquantum_library_path)
+        if cuquantum_root == "":
+            _warn("'CUQUANTUM_ROOT' environment variable is not set, no library was found too. If it is CPU mode, please ignore this warning")
+        else:
+            _warn("'CUQUANTUM_ROOT' environment variable is not set, using '%s' as default" % cuquantum_root)
+    else:
+        cuquantum_header_path = "%s/include" % cuquantum_root
+        cuquantum_library_path = "%s/lib" % cuquantum_root
+        custatevec_shared_library_path = "%s/lib/libcustatevec.so" % (cuquantum_root)
+
+    if cuquantum_library_path == "":
+        # Nothing was found; keep the conventional <root>/lib form for rpath.
+        cuquantum_library_path = "%s/lib" % (cuquantum_root)
+
+    # Without both the headers and the library, emit placeholder genrules.
+    is_empty_genrule = cuquantum_header_path == "" or custatevec_shared_library_path == ""
+
+    cuquantum_header_rule = _symlink_genrule_for_dir(
+        repository_ctx,
+        cuquantum_header_path,
+        "include",
+        "cuquantum_header_include",
+        is_empty_genrule = is_empty_genrule,
+    )
+
+    custatevec_shared_library_rule = _symlink_genrule_for_dir(
+        repository_ctx,
+        None,
+        "",
+        "libcustatevec.so",
+        [custatevec_shared_library_path],
+        ["libcustatevec.so"],
+        is_empty_genrule = is_empty_genrule,
+    )
+
+    _tpl(repository_ctx, "BUILD", {
+        "%{CUQUANTUM_LIBRARY_PATH}": cuquantum_library_path,
+        "%{CUQUANTUM_HEADER_GENRULE}": cuquantum_header_rule,
+        "%{CUSTATEVEC_SHARED_LIBRARY_GENRULE}": custatevec_shared_library_rule,
+    })
+
+
+# Repository rule that sets up cuQuantum as an external dependency. The
+# CUQUANTUM_ROOT environment variable is declared in `environ` as an input of
+# the rule.
+cuquantum_configure = repository_rule(
+    implementation = _cuquantum_pip_impl,
+    environ = [
+        _CUQUANTUM_ROOT,
+    ],
+)
\ No newline at end of file