diff --git a/.github/workflows/release_wheel.yml b/.github/workflows/release_wheel.yml
new file mode 100644
index 00000000..60fab503
--- /dev/null
+++ b/.github/workflows/release_wheel.yml
@@ -0,0 +1,106 @@
+# Adapted from https://github.com/punica-ai/punica/blob/591b59899f0a20760821785d06b331c8a2e5cb86/.github/workflows/release_wheel.yml
+name: Release
+on:
+  workflow_call:
+    inputs:
+      tag_name:
+        required: true
+        type: string
+    secrets:
+      WHL_TOKEN:
+        required: true
+      # PYPI_TEST_TOKEN:
+      #   required: true
+
+env:
+  TORCH_CUDA_ARCH_LIST: "8.0 8.6 8.9 9.0+PTX"
+  FLASHINFER_CI_TORCH_VERSION: "2.1.0"
+
+jobs:
+  build:
+    strategy:
+      fail-fast: false
+      matrix:
+        python: ["3.10", "3.11", "3.12"]
+        cuda: ["11.8", "12.1"]
+    runs-on: [self-hosted]
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Build wheel
+        run: |
+          chown -R $CI_UID:$CI_GID "$GITHUB_WORKSPACE"
+          version="$(cat version.txt)"
+          docker run --rm -t \
+            -v "$CI_RUNNER_CACHE_DIR":/ci-cache \
+            -v "$GITHUB_WORKSPACE":/app \
+            -e FLASHINFER_CI_PYTHON_VERSION=${{ matrix.python }} \
+            -e FLASHINFER_CI_CUDA_VERSION=${{ matrix.cuda }} \
+            -e FLASHINFER_CI_TORCH_VERSION=$FLASHINFER_CI_TORCH_VERSION \
+            -e FLASHINFER_BUILD_VERSION=$version \
+            -e TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" \
+            --user $CI_UID:$CI_GID \
+            pytorch/manylinux-builder:cuda${{ matrix.cuda }} \
+            bash /app/scripts/run-ci-build-wheel.sh
+        timeout-minutes: 120
+      - run: du -h python/dist/*
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: wheel-cuda${{ matrix.cuda }}-python${{ matrix.python }}
+          path: python/dist/*
+
+  release:
+    needs: build
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          path: python/dist/
+          merge-multiple: true
+          pattern: wheel-*
+
+      - run: ls -lah python/dist/
+
+      - uses: softprops/action-gh-release@v1
+        with:
+          tag_name: ${{ inputs.tag_name }}
+          files: |
+            python/dist/flashinfer-*.whl
+            python/dist/flashinfer-*.tar.gz
+
+      - name: Clone wheel index
+        run: git clone https://oauth2:${WHL_TOKEN}@github.com/flashinfer-ai/whl.git flashinfer-whl
+        env:
+          WHL_TOKEN: ${{ secrets.WHL_TOKEN }}
+
+      - name: Update wheel index
+        shell: python
+        run: |
+          import pathlib
+          import hashlib
+          import re
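+          # For each wheel, append an <a href="...#sha256=..."> entry to the matching
+          # per-CUDA-version index page in the flashinfer-whl checkout.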
+          for path in sorted(pathlib.Path("python/dist").glob("*.whl")):
+              with open(path, "rb") as f:
+                  sha256 = hashlib.sha256(f.read()).hexdigest()
+              ver, cu = re.findall(r"flashinfer-([0-9.]+)\+cu(\d+)-", path.name)[0]
+              with open(f"flashinfer-whl/cu{cu}/flashinfer/index.html", "a") as f:
+                  f.write(f'<a href="{path.name}#sha256={sha256}">{path.name}</a><br>\n')
+
+      - name: Push wheel index
+        run: |
+          cd flashinfer-whl
+          git config --local user.name "github-actions[bot]"
+          git config --local user.email "41898282+github-actions[bot]@users.noreply.github.com"
+          git add -A
+          git commit -m "update whl"
+          git push
+
+      # - name: Upload sdist to pypi
+      #   run: |
+      #     pip install twine
+      #     python -m twine upload --repository testpypi --username=__token__ dist/*.tar.gz
+      #   env:
+      #     TWINE_PASSWORD: ${{ secrets.PYPI_TEST_TOKEN }}
\ No newline at end of file
diff --git a/python/MANIFEST.in b/python/MANIFEST.in
new file mode 100644
index 00000000..5b44b48e
--- /dev/null
+++ b/python/MANIFEST.in
@@ -0,0 +1,12 @@
+# sdist & wheel
+include version.txt
+recursive-include include *
+recursive-include csrc *
+
+# wheel-only
+exclude flashinfer/_build_meta.py
+exclude tests/
+
+# Unneeded files
+prune */__pycache__
+global-exclude *.so
diff --git a/python/flashinfer/__init__.py b/python/flashinfer/__init__.py
index 385c7865..39242819 100644
--- a/python/flashinfer/__init__.py
+++ b/python/flashinfer/__init__.py
@@ -28,5 +28,3 @@
     BatchDecodeWithPagedKVCacheWrapper,
     BatchPrefillWithPagedKVCacheWrapper,
 )
-
-__version__ = "0.0.1"
diff --git a/python/include b/python/include
new file mode 120000
index 00000000..3a1af68f
--- /dev/null
+++ b/python/include
@@ -0,0 +1 @@
+../include/
\ No newline at end of file
diff --git a/python/setup.py b/python/setup.py
index b65c3682..de01bc61 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -14,19 +14,45 @@
 limitations under the License.
 """
 import pathlib
+import os
+import re
+import datetime
+import subprocess
+import platform
 import setuptools
+import torch
 import torch.utils.cpp_extension as torch_cpp_ext

 root = pathlib.Path(__name__).parent


-def get_version(path):
-    with open(path) as f:
-        for line in f:
-            if line.startswith("__version__"):
-                return line.split("=", maxsplit=1)[1].replace('"', "").strip()
-    raise ValueError("Version not found")
+def get_version():
+    # Honor the version injected by CI (e.g. "0.0.1+cu121"); fall back to version.txt.
+    version = os.environ.get("FLASHINFER_BUILD_VERSION")
+    if version is None:
+        with open(root / "version.txt") as f:
+            version = f.read().strip()
+    return version
+
+
+def get_cuda_version() -> tuple[int, int]:
+    if torch_cpp_ext.CUDA_HOME is None:
+        nvcc = "nvcc"
+    else:
+        nvcc = os.path.join(torch_cpp_ext.CUDA_HOME, "bin/nvcc")
+    txt = subprocess.check_output([nvcc, "--version"], text=True)
+    major, minor = map(int, re.findall(r"release (\d+)\.(\d+),", txt)[0])
+    return major, minor
+
+
+def generate_build_meta() -> None:
+    d = {}
+    version = get_version()
+    d["cuda_major"], d["cuda_minor"] = get_cuda_version()
+    d["torch"] = torch.__version__
+    d["python"] = platform.python_version()
+    d["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", None)
+    with open(root / "flashinfer/_build_meta.py", "w") as f:
+        f.write(f"__version__ = {version!r}\n")
+        f.write(f"build_meta = {d!r}")


 def remove_unwanted_pytorch_nvcc_flags():
@@ -43,38 +69,40 @@
         pass


-remove_unwanted_pytorch_nvcc_flags()
-ext_modules = []
-ext_modules.append(
-    torch_cpp_ext.CUDAExtension(
-        name="flashinfer.ops._kernels",
-        sources=[
-            "csrc/single_decode.cu",
-            "csrc/single_prefill.cu",
-            "csrc/cascade.cu",
-            "csrc/batch_decode.cu",
-            "csrc/flashinfer_ops.cu",
-            "csrc/batch_prefill.cu",
-        ],
-        include_dirs=[
-            str(root.resolve().parent / "include"),
-        ],
-        extra_compile_args={
-            "cxx": ["-O3"],
-            "nvcc": ["-O3", "--threads", "8"],
-        },
+if __name__ == "__main__":
+    remove_unwanted_pytorch_nvcc_flags()
+    generate_build_meta()
+    ext_modules = []
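+    # nvcc builds the extension for each arch in TORCH_CUDA_ARCH_LIST (exported by
+    # the release workflow); if unset, torch targets the locally visible GPUs.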
name="flashinfer.ops._kernels", + sources=[ + "csrc/single_decode.cu", + "csrc/single_prefill.cu", + "csrc/cascade.cu", + "csrc/batch_decode.cu", + "csrc/flashinfer_ops.cu", + "csrc/batch_prefill.cu", + ], + include_dirs=[ + str(root.resolve() / "include"), + ], + extra_compile_args={ + "cxx": ["-O3"], + "nvcc": ["-O3", "--threads", "8"], + }, + ) + ) + + setuptools.setup( + name="flashinfer", + version=get_version(), + packages=setuptools.find_packages(), + author="FlashInfer team", + license="Apache License 2.0", + description="FlashInfer: Kernel Library for LLM Serving", + url="https://github.com/flashinfer-ai/flashinfer", + python_requires=">=3.9", + ext_modules=ext_modules, + cmdclass={"build_ext": torch_cpp_ext.BuildExtension}, ) -) - -setuptools.setup( - name="flashinfer", - version=get_version(root / "flashinfer/__init__.py"), - packages=setuptools.find_packages(), - author="FlashInfer team", - license="Apache License 2.0", - description="FlashInfer: Kernel Library for LLM Serving", - url="https://github.com/flashinfer-ai/flashinfer", - python_requires=">=3.9", - ext_modules=ext_modules, - cmdclass={"build_ext": torch_cpp_ext.BuildExtension}, -) diff --git a/python/version.txt b/python/version.txt new file mode 120000 index 00000000..aa4e5bec --- /dev/null +++ b/python/version.txt @@ -0,0 +1 @@ +../version.txt \ No newline at end of file diff --git a/scripts/ci-flashinfer.env.example b/scripts/ci-flashinfer.env.example new file mode 100644 index 00000000..d7c879a6 --- /dev/null +++ b/scripts/ci-flashinfer.env.example @@ -0,0 +1,8 @@ +RUNNER_SCOPE=repo +REPO_URL=https://github.com/flashinfer-ai/flashinfer +#LABELS=gpu,sm80 +ACCESS_TOKEN=foo-access-token +RUNNER_WORKDIR=/tmp/ci-flashinfer +CI_RUNNER_CACHE_DIR=/data/ci-flashinfer-cache +DISABLE_AUTO_UPDATE=1 +EPHEMERAL=1 diff --git a/scripts/ci-flashinfer.service b/scripts/ci-flashinfer.service new file mode 100644 index 00000000..81e04080 --- /dev/null +++ b/scripts/ci-flashinfer.service @@ -0,0 +1,27 @@ +# https://github.com/myoung34/docker-github-actions-runner/wiki/Usage +# Install with: +# install -m 644 ci-flashinfer.service $HOME/.config/systemd/user/ +# systemctl --user daemon-reload +# Run with: +# systemctl --user start ci-flashinfer +# Stop with: +# systemctl --user stop ci-flashinfer +# See live logs with: +# journalctl -f -u ci-flashinfer.service --no-hostname --no-tail +[Unit] +Description=Ephemeral GitHub Actions Runner Container for flashinfer-ai/flashinfer +[Service] +TimeoutStartSec=0 +Restart=always +ExecStartPre=-/usr/bin/docker stop %N +ExecStartPre=-/usr/bin/docker rm %N +ExecStartPre=-/usr/bin/docker pull myoung34/github-runner:latest +ExecStart=/usr/bin/docker run --rm \ + --env-file %h/.config/ci-flashinfer.env \ + -e RUNNER_NAME=%H \ + -e CI_UID=%U \ + -e CI_GID=%G \ + -v /var/run/docker.sock:/var/run/docker.sock \ + -v /tmp/ci-flashinfer:/tmp/ci-flashinfer \ + --name %N \ + myoung34/github-runner:latest diff --git a/scripts/run-ci-build-wheel.sh b/scripts/run-ci-build-wheel.sh new file mode 100644 index 00000000..16750526 --- /dev/null +++ b/scripts/run-ci-build-wheel.sh @@ -0,0 +1,46 @@ +#!/bin/bash +# adapted from https://github.com/punica-ai/punica/blob/591b59899f0a20760821785d06b331c8a2e5cb86/ci/run-ci-build-wheel.bash +set -e + +assert_env() { + local var_name="$1" + if [ -z "${!var_name}" ]; then + echo "Error: Environment variable '$var_name' is not set." 
+assert_env FLASHINFER_CI_PYTHON_VERSION
+assert_env FLASHINFER_CI_TORCH_VERSION
+assert_env FLASHINFER_CI_CUDA_VERSION
+assert_env FLASHINFER_BUILD_VERSION
+assert_env TORCH_CUDA_ARCH_LIST
+PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+export CONDA_pkgs_dirs=/ci-cache/conda-pkgs
+export XDG_CACHE_HOME=/ci-cache/xdg-cache
+mkdir -p "$CONDA_pkgs_dirs" "$XDG_CACHE_HOME"
+export HOME=/tmp/home
+mkdir -p $HOME
+export PATH="$HOME/.local/bin:$PATH"
+CUDA_MAJOR="${FLASHINFER_CI_CUDA_VERSION%.*}"
+CUDA_MINOR="${FLASHINFER_CI_CUDA_VERSION#*.}"
+PYVER="${FLASHINFER_CI_PYTHON_VERSION//./}"
+export PATH="/opt/python/cp${PYVER}-cp${PYVER}/bin:$PATH"
+
+
+echo "::group::Install PyTorch"
+pip install torch==$FLASHINFER_CI_TORCH_VERSION --index-url "https://download.pytorch.org/whl/cu${CUDA_MAJOR}${CUDA_MINOR}"
+echo "::endgroup::"
+
+echo "::group::Install build system"
+pip install ninja numpy
+pip install --upgrade setuptools wheel build
+echo "::endgroup::"
+
+
+echo "::group::Build wheel for FlashInfer"
+cd "$PROJECT_ROOT/python"
+FLASHINFER_BUILD_VERSION="${FLASHINFER_BUILD_VERSION}+cu${CUDA_MAJOR}${CUDA_MINOR}" python -m build --no-isolation
+rm -f dist/*.tar.gz
+python -m build --no-isolation --sdist
+echo "::endgroup::"
diff --git a/version.txt b/version.txt
new file mode 100644
index 00000000..8a9ecc2e
--- /dev/null
+++ b/version.txt
@@ -0,0 +1 @@
+0.0.1
\ No newline at end of file