Skip to content

Refactoring pairwise into pairwise and gemm #24

New issue

Have a question about this project? # for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “#”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? # to your account

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
44 changes: 44 additions & 0 deletions gemm/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Authors: James Bergstra
# License: MIT
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add a description of the benchmark and some motivation in a the module level docstring.


"""Computes the GEMM routine from the BLAS standard.

BLAS defines the xGEMM (DGEMM, SGEMM, CGEMM, ZGEMM) operations as
dense matrix multiplication and accumulation as follows:

C <- alpha A x B + beta C

Here (alpha, beta) are scalars and (A, B, C) are matrices.

This operation is one of the most widely-used and most-studied kernels in
high-performance computing, and BLAS implementations such as OpenBLAS, ATLAS,
and MKL provide highly optimized implementations of this operation. GEMM
implementations provide a real-life measure of peak performance on a
particular platform.

Note that the GEMM interface does not actually describe an algorithm, and the
standard does not require particular numerical accuracy. There are sub-cubic
algorithms (e.g. Strassen), and there are also cubic algorithms that are
"blocked" to be more cache-friendly. I believe that OpenBLAS and ATLAS use
blocked cubic algorithms, based on the asymptotic GFLOP/s attributed to MKL,
I would guess it uses blocked cubic algorithms too.

My hope with this benchmark is that it can be used to develop fast, readable
GEMM implementations. I'm curious, for example, if a readable, blocked
algorithm in pure Python could be compiled to a reasonable-performing
implementation.

"""

import numpy as np


def make_env(M=512, N=512, K=512, seed=0, dtype=np.float64,
alpha=1.5,
beta=0.5):
rng = np.random.RandomState(seed)
A = np.asarray(rng.normal(size=(M, K)), dtype=dtype)
B = np.asarray(rng.normal(size=(K, N)), dtype=dtype)
C = np.asarray(rng.normal(size=(M, N)), dtype=dtype)
return (alpha, A, B, beta, C), {}

32 changes: 32 additions & 0 deletions gemm/gemm_cython.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@

# Authors: Jake Vanderplas, Olivier Grisel
# License: MIT

import numpy as np
cimport cython
from libc.math cimport sqrt

@cython.boundscheck(False)
@cython.wraparound(False)
def gemm_cython_for_loops(
double alpha,
double[:, ::1] A,
double[:, ::1] B,
double beta,
double[:, ::1] C,
):
cdef int M = C.shape[0]
cdef int N = C.shape[1]
cdef int K = A.shape[1]
cdef double tmp, d
for i in range(M):
for j in range(N):
d = 0.0
for k in range(K):
d += A[i, k] * B[k, j]
C[i, j] = alpha * d + beta * C[i, j]


benchmarks = (
gemm_cython_for_loops,
)
11 changes: 11 additions & 0 deletions gemm/gemm_numba.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Authors: Olivier Grisel
# License: MIT

from gemm import gemm_python
from numba import autojit


benchmarks = (
("gemm_numba_nested_for_loops",
autojit(gemm_python.gemm_python_nested_for_loops)),
)
13 changes: 13 additions & 0 deletions gemm/gemm_parakeet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Authors: Olivier Grisel
# License: MIT

from gemm import gemm_python
from parakeet import jit


benchmarks = (
("gemm_parakeet_nested_for_loops",
jit(gemm_python.gemm_python_nested_for_loops)),
("gemm_parakeet_inner_numpy",
jit(gemm_python.gemm_python_inner_numpy)),
)
10 changes: 10 additions & 0 deletions gemm/gemm_pyopencl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Author: James Bergstra
# License: MIT

# -- https://github.com/jaberg/python-benchmarks-pyopencl
from pybench_pyopencl import gemm_pyopencl

benchmarks = (
gemm_pyopencl.gemm_pyopencl_cpu,
)

45 changes: 45 additions & 0 deletions gemm/gemm_python.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Authors: James Bergstra
# License: MIT

import numpy as np


# -- too slow to run directly, but good for JIT by other systems
def gemm_python_nested_for_loops(alpha, A, B, beta, C):
M, N = C.shape
K = A.shape[1]
#"omp parallel for private(j, d, k, tmp)"
for i in range(M):
for j in range(N):
d = 0.0
for k in range(K):
tmp = A[i, k] * B[j, k]
d += tmp
C[i, j] = alpha * d + beta * C[i, j]
return C


def gemm_python_inner_numpy(alpha, A, B, beta, C):
M, N = C.shape
for i in xrange(M):
for j in xrange(N):
C[i, j] *= beta
C[i, j] += alpha * np.dot(A[i, :], B[:, j])
return C


def gemm_python_broadcast_numpy(alpha, A, B, beta, C):
return alpha * (A[:, None, :] * B.T[None, :, :]).sum(axis=2) + beta * C


def gemm_python_numpy_dot(alpha, A, B, beta, C):
C *= beta
C += alpha * np.dot(A, B)
return C


benchmarks = (
gemm_python_inner_numpy,
gemm_python_broadcast_numpy,
gemm_python_numpy_dot,
)
45 changes: 45 additions & 0 deletions gemm/gemm_theano.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Authors: James Bergstra
# License: MIT
import theano
import theano.tensor as TT


def gemm_theano_tensor_prepare(dtype):
alpha = TT.scalar(dtype=str(dtype))
beta = TT.scalar(dtype=str(dtype))
A = TT.matrix(dtype=str(dtype))
B = TT.matrix(dtype=str(dtype))
C = TT.matrix(dtype=str(dtype))
Z = alpha * TT.sum(A[:, None, :] * B, axis=2) + beta * C
name = 'gemm_theano_broadcast_' + dtype
Cin = theano.In(C, mutable=True, borrow=True)
rval = theano.function([alpha, A, B, beta, Cin],
theano.Out(Z, borrow=True),
allow_input_downcast=True, name=name)
rval.__name__ = name
return rval


def gemm_theano_blas_prepare(dtype):
alpha = TT.scalar(dtype=str(dtype))
beta = TT.scalar(dtype=str(dtype))
A = TT.matrix(dtype=str(dtype))
B = TT.matrix(dtype=str(dtype))
C = TT.matrix(dtype=str(dtype))
Z = alpha * TT.dot(A, B) + beta * C
Cin = theano.In(C, mutable=True, borrow=True)
name = 'gemm_theano_blas_' + dtype
rval = theano.function([alpha, A, B, beta, Cin],
theano.Out(Z, borrow=True),
allow_input_downcast=True, name=name)
rval.__name__ = name
return rval


benchmarks = (
#gemm_theano_tensor_prepare('float32'),
gemm_theano_tensor_prepare('float64'),
#gemm_theano_blas_prepare('float32'),
gemm_theano_blas_prepare('float64'),
)

10 changes: 10 additions & 0 deletions pairwise/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,16 @@
# Authors: Olivier Grisel
# License: MIT

"""Computes the Euclidean distance between each pair of rows in a matrix.

In LaTeX:
Y[i, j] = sqrt{ \sum_k (A[i, k] - B[j, k])^2 }

This computation is a core routine of many machine learning algorithms that
rely on neighbourhood computations.

"""

import numpy as np


Expand Down
104 changes: 8 additions & 96 deletions pairwise/pairwise_pyopencl.py
Original file line number Diff line number Diff line change
@@ -1,105 +1,17 @@
# Authors: James Bergstra
# Author: James Bergstra
# License: MIT

import numpy as np
import time
import pyopencl as cl
import numpy

mf = cl.mem_flags
# -- https://github.com/jaberg/python-benchmarks-pyopencl
from pybench_pyopencl import pairwise_pyopencl

PROFILING = 0

ctx = cl.create_some_context()
if PROFILING:
queue = cl.CommandQueue(
ctx,
properties=cl.command_queue_properties.PROFILING_ENABLE)
else:
queue = cl.CommandQueue(ctx)

_cache = {}

def pairwise_pyopencl_cpu_prepare(shp, dtype):
N, D = shp
ctype = {
'float32': 'float',
'float64': 'double',
}[str(dtype)]

odd_d = "" if 0 == D % 2 else """
__global %(ctype)s * a1 = (__global %(ctype)s*) (a);
%(ctype)s diff = a1[(n0 + 1) * %(D)s - 1] - a1[(m0 + 1) * %(D)s - 1];
buf.s0 += diff * diff;
"""

prg = cl.Program(ctx, """
__kernel void lower(__global %(ctype)s2 *a, __global %(ctype)s *c)
{
for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
{
for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
{
if (n0 < m0) continue;
__global %(ctype)s2 *an = a + n0 * %(D)s / 2;
__global %(ctype)s2 *am = a + m0 * %(D)s / 2;
%(ctype)s2 buf = 0;
for (int d = 0; d < %(D)s/2; ++d)
{
%(ctype)s2 diff = am[d] - an[d];
buf += diff * diff;
}
%(odd_d)s;
c[m0 * %(N)s + n0] = sqrt(buf.s0 + buf.s1);
}
}
}
__kernel void upper(__global %(ctype)s *a, __global %(ctype)s *c)
{
for(int n0 = get_global_id(0); n0 < %(N)s; n0 += get_global_size(0))
{
for(int m0 = get_global_id(1); m0 < %(N)s; m0 += get_global_size(1))
{
if (n0 >= m0) continue;
c[m0 * %(N)s + n0] = c[n0 * %(N)s + m0];
}
}
}
""" % locals()).build()

return prg.lower, prg.upper


comptimes = []
def pairwise_pyopencl_cpu(data):
data = np.asarray(data, order='C')
N, D = data.shape
try:
lower, upper = _cache[(data.shape, data.dtype)]
except:
lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype)
_cache[(data.shape, data.dtype)] = lower, upper
data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data)
dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize)
try:
rval, _ = cl.enqueue_map_buffer(queue, dest_buf, cl.map_flags.READ,
offset=0, shape=(N, N), dtype=data.dtype)
need_copy = False
except TypeError: #OSX's OCL needs this?
rval = np.empty((N, N), dtype=data.dtype)
need_copy = True
lower(queue, (N, 1), (1, 1), data_buf, dest_buf)
upper(queue, (4, 4), (1, 1), data_buf, dest_buf)
if need_copy:
cl.enqueue_copy(queue, rval, dest_buf)
else:
queue.finish()
if PROFILING:
comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
print 'computation time', min(comptimes)
return rval

M, K = data.shape
out = np.empty((M, M), dtype=data.dtype, order='C')
return pairwise_pyopencl.pairwise_pyopencl_cpu(data, data.T, out)

benchmarks = (
pairwise_pyopencl_cpu,
pairwise_pyopencl_cpu,
)

7 changes: 0 additions & 7 deletions pairwise/pairwise_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,8 @@ def pairwise_python_broadcast_numpy(data):
return np.sqrt(((data[:, None, :] - data) ** 2).sum(axis=2))


def pairwise_python_numpy_dot(data):
X_norm_2 = (data ** 2).sum(axis=1)
dists = np.sqrt(2 * X_norm_2 - np.dot(data, data.T))
return dists


benchmarks = (
pairwise_python_nested_for_loops,
pairwise_python_inner_numpy,
pairwise_python_broadcast_numpy,
pairwise_python_numpy_dot,
)
19 changes: 4 additions & 15 deletions pairwise/pairwise_theano.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,10 @@ def pairwise_theano_tensor_prepare(dtype):
return rval


def pairwise_theano_blas_prepare(dtype):
X = TT.matrix(dtype=str(dtype))
X_norm_2 = (X ** 2).sum(axis=1)
dists = TT.sqrt(2 * X_norm_2 - TT.dot(X, X.T))
name = 'pairwise_theano_blas_' + dtype
rval = theano.function([X],
theano.Out(dists, borrow=True),
allow_input_downcast=True, name=name)
rval.__name__ = name
return rval


benchmarks = (
pairwise_theano_tensor_prepare('float32'),
# -- disabling float32 to match the precision of the other
# implementations (assuming that the benchmark problem is
# to carry out computations in double precision).
# pairwise_theano_tensor_prepare('float32'),
pairwise_theano_tensor_prepare('float64'),
pairwise_theano_blas_prepare('float32'),
pairwise_theano_blas_prepare('float64'),
)
1 change: 1 addition & 0 deletions run_benchmarks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from collections import OrderedDict
except:
from ordereddict import OrderedDict
import argparse
import json
import os
import traceback
Expand Down