[PyOpenCL] Interfacing PyOpenCL to clAmdBlas in Cython

Lars.Ericson at wellsfargo.com Lars.Ericson at wellsfargo.com
Wed Jun 17 10:00:42 EDT 2015


Hi,

I wrote a program which uses pyOpenCL to call clAmdBlas DGEMM. This example is a recoding of example_sgemm.c in the clAmdBlas distribution, changed to use Python, Cython, NumPy, pyOpenCL and DGEMM. The reason for doing this is to show how to access fast, host-tunable matrix multiplication while still being able to mix in other pyOpenCL calls, for example for simple addition of matrices and for random number generation. I followed this route for two reasons:


1.       It is hard to find self-contained OpenCL kernels on the Internet for fast matrix multiplication for matrices of any dimension.  Most examples assume a dimension which happens to fit the gadget.   More pragmatic matrix multiplication codes use a C++ program to generate the kernel on the fly based on the dimensions.  This is what clAmdBlas does.  I didn't find any Python codes that replicate this.

2.       I couldn't figure out by reading the pyOpenCL documentation how to multiply two matrices.  I guessed I could do it with Reikna, but I wasn't convinced the result would be dimension-optimised.  I looked in the code for matrix multiplication in the pyOpenCL distribution and it looked at first glance like an element-wise multiply.

I am sharing this with the PyOpenCL mailing list in case PyOpenCL folks want to incorporate wrappers for clAmdBlas and clRNG (references below).  Here is the Cython header for clAmdBlasDGEMM:

# Cython declarations exposing the minimal subset of the OpenCL C API and
# clAmdBlas needed to call clAmdBlasDgemm from a pyOpenCL-hosted program.
# NOTE: values written after "=" inside a "cdef extern" block are hints for
# Cython only; the authoritative values come from clAmdBlas.h / CL/cl.h at
# C compile time.
from libc.stdint cimport intptr_t, uintptr_t

cdef extern from "clAmdBlas.h":
    # OpenCL error codes referenced by the clAmdBlasStatus enum below.
    enum:
        CL_SUCCESS                      = 0
        CL_INVALID_VALUE                = -30
        CL_INVALID_COMMAND_QUEUE        = -36
        CL_INVALID_CONTEXT              = -34
        CL_INVALID_MEM_OBJECT           = -38
        CL_INVALID_DEVICE               = -33
        CL_INVALID_EVENT_WAIT_LIST      = -57
        CL_OUT_OF_RESOURCES             = -5
        CL_OUT_OF_HOST_MEMORY           = -6
        CL_INVALID_OPERATION            = -59
        CL_COMPILER_NOT_AVAILABLE       = -3
        CL_BUILD_PROGRAM_FAILURE        = -11

    # Status codes returned by clAmdBlas entry points.  The first group
    # aliases plain OpenCL errors; the clAmdBlas-specific extensions start
    # at -1024.  Members without an explicit value take their actual values
    # from the C header, not from Cython's auto-increment.
    enum clAmdBlasStatus:
        clAmdBlasSuccess               = CL_SUCCESS
        clAmdBlasInvalidValue          = CL_INVALID_VALUE
        clAmdBlasInvalidCommandQueue   = CL_INVALID_COMMAND_QUEUE
        clAmdBlasInvalidContext        = CL_INVALID_CONTEXT
        clAmdBlasInvalidMemObject      = CL_INVALID_MEM_OBJECT
        clAmdBlasInvalidDevice         = CL_INVALID_DEVICE
        clAmdBlasInvalidEventWaitList  = CL_INVALID_EVENT_WAIT_LIST
        clAmdBlasOutOfResources        = CL_OUT_OF_RESOURCES
        clAmdBlasOutOfHostMemory       = CL_OUT_OF_HOST_MEMORY
        clAmdBlasInvalidOperation      = CL_INVALID_OPERATION
        clAmdBlasCompilerNotAvailable  = CL_COMPILER_NOT_AVAILABLE
        clAmdBlasBuildProgramFailure   = CL_BUILD_PROGRAM_FAILURE
        clAmdBlasNotImplemented        = -1024
        clAmdBlasNotInitialized        = -1023
        clAmdBlasInvalidMatA
        clAmdBlasInvalidMatB
        clAmdBlasInvalidMatC
        clAmdBlasInvalidVecX
        clAmdBlasInvalidVecY
        clAmdBlasInvalidDim
        clAmdBlasInvalidLeadDimA
        clAmdBlasInvalidLeadDimB
        clAmdBlasInvalidLeadDimC
        clAmdBlasInvalidIncX
        clAmdBlasInvalidIncY
        clAmdBlasInsufficientMemMatA
        clAmdBlasInsufficientMemMatB
        clAmdBlasInsufficientMemMatC
        clAmdBlasInsufficientMemVecX
        clAmdBlasInsufficientMemVecY

    # Matrix storage order (C-contiguous NumPy arrays are row-major).
    enum clAmdBlasOrder:
        clAmdBlasRowMajor             = 0
        clAmdBlasColumnMajor          = 1

    # Transpose options for the A and B operands of GEMM.
    enum clAmdBlasTranspose:
        clAmdBlasNoTrans             = 0
        clAmdBlasTrans               = 1
        clAmdBlasConjTrans           = 2

    # Basic OpenCL scalar and opaque-handle typedefs (mirroring CL/cl.h;
    # the handle types are opaque pointers on the host side).
    ctypedef unsigned int cl_uint
    ctypedef signed int cl_int
    ctypedef float cl_float
    ctypedef double cl_double
    ctypedef void* cl_mem
    ctypedef void* cl_command_queue
    ctypedef void* cl_event
    ctypedef void* cl_platform_id
    ctypedef void* cl_device_id
    ctypedef void* cl_context

    ctypedef unsigned long cl_ulong
    ctypedef cl_ulong cl_bitfield
    ctypedef cl_bitfield cl_device_type

    cl_device_type CL_DEVICE_TYPE_GPU  = (1 << 2)

    ctypedef cl_bitfield cl_command_queue_properties

    ctypedef cl_uint cl_bool
    cl_bool CL_FALSE = 0
    cl_bool CL_TRUE = 1

    ctypedef cl_bitfield cl_mem_flags
    cl_mem_flags CL_MEM_READ_ONLY = (1 << 2)
    cl_mem_flags CL_MEM_READ_WRITE = (1 << 0)

    # Platform/device discovery (unused when pyOpenCL supplies the handles,
    # but declared for completeness).
    cl_int clGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms)
    cl_int clGetDeviceIDs(cl_platform_id platform, cl_device_type device_type,
                          cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices)

    ctypedef intptr_t cl_context_properties
    cl_context_properties CL_CONTEXT_PLATFORM = 0x1084

    # Context/queue creation and teardown.
    cl_context clCreateContext(cl_context_properties *properties, cl_uint ndevices, cl_device_id *devices,
                               void *pfn_notify, void *user_data, cl_int *errcode_ret)
    cl_command_queue clCreateCommandQueue(cl_context context, cl_device_id device,
                                          cl_command_queue_properties properties, cl_int *errcode_ret)

    # Device <-> host buffer transfers.
    cl_int clEnqueueReadBuffer(cl_command_queue    command_queue,
                               cl_mem              buffer,
                               cl_bool             blocking_read,
                               size_t              offset,
                               size_t              size,
                               void *              ptr,
                               cl_uint             num_events_in_wait_list,
                               const cl_event *    event_wait_list,
                               cl_event *          event)

    cl_int clEnqueueWriteBuffer(cl_command_queue   command_queue,
                                cl_mem             buffer,
                                cl_bool            blocking_write,
                                size_t             offset,
                                size_t             size,
                                const void *       ptr,
                                cl_uint            num_events_in_wait_list,
                                const cl_event *   event_wait_list,
                                cl_event *         event)

    cl_int clReleaseContext(cl_context context)
    cl_int clReleaseCommandQueue(cl_command_queue command_queue)
    cl_mem clCreateBuffer(cl_context   context,
                                  cl_mem_flags flags,
                          size_t  size,
                          void *   host_ptr,
                          cl_int * errcode_ret)
    cl_int clWaitForEvents(cl_uint num_events, const cl_event * event_list)
    cl_int clReleaseMemObject(cl_mem memobj)

    # clAmdBlas library lifecycle: Setup must succeed before any BLAS call;
    # Teardown releases library-internal resources.
    clAmdBlasStatus clAmdBlasSetup( )
    void clAmdBlasTeardown( )
    # Double-precision GEMM: C = alpha*op(A)*op(B) + beta*C, enqueued on the
    # given command queue(s); completion is signalled via *events.
    clAmdBlasStatus clAmdBlasDgemm(clAmdBlasOrder order,
                                   clAmdBlasTranspose transA,
                                   clAmdBlasTranspose transB,
                                   size_t M,
                                   size_t N,
                                   size_t K,
                                   cl_double alpha,
                                   const cl_mem A,
                                   size_t lda,
                                   const cl_mem B,
                                   size_t ldb,
                                   cl_double beta,
                                   cl_mem C,
                                   size_t ldc,
                                   cl_uint numCommandQueues,
                                   cl_command_queue *commandQueues,
                                   cl_uint numEventsInWaitList,
                                   const cl_event *eventWaitList,
                                   cl_event *events)

Here is the main program:

import sys
from clAmdBlas cimport *
import ctypes
import numpy as np
cimport numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

cdef class GPU:
    # Owns the OpenCL platform/device/context/queue for the example.
    # pyOpenCL objects (py_*) manage the lifetimes; the cdef members hold
    # the same objects' raw OpenCL handles (obtained via .int_ptr) so they
    # can be passed to clAmdBlas.  Deallocation of the OpenCL objects is
    # left entirely to pyOpenCL — only clAmdBlas is torn down here.
    cdef cl_platform_id platform
    cdef cl_device_id device
    cdef cl_event event          # filled in by clAmdBlasDgemm's *events out-param
    cdef cl_command_queue queue
    cdef cl_context ctx
    cdef cl_int err
    cdef public py_platform, py_device, py_context, py_queue

    def __dealloc__(self):
        # Release clAmdBlas library state; pyOpenCL frees the CL objects.
        clAmdBlasTeardown()
        print "clean up and shut down gadget"

    def __cinit__(self):
        self.event = NULL
        # NOTE(review): props is initialized but never passed anywhere —
        # the context comes from pyOpenCL below; this looks like dead code.
        cdef cl_context_properties props[3]
        props[:] = [CL_CONTEXT_PLATFORM, 0, 0]

        # First platform / first device, wrapped by pyOpenCL; .int_ptr
        # yields the underlying OpenCL handle as an integer, which is then
        # cast to the corresponding opaque pointer type.
        self.py_platform = cl.get_platforms()[0]
        cdef intptr_t platform_p = self.py_platform.int_ptr
        self.platform = <cl_platform_id>platform_p

        self.py_device = self.py_platform.get_devices()[0]
        cdef intptr_t device_p = self.py_device.int_ptr
        self.device = <cl_device_id>device_p

        self.py_context = cl.Context(devices=[self.py_device])
        cdef intptr_t context_p = self.py_context.int_ptr
        self.ctx = <cl_context>context_p

        self.py_queue = cl.CommandQueue(self.py_context, device=self.py_device)
        cdef intptr_t queue_p = <intptr_t>self.py_queue.int_ptr
        self.queue = <cl_command_queue>queue_p

        # clAmdBlas must be initialized before any BLAS call.
        self.err = clAmdBlasSetup()
        if self.err != CL_SUCCESS:
            raise ValueError("clAmdBlasSetup() failed with %d" % self.err)
        else:
            print "Blas setup"

DTYPE = np.float64               # host-side element type for every matrix
ctypedef np.float64_t DTYPE_t    # compile-time counterpart of DTYPE

def dgemm_test():
    """Run a double-precision GEMM, C = alpha*A*B + beta*C, on the GPU.

    A is 4x5, B is 5x3, C is 4x3, all row-major and flattened.  pyOpenCL
    owns the context/queue/device buffers; their raw cl_mem handles
    (via .int_ptr) are handed to clAmdBlasDgemm.

    Returns the M*N result as a flat NumPy float64 array on success, or an
    error-message string if any OpenCL/clAmdBlas call fails.
    """
    gpu = GPU()

    # Problem size: A is M x K, B is K x N, C is M x N.
    cdef size_t M = 4
    cdef size_t N = 3
    cdef size_t K = 5

    # A: 4x5, row-major.
    a = np.ascontiguousarray([11,12,13,14,15,21,22,23,24,25,31,32,33,34,35,41,42,43,44,45], dtype=DTYPE)
    a_d = cl_array.to_device(gpu.py_queue, a)
    cdef intptr_t a_d_p = a_d.data.int_ptr   # raw cl_mem handle of the pyopencl Buffer
    cdef cl_mem bufA = <cl_mem> a_d_p
    print "A"

    # B: 5x3, row-major.
    b = np.ascontiguousarray([11,12,13,21,22,23,31,32,33,41,42,43,51,52,53], dtype=DTYPE)
    b_d = cl_array.to_device(gpu.py_queue, b)
    cdef intptr_t b_d_p = b_d.data.int_ptr
    cdef cl_mem bufB = <cl_mem> b_d_p
    print "B"

    # C: 4x3 initial contents — DGEMM folds beta*C into the result.
    cdef size_t C_size = M*N*sizeof(cl_double)
    c = np.ascontiguousarray([11,12,13,21,22,23,31,32,33,41,42,43], dtype=DTYPE)
    c_d = cl_array.to_device(gpu.py_queue, c)
    cdef intptr_t c_d_p = c_d.data.int_ptr
    cdef cl_mem bufC = <cl_mem> c_d_p
    print "C"

    # Host-side destination for reading C back; the typed memoryview gives
    # a raw cl_double* for clEnqueueReadBuffer.
    cdef size_t result_size = C_size
    cdef np.ndarray[DTYPE_t, ndim=1] r = np.zeros(M*N, dtype=DTYPE)
    cdef DTYPE_t[:] r_view = r
    cdef cl_double *result = &r_view[0]
    print "r"

    cdef cl_double alpha = 10
    cdef cl_double beta = 20
    # Row-major, no transposes; leading dimensions are the row widths:
    # lda = K, ldb = N, ldc = N.
    err = clAmdBlasDgemm(clAmdBlasRowMajor,clAmdBlasNoTrans,clAmdBlasNoTrans,M,N,K,alpha,bufA,K,bufB,N,beta,bufC,N,1,&gpu.queue,0,NULL,&gpu.event)
    if err != CL_SUCCESS:
        return "clAmdBlasDgemm() failed with %d" % err
    else:
        print "DGEMMed"
    # BUGFIX: these two return codes were previously assigned but never
    # checked — a failed wait or readback would silently return zeros.
    err = clWaitForEvents(1, &gpu.event)
    if err != CL_SUCCESS:
        return "clWaitForEvents() failed with %d" % err
    err = clEnqueueReadBuffer(gpu.queue, bufC, CL_TRUE, 0, result_size, result, 0, NULL, NULL)
    if err != CL_SUCCESS:
        return "clEnqueueReadBuffer() failed with %d" % err

    return r

This then gets compiled to a Python module by running `python example_dgemm_setup.py build_ext --inplace` with this build definition file example_dgemm_setup.py:

from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np

# Build definition for the example_dgemm Cython module: compile the .pyx
# against the AMD APP SDK / clAmdBlas headers and link the vendor DLLs.
dgemm_ext = Extension(
    name='example_dgemm',
    sources=['example_dgemm.pyx'],
    include_dirs=[
        "C:\\Program Files (x86)\\AMD APP SDK\\2.9-1\\include",
        "C:\\Program Files (x86)\\AMD\\clAmdBlas\\include",
        np.get_include(),   # NumPy C headers for "cimport numpy"
    ],
    library_dirs=[
        "C:\\Program Files (x86)\\AMD APP SDK\\2.9-1\\bin\\x86_64",
        "c:\\Program Files (x86)\\AMD\\clAmdBlas\\bin64",
    ],
    libraries=['clAmdBlas', 'OpenCL'],
)

setup(
    ext_modules=cythonize([dgemm_ext])
)

This can then be imported into a Python script like this one cytest.py:

# cytest.py: driver script — imports the compiled extension module and
# prints the DGEMM result next to hand-computed expected values.
import example_dgemm
r = example_dgemm.dgemm_test()
# Expected C = 10*A*B + 20*C for the fixed matrices inside dgemm_test().
print "expecting 21370 22040 22710 37070 38240 39410 52770 54440 56110 68470 70640 72810"
print r

This can then be run with "python cytest.py", giving this output:

python cytest.py
Blas setup
A
B
C
r
DGEMMed
clean up and shut down gadget
expecting 21370 22040 22710 37070 38240 39410 52770 54440 56110 68470 70640 72810
[ 21370.  22040.  22710.  37070.  38240.  39410.  52770.  54440.  56110.
  68470.  70640.  72810.]

I was helped by looking at an initial wrapper from Kent Knox in the clMath distribution, plus a couple of other clues mentioned in references.  Note that clAmdBlasTune.exe is undocumented but mentioned here and there on the Internet.  The code above isn't long, but involves NumPy, OpenCL, pyOpenCL, Cython and MemoryViews, ctypes and libc.stdint.intptr_t, and about 10 cups of coffee.

References:


1.       NumPy and Ctypes

a.       https://sage.math.washington.edu:8091/hudson/job/cython-docs/doclinks/1/src/userguide/memoryviews.html

b.      https://docs.python.org/2/library/ctypes.html

c.       http://docs.scipy.org/doc/numpy/user/c-info.python-as-glue.html

2.       Cython

a.       http://cython.org/

b.      http://docs.cython.org/src/userguide/extension_types.html

3.       DGEMM and clMath and clAmdBlas

a.       http://www.math.utah.edu/software/lapack/lapack-blas/dgemm.html

b.      http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-math-libraries/

c.       C:\Program Files (x86)\AMD APP SDK\2.9-1\include\CL\cl.h

d.      c:/Program Files (x86)/AMD/clAmdBlas/samples/example_sgemm.c

e.      c:/Program Files (x86)/AMD/clAmdBlas/bin64/clAmdBlas.dll

f.        c:/Program Files (x86)/AMD/clAmdBlas/bin64/clAmdBlasTune.exe

g.       C:\Program Files (x86)\AMD\clAmdBlas\include\clAmdBlas.h

h.      https://github.com/clMathLibraries/clBLAS/tree/master/src/wrappers/python (seems correct but complete example not provided)

i.         http://lists.tiker.net/pipermail/pyopencl/2014-February/001690.html (incomplete attempt, coredumps maybe due to mixing raw OpenCL and pyOpenCL deallocation)

j.        http://stackoverflow.com/questions/13396443/pyopencl-difference-between-to-device-and-buffer

k.       http://diffusionht.blogspot.com/2013/08/clamdblastune-ati-radeon-7970-linux.html

4.    clRNG (similarly, would be nice to have in pyOpenCL) http://developer.amd.com/community/blog/2015/05/05/amd-releases-clrng-an-opencl-random-number-library/



Thanks,

Lars Ericson

Quantitative Analytics Consultant
Market & Institutional Risk Management

Wells Fargo Bank, N.A. | 301 S. College St., 4th Floor | Charlotte, NC 28202-6000
MAC D1053-04X
Tel  704-410-2219 | Cell 917-891-1639

lars.ericson at wellsfargo.com<mailto:lars.ericson at wellsfargo.com>

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.tiker.net/pipermail/pyopencl/attachments/20150617/129a3f48/attachment-0001.html>


More information about the PyOpenCL mailing list