#ifndef B3_RADIXSORT32_H #define B3_RADIXSORT32_H #include "b3OpenCLArray.h" struct b3SortData { union { unsigned int m_key; unsigned int x; }; union { unsigned int m_value; unsigned int y; }; }; #include "b3BufferInfoCL.h" class b3RadixSort32CL { b3OpenCLArray* m_workBuffer1; b3OpenCLArray* m_workBuffer2; b3OpenCLArray* m_workBuffer3; b3OpenCLArray* m_workBuffer4; b3OpenCLArray* m_workBuffer3a; b3OpenCLArray* m_workBuffer4a; cl_command_queue m_commandQueue; cl_kernel m_streamCountSortDataKernel; cl_kernel m_streamCountKernel; cl_kernel m_prefixScanKernel; cl_kernel m_sortAndScatterSortDataKernel; cl_kernel m_sortAndScatterKernel; bool m_deviceCPU; class b3PrefixScanCL* m_scan; class b3FillCL* m_fill; public: struct b3ConstData { int m_n; int m_nWGs; int m_startBit; int m_nBlocksPerWG; }; enum { DATA_ALIGNMENT = 256, WG_SIZE = 64, BLOCK_SIZE = 256, ELEMENTS_PER_WORK_ITEM = (BLOCK_SIZE / WG_SIZE), BITS_PER_PASS = 4, NUM_BUCKET = (1 << BITS_PER_PASS), // if you change this, change nPerWI in kernel as well NUM_WGS = 20 * 6, // cypress // NUM_WGS = 24*6, // cayman // NUM_WGS = 32*4, // nv }; private: public: b3RadixSort32CL(cl_context ctx, cl_device_id device, cl_command_queue queue, int initialCapacity = 0); virtual ~b3RadixSort32CL(); void execute(b3OpenCLArray& keysIn, b3OpenCLArray& keysOut, b3OpenCLArray& valuesIn, b3OpenCLArray& valuesOut, int n, int sortBits = 32); ///keys only void execute(b3OpenCLArray& keysInOut, int sortBits = 32); void execute(b3OpenCLArray& keyValuesInOut, int sortBits = 32); void executeHost(b3OpenCLArray& keyValuesInOut, int sortBits = 32); void executeHost(b3AlignedObjectArray& keyValuesInOut, int sortBits = 32); }; #endif //B3_RADIXSORT32_H