By the way, do you guys want to guess what the NVIDIA Visual Profiler tells me the biggest bottleneck is?
*drumroll*
It's this part!!!!
Code:
template<typename T>
struct tuple_comp
{
__host__ __device__
bool operator()(const thrust::tuple<T, T, T, T, T> t,
const thrust::tuple<T, T, T, T, T> v)
{
return ((unsigned& ) thrust::get<0>(t)) < ((unsigned& ) thrust::get<0>(v));
}
};
// Sort everything so that positives are up front
thrust::sort(thrust::make_zip_iterator(
thrust::make_tuple(thrust::device_ptr<int>(pa),
thrust::device_ptr<int>(ta),
thrust::device_ptr<int>(la),
thrust::device_ptr<int>(fs),
thrust::device_ptr<int>(nm))),
thrust::make_zip_iterator(
thrust::make_tuple(thrust::device_ptr<int>(pa + array_capacity),
thrust::device_ptr<int>(ta + array_capacity),
thrust::device_ptr<int>(la + array_capacity),
thrust::device_ptr<int>(fs + array_capacity),
thrust::device_ptr<int>(nm + array_capacity))),
tuple_comp<int>());