Alright, I kind of brute-forced it. I'm not sure if it was highlighted in my OP (I did try to keep it hidden because I just wanted theoretical discussion of the algorithm) but this code is for a GPU so it's alright to use a butt-load of threads.
I used a fairly niche nVidia hardware feature, but I hope the code makes clear what I eventually settled on doing:
Code:
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
// Pair classification: white = not yet decided (the initial state),
// red = accepted as unique, blue = rejected because an endpoint collides
// with an already-accepted (red) pair.
enum color { white, blue, red };
// A pair of endpoint indices, constructible in both host and device code.
struct pair
{
int first, second;
__host__ __device__
pair(int a, int b)
{
first = a;
second = b;
}
};
// Argument bundle for the parent kernel (kernel arguments are passed by
// value, so the device pointers are packed into one struct).
struct args
{
pair *pr; // the pairs being classified
int *color; // color table, one entry per pair
bool *tbl; // hash table: endpoint value -> used by a red pair already?
int *anchor; // min white index for the next round; INT_MAX when none left
};
// One thread per pair (launched <<<1, 6>>> by `parent`). A thread whose
// pair is still white either marks it blue (an endpoint is already in the
// hash table) or bids for the next anchor with atomicMin.
__global__
void child(pair *pr, int *color, bool *tbl, int *anchor)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
// Guard the tail and skip pairs already classified in earlier rounds.
if (tid >= 6)
return;
if (color[tid] == blue || color[tid] == red)
return;
// An endpoint collides with the current red set: reject this pair.
if (tbl[pr[tid].first] || tbl[pr[tid].second])
{
//printf("%d, %d\n", pr[tid].first, pr[tid].second);
color[tid] = blue;
return;
}
// Still white and collision-free: candidate for the next anchor.
atomicMin(anchor, tid);
}
// Parent GPU routine: greedy selection loop run by a single GPU thread
// (launched <<<1, 1>>> from the host). Each round commits the current
// anchor pair as red, records its endpoints in the hash table, resets the
// anchor to INT_MAX, then launches `child` so every still-white pair
// either turns blue (collision) or bids for the next anchor via
// atomicMin. The loop ends when no white pair claimed the anchor.
// Uses CUDA Dynamic Parallelism (device-side kernel launch), which only
// certain architectures/toolchains support (-rdc=true required).
// NOTE(review): device-side cudaDeviceSynchronize() is deprecated since
// CUDA 11.6 and removed in CUDA 12 — confirm the target toolkit still
// supports this pattern.
__global__
void parent(args data)
{
pair *pr = data.pr;
int *color = data.color;
bool *tbl = data.tbl;
int *anchor = data.anchor;
do
{
// set hash table values: both endpoints of the anchored pair are now used
tbl[pr[*anchor].first] = 1;
tbl[pr[*anchor].second] = 1;
color[*anchor] = red; // red = unique pair
*anchor = INT_MAX; // for atomicMin()
// launch the child routine
// this is only possible on certain architectures
child<<<1, 6>>>(pr, color, tbl, anchor);
cudaDeviceSynchronize();
//printf("New anchor %d\n", *anchor);
} while (*anchor != INT_MAX);
// report every pair that survived as unique
for (int i = 0; i < 6; ++i)
if (color[i] == red)
printf("%d, %d\n", pr[i].first, pr[i].second);
}
// Abort with a diagnostic if a CUDA runtime call failed. Kernel launches
// report configuration errors through cudaGetLastError(), and in-kernel
// faults surface at the next synchronizing call, so both are checked too.
static void check(cudaError_t err, const char *what)
{
if (err != cudaSuccess)
{
fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
exit(1);
}
}

// Host driver: allocates managed buffers, builds the 6-cycle edge list,
// launches the single-thread parent kernel (which fans out parallelism
// itself via dynamic parallelism), and frees everything.
int main(void)
{
const int n = 6; // number of pairs; endpoint values also span 0..5

pair *data = 0;
int *color = 0;
bool *tbl = 0;
int *anchor = 0;
check(cudaMallocManaged(&data, n * sizeof(*data)), "alloc pairs");
check(cudaMallocManaged(&color, n * sizeof(*color)), "alloc colors");
check(cudaMallocManaged(&tbl, n * sizeof(*tbl)), "alloc hash table");
check(cudaMallocManaged(&anchor, sizeof(*anchor)), "alloc anchor");

// Edge list of the 6-cycle 0-1-2-3-4-5-0.
const int edges[6][2] = {
{ 0, 1 }, { 0, 5 }, { 1, 2 }, { 2, 3 }, { 3, 4 }, { 4, 5 }
};
for (int i = 0; i < n; ++i)
{
data[i].first = edges[i][0];
data[i].second = edges[i][1];
}

// Everything starts out white, the hash table empty, and the anchor at
// index 0 (the first pair is always accepted).
for (int i = 0; i < n; ++i)
{
color[i] = white;
tbl[i] = false;
}
*anchor = 0;

// build argument data structure
args dta;
dta.pr = data;
dta.tbl = tbl;
dta.color = color;
dta.anchor = anchor;

// One block with one thread: parent runs the sequential greedy loop and
// launches the parallel child kernel each round.
parent<<<1, 1>>>(dta);
check(cudaGetLastError(), "parent launch");
check(cudaDeviceSynchronize(), "parent execution");

// clean-up
check(cudaFree(anchor), "free anchor");
check(cudaFree(tbl), "free hash table");
check(cudaFree(color), "free colors");
check(cudaFree(data), "free pairs");
return 0;
}
Basically, we color all unique pairs red and all non-unique ones blue. A pair is colored red when the next anchor lands on it — the anchor being the minimum index among pairs that are not yet red or blue, i.e. pairs that are still white.