So, as the title says, I'm trying to eliminate false sharing, or, eliminate sharing writes between threads with TBB. The question is how.

Normally I'd make an array whose size is equal to the number of threads, then locally write to a local variable and update the array only at the end of the thread.

But, of course, I cannot seem to get either thread id or total number of threads TBB uses. I found a reference to tbb::enumerable_thread_specific, which as I understand, is supposed to work for exactly this. But as soon as I added it, it hurt performance by ~60% instead of making it better.

Code is below. Anyone more knowledgeable about TBB knows how to do this properly? You don't really need to look so hard on how the algorithm works (I don't know either). I know it's not quite right right now due to race conditions, but I'll fix that later. I used a reference implementation that I copied™, and my task is to parallelize it.

Oh, and the parts where the problem is right now is in red (of course, it's not all problems; it's only a subset of them).

Code:
#include <iostream>
#include <vector>
#include <fstream>
#include <ctime>
#include <algorithm>
#include <ppl.h>
#include <tbb/tbb.h>
#undef max

void usage(const char* app_name)
{
	std::cout << "Argument error! Usage: C++ AMP.exe <input file>\n";
	exit(0);
}

void clear(std::vector<int>& vec)
{
	std::fill(vec.begin(), vec.end(), 0);
}

int main(int argc, char** argv)
{
	if (argc != 2)
		usage(argv[0]);

	// Open files
	std::ifstream InputFile(argv[1]);
	if (!InputFile)
		usage(argv[0]);

	// Read the matrix
	int dim = 0;
	InputFile >> dim;
	std::vector<std::vector<int>> mat(dim, std::vector<int>(dim));

	for (decltype(dim) i = 0; i < dim; i++)
	for (decltype(dim) j = 0; j < dim; j++)
		InputFile >> mat[i][j];

	auto TimeStart = std::clock();

	std::vector<std::vector<int>> ps(dim, std::vector<int>(dim));

	// Compute vertical prefix sum
	//for (int j = 0; j < dim; j++)
	tbb::parallel_for(0, dim, [&mat, &ps, dim](int j)
	{
		ps[0][j] = mat[0][j];
		tbb::parallel_for(1, dim, [&mat, &ps, dim, j](int i)
		{
			ps[i][j] = ps[i - 1][j] + mat[i][j];
		});
		//for (int i = 1; i < dim; i++)
	});

	//Auxilliary variables 
	int max_sum = mat[0][0];
	int top = 0, left = 0, bottom = 0, right = 0;

	//tbb::parallel_for(0, dim, [&, dim](int i)
	for (int i = 0; i < dim; i++)
	{
		using thread_array_t = tbb::enumerable_thread_specific<int>;

		//thread_array_t local_max_sum = mat[0][0];
		//thread_array_t local_top = 0, local_left = 0, local_bottom = 0, local_right = 0;

		tbb::parallel_for(i, dim, [&, dim](int k)
		//for (int k = i; k < dim; k++)
		{
			// Kandane over all columns with the i..k rows
			//clear(sum);
			//clear(pos);
			std::vector<int> sum(dim);
			std::vector<int> pos(dim);
			int local_max = 0;

			// We keep track of the position of the max value over each Kandane's execution
			// Notice that we do not keep track of the max value, but only its position
			sum[0] = ps[k][0] - (i == 0 ? 0 : ps[i - 1][0]);

			const decltype(sum)& const_sum(sum);
			const decltype(pos)& const_pos(pos);
			const decltype(ps)& const_ps(ps);
			thread_array_t local_max_array = 0;

			tbb::parallel_for(1, dim, [&, dim](int j)
			//for (int j = 1; j<dim; j++)
			{
				int local_max_tmp = 0;
				if (sum[j - 1] > 0)
				{
					sum[j] = const_sum[j - 1] + const_ps[k][j] - (i == 0 ? 0 : const_ps[i - 1][j]);
					pos[j] = const_pos[j - 1];
				}
				else
				{
					sum[j] = const_ps[k][j] - (i == 0 ? 0 : const_ps[i - 1][j]);
					pos[j] = j;
				}
				if (const_sum[j] > const_sum[local_max])
					local_max_array.local() = j;
			}); //Kandane ends here

			for (const auto local_max_tmp : local_max_array)
				local_max = std::max(local_max, local_max_tmp);

			if (sum[local_max] > max_sum)
			{
				// sum[local_max] is the new max value
				// the corresponding submatrix goes from rows i..k.
				// and from columns pos[local_max]..local_max

				max_sum = sum[local_max];
				top = i;
				left = pos[local_max];
				bottom = k;
				right = local_max;
			}
		});
	}

	// Compose the output matrix
	int outmat_row_dim = bottom - top + 1;
	int outmat_col_dim = right - left + 1;
	std::vector<std::vector<int>> outmat(outmat_row_dim, std::vector<int>(outmat_col_dim));
	for (int i = top, k = 0; i <= bottom; i++, k++)
	for (int j = left, l = 0; j <= right; j++, l++)
		outmat[k][l] = mat[i][j];

	auto TimeEnd = std::clock();
	std::cout << "Took " << (TimeEnd - TimeStart) / (CLOCKS_PER_SEC / 1000) << " ms.\n";
}