Hey,

Background:
I've built a matrix multiplier in C based on the Fortran DGEMM function.
It works great; the only remaining modification I need to make is to add OpenMP pragmas so that it can run on multiple threads, thereby maximising the potential of the CPU. OpenMP info here.

Does anybody know where (and preferably why) I would add the OpenMP pragmas?
Code:
// START MULTIPLYING!
//
// Computes C := alpha*op(A)*op(B) + beta*C, where op(X) is X when the
// matching notA/notB flag is TRUE and X' (transpose) otherwise. Element
// (row, col) of a matrix is addressed as [row + col * ld], with ld being
// *lda, *ldb or *ldc respectively.
//
// OpenMP notes:
//  - Every branch is parallelised over j (the column of C). An iteration of
//    the j loop only ever writes matrixC[i + j * *ldc] for its own j, while
//    matrixA and matrixB are only read, so the iterations are independent.
//    NOTE(review): this assumes *ldc >= *m_rows (columns of C do not
//    overlap) and that matrixC does not alias matrixA/matrixB -- confirm
//    against the caller.
//  - i, l and temp are declared outside this fragment, so inside a parallel
//    region they are shared by default. Only the loop variable of the
//    parallelised loop (j) is made private automatically. They must be
//    listed in private(...), otherwise the threads overwrite each other's
//    counters and accumulator, which produces wrong results.
//  - Do NOT parallelise the l loops: every l iteration accumulates into the
//    same column j of C, so that would be a data race.
if (notB == TRUE)
{
	if (notA == TRUE)
	{
		// C := alpha*A*B + beta*C  (no transposes)
		#pragma omp parallel for private(i, l, temp)
		for (j = 0; j < *n_cols; ++j) // column of C
		{
			// Scale (or clear) column j of C by beta first.
			if (*beta == 0.)
			{
				for (i = 0; i < *m_rows; ++i) // row of C
				{
					matrixC[i + j * *ldc] = 0.;
				}
			}
			else if (*beta != 1.)
			{
				for (i = 0; i < *m_rows; ++i) // row of C
				{
					matrixC[i + j * *ldc] = *beta * matrixC[i + j * *ldc];
				}
			}
			// Accumulate alpha * B(l,j) * A(:,l) into column j of C.
			for (l = 0; l < *k_common; ++l)
			{
				// A zero entry of B contributes nothing; skip the inner loop.
				if (matrixB[l + j * *ldb] != 0.)
				{
					temp = *alpha * matrixB[l + j * *ldb];
					for (i = 0; i < *m_rows; ++i)
					{
						matrixC[i + j * *ldc] += temp * matrixA[i + l * *lda];
					}
				}
			}
		}
	}
	else
	{
		// C := alpha*A'*B + beta*C
		#pragma omp parallel for private(i, l, temp)
		for (j = 0; j < *n_cols; ++j)
		{
			for (i = 0; i < *m_rows; ++i)
			{
				// temp = dot product of column i of A with column j of B.
				temp = 0;
				for (l = 0; l < *k_common; ++l)
				{
					temp += matrixA[l + i * *lda] * matrixB[l + j * *ldb];
				}
				// With beta == 0 the old value of C is never read.
				if (*beta == 0)
				{
					matrixC[i + j * *ldc] = *alpha * temp;
				}
				else
				{
					matrixC[i + j * *ldc] = *alpha * temp + *beta * matrixC[i + j * *ldc];
				}
			}
		}
	}
}
else
{
	if (notA == TRUE)
	{
		// C := alpha*A*B' + beta*C
		#pragma omp parallel for private(i, l, temp)
		for (j = 0; j < *n_cols; ++j)
		{
			// Scale (or clear) column j of C by beta first.
			if (*beta == 0.)
			{
				for (i = 0; i < *m_rows; ++i)
				{
					matrixC[i + j * *ldc] = 0.;
				}
			}
			else if (*beta != 1.)
			{
				for (i = 0; i < *m_rows; ++i)
				{
					matrixC[i + j * *ldc] = *beta * matrixC[i + j * *ldc];
				}
			}
			// Accumulate alpha * B(j,l) * A(:,l) into column j of C.
			for (l = 0; l < *k_common; ++l)
			{
				// A zero entry of B contributes nothing; skip the inner loop.
				if (matrixB[j + l * *ldb] != 0.)
				{
					temp = *alpha * matrixB[j + l * *ldb];
					for (i = 0; i < *m_rows; ++i)
					{
						matrixC[i + j * *ldc] += temp * matrixA[i + l * *lda];
					}
				}
			}
		}
	}
	else
	{
		// C := alpha*A'*B' + beta*C
		#pragma omp parallel for private(i, l, temp)
		for (j = 0; j < *n_cols; ++j)
		{
			for (i = 0; i < *m_rows; ++i)
			{
				// temp = dot product of column i of A with row j of B.
				temp = 0;
				for (l = 0; l < *k_common; ++l)
				{
					temp += matrixA[l + i * *lda] * matrixB[j + l * *ldb];
				}
				// With beta == 0 the old value of C is never read.
				if (*beta == 0.)
				{
					matrixC[i + j * *ldc] = *alpha * temp;
				}
				else
				{
					matrixC[i + j * *ldc] = *alpha * temp + *beta * matrixC[i + j * *ldc];
				}
			}
		}
	}
}
Any time I add the pragmas and set the number of threads to 2, it just seems to slow it down. I don't really care if it makes it faster, but I didn't think it would get worse.

This may not be the best place to post this, so if you know a better place, please let me know. 10 days till it needs to be handed in... I'm screwed.

Thanks,

Colly.