Greetings.

I have tried to debug the code below for many hours and have run out of things to try, so I'm hoping some fresh eyes can spot the issue. The code is a simple iterative solver (Gauss-Seidel method) that decomposes the domain for parallel processing by dividing the rows equally among the processors. Right now it hangs and never seems to reach the iterative portion of the code. The MPI calls are quite simple and are only meant to pass the overlap (ghost) rows up or down to the neighboring ranks. When I added tracer output to find the hang, it stops at "check5." Any thoughts? Thanks much.
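
To make the intent clearer, here is a stripped-down sketch of the per-iteration overlap-row exchange I am aiming for. This is only the pattern, not my actual code (which is below); NY and LOCALROWS are hard-coded placeholders and the array is filled with dummy data, whereas the real program sizes everything from user input.

// Sketch of the intended overlap-row exchange. NY and LOCALROWS are
// hard-coded placeholders; the real program sizes these from user input.
#include <iostream>
#include "mpi.h"

int main(int argc, char *argv[])
{
  const int NY = 8;          // points per row (placeholder)
  const int LOCALROWS = 4;   // interior rows owned by each rank (placeholder)

  MPI_Init(&argc, &argv);
  int rank, p;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &p);

  // Local block: LOCALROWS interior rows plus one ghost row on each side.
  double local[LOCALROWS + 2][NY];
  for (int i = 0; i < LOCALROWS + 2; i++)
    for (int j = 0; j < NY; j++)
      local[i][j] = rank;    // dummy data so the exchange is visible

  MPI_Status status;

  // Pass my last interior row (row LOCALROWS) up to rank+1, where it fills
  // that rank's row-0 ghost row.
  if (rank < p - 1)
    MPI_Send(local[LOCALROWS], NY, MPI_DOUBLE, rank + 1, 0, MPI_COMM_WORLD);
  if (rank > 0)
    MPI_Recv(local[0], NY, MPI_DOUBLE, rank - 1, 0, MPI_COMM_WORLD, &status);

  // Pass my first interior row (row 1) down to rank-1, where it fills that
  // rank's row LOCALROWS+1 ghost row.
  if (rank > 0)
    MPI_Send(local[1], NY, MPI_DOUBLE, rank - 1, 1, MPI_COMM_WORLD);
  if (rank < p - 1)
    MPI_Recv(local[LOCALROWS + 1], NY, MPI_DOUBLE, rank + 1, 1,
             MPI_COMM_WORLD, &status);

  std::cout << "rank " << rank << " ghost rows hold "
            << local[0][0] << " and " << local[LOCALROWS + 1][0] << std::endl;

  MPI_Finalize();
  return 0;
}

In the full program this same pattern sits at the top of the do/while loop, which is where the tracer output stops.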

Code:
#include <cstdlib>
#include <iostream>
#include <iomanip>
#include <cmath>
#include <fstream> 
#include <sstream>
#include <stdio.h>
#include <math.h>
#include "mpi.h"


MPI_Status status;
 
using namespace std; 


//Define constants - Pi and grid dimensions


#define pi_double 3.1415926535897932
#define Lx  2.0E+6
#define Ly  2.0E+6
 
double bodyforce(double yy)
{
  double alpha =1.e-9; 
  double cor = pi_double*yy/Ly;
  double fa = -alpha*sin(cor);
  return fa;
}


int main ( int argc, char *argv[] ){


int i,j,nx,ny,nodes,localstart,localend,localrows,decomp;


  MPI::Init ( argc, argv );


  int myrank = MPI::COMM_WORLD.Get_rank ( );


  int p = MPI::COMM_WORLD.Get_size ( );
  
  if (myrank == 0){
    // nx and ny define the number of points in the x and y directions.
    cout << " Enter desired number of subdivisions in x and y (square grid). " << endl;
    cout << " This number of subdivisions must be divisible by " << endl;
    cout << " the number of processors used in the simulation. " << endl;

    // Read in number of divisions
    cin >> nx;
  }


double wtime;


if ( myrank == 0 ) {
    wtime = MPI::Wtime ( );
}


ny = nx;


nodes = nx;


decomp = nodes/p;


//Allocate space for the global phi array


double phi[nx][ny];


//Number of rows stored on each processor: its share of the domain
//(decomp = nodes/p, which must be an integer) plus two ghost rows

localrows = decomp+2;


//Local array holding this processor's block of rows plus the two ghost rows

double philocal[localrows][ny];


double x[nx];


double y[ny];


double dc,eps=1.0E-7;
const double r=3.0E-6,b=2.25E-11; 


double dx= Lx/(nx-1);
double dy= Ly/(ny-1); 


double coef = dx*dx*dy*dy;


double Ap= -2.0*r*(coef/(dx*dx)+coef/(dy*dy));


double Ae= -r*coef/(dx*dx) - b*coef/(2*dx);
double a1 =   Ae/Ap;


double Aw= -r*coef/(dx*dx) + b*coef/(2*dx);
double a2 =   Aw/Ap;


double An= -r*coef/(dy*dy);
double a3 =   An/Ap;


double As= -r*coef/(dy*dy); 
double a4 =   As/Ap;


double a5 =   coef/Ap;


for(i=0;i<nx;i++) {
      x[i]=dx*i;
} 


for(j=0;j<ny;j++){
      y[j]=dy*j;
} 


for(i=0;i<nx;i++) {
  for(j=0;j<ny;j++) {
        if ((i==0)||(j==0)||(i==nx-1)||(j==ny-1)){ 
          phi[i][j]=0.0;
        } else{
              phi[i][j]=1.5;
        }
  }
}


//Send initial guesses for phi to local phi arrays on separate processors




MPI_Scatter( phi[0], nodes * (decomp), MPI_DOUBLE, philocal[1], nodes * (decomp), MPI_DOUBLE, 0, MPI_COMM_WORLD );




//Establish bounds for evaluation on the local arrays


localstart = 1;
localend = nodes/p;


//The first and last processors hold a physical boundary row instead of a
//ghost row on one side, so their evaluation bounds are adjusted accordingly


if(myrank==0){
    localstart++;
}


if(myrank==(p-1)){
    localend--;
}


double dc2;
double gdc2;
double prev;
int iter=0;
do{


  //These MPI commands send the top row of data for each local array up to
  //the local array directly above it, unless it is the top local array, in
  //which case it does not send data upward.
  cout << " check1" << endl;

  /* Send up unless I'm at the top */
  if (myrank < p - 1) {
    for(j=0;j<ny;j++) {
      cout << " check2 " << endl;
      MPI_Send(&philocal[nodes/p][j], nodes, MPI_DOUBLE, myrank + 1, 0,
               MPI_COMM_WORLD );
    }
  }

  if (myrank > 0) {
    for(j=0;j<ny;j++) {
      cout << " check3 " << endl;
      MPI_Recv(&philocal[0][j], nodes, MPI_DOUBLE, myrank - 1, 0,
               MPI_COMM_WORLD, &status );
    }
  }

  /* Send down unless I'm at the bottom */
  if (myrank > 0) {
    for(j=0;j<ny;j++) {
      cout << " check4 " << endl;
      MPI_Send(&philocal[1][j], nodes, MPI_DOUBLE, myrank - 1, 1,
               MPI_COMM_WORLD );
    }
  }

  if (myrank < p - 1) {
    for(j=0;j<ny;j++) {
      cout << " check5 " << endl;
      MPI_Recv( &philocal[(nodes/p)+1][j], nodes, MPI_DOUBLE, myrank + 1, 1,
                MPI_COMM_WORLD, &status );
    }
  }


  prev = 0;
  dc = 0;
  iter = iter + 1;
  for(i=localstart;i<localend;i++){
    for(j=1;j<nodes-1;j++) {
      prev = philocal[i][j];
      double ay = y[j];
      cout << iter << endl;
      philocal[i][j] = a1*philocal[i+1][j] + a2*philocal[i-1][j]
                     + a3*philocal[i][j+1] + a4*philocal[i][j-1]
                     + a5*bodyforce(ay);
      dc = dc + abs(philocal[i][j] - prev);
    }
  }
  dc2 = dc/((nx-1)*(ny-1));


MPI_Allreduce(&dc2,&gdc2,1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);


} while (gdc2>eps || iter<75);


MPI_Gather(philocal[1],nodes*(nodes/p),MPI_DOUBLE,phi,nodes*(nodes/p),MPI_DOUBLE,0,MPI_COMM_WORLD);


  ofstream outf2;
  outf2.open("WTJFinalResults.dat");


  outf2 << "TITLE=\"stommel problem\"" << endl;


  outf2 << "VARIABLES=\"X\" \"Y\" \"Phi\" " <<endl;


  outf2 << "ZONE T=\"Floor\", I=" << ny <<" J=" <<nx <<" F=POINT " <<endl;


  for ( i=0;i<nx;i++) {
     for ( j=0;j<ny;j++) {
        outf2 << x[i] <<" " << y[j] << " " <<phi[i][j]<<endl;
     }
  }


  outf2.close();




if (myrank == 0) {
    wtime = MPI::Wtime ( ) - wtime;


    cout << "  Wall clock elapsed seconds = " << wtime << endl;      
}


MPI_Finalize();


return 0;


}