Hello there,

For a while now I've been working on a program that, at its core, uses the method of relaxation to find the transverse electric field in a cross section of some metal cylindrical cavity with arbitrary boundary conditions (shape). That's quite a mouthful.

The algorithm, as you may know, involves calculating the laplacian of a large 2D array and subtracting it, along with a scale factor, from each point. This must be done in two sweeps, since the new value of each point depends on its four adjacent points. A running calculation gives the wrong result. And, of course, you do this millions of times.

My code is terribly slow, but I don't see how I could expect much better then O(n) for this algorithm.So I'm wondering how this algorithm can be split, as to take advantage of multi-core processors.Any ideas? Here's some code:

Please, don't mind the code dump. It's not important.

Code:`//The high-level look at the relaxation`

//g is of type 'grid_t' -- essentially a 2D array

//reg is of type 'region', which specifies a subset of a grid

std::vector<double> lap(grid_t::num_points(reg)); //laplacian field (as a 1D array)

for(unsigned int t = 0; t < time_steps; ++t)

{

g.iterate_region(reg,functions::calc_lap(lap)); //feeds laplacian into lap

g.iterate_region(reg,functions::relax(lap,dt)); //adds dt*lap[] to each point

}

Code:`//The grid object type`

class grid_t

{

public:

grid_t(void) : w(0), h(0) {}

grid_t(int w_in, int h_in, double fill = 0.0);

~grid_t(void);

struct span {

span(unsigned int x_in,unsigned int y_in,int dy_in) : x(x_in), y(y_in), dy(dy_in) {}

unsigned int x, y;

int dy;

};

typedef std::vector<span> region;

double & operator()(int x, int y);

bool iterate_grid(func2d & f);

bool iterate_region(region & reg, func2d & f);

region find_regions(double boundary = 0.0);

bool write_file(const std::string & path);

unsigned int const width() { return w; }

unsigned int const height(){ return h; }

unsigned long int const num_points() { return w*h; } //number of points in grid

static unsigned long int num_points(const region & r); //number of POINTS in r (not members, which are spans).

private:

int w, h;

std::vector<std::vector<double> > g;

};

Code:`//The virtual function object`

//Surprisingly, the virtual calls are optimized out (they don't depend on input)

class func2d

{

public:

func2d() {}

~func2d() {}

virtual double operator()(int x, int y, grid_t& g) = 0;

};

namespace functions

{

class relax : public func2d

{

public:

relax(std::vector<double> & lap_ref, double dt_in) : lap(lap_ref), dt(dt_in), count(0), func2d() {}

~relax() {}

double operator()(int x, int y, grid_t & g)

{

g(x,y) = g(x,y) + dt*lap[count++];

return g(x,y);

}

private:

std::vector<double> & lap;

const double dt;

unsigned long int count;

};

class calc_lap : public func2d

{

public:

calc_lap(std::vector<double> & lap_ref) : lap(lap_ref), count(0), func2d() {}

~calc_lap() {}

double operator()(int x, int y, grid_t & g)

{

lap[count++] = ( g(x-1,y) + g(x+1,y) + g(x,y-1) + g(x,y+1) - 4*g(x,y) );

return lap[count-1];

}

private:

std::vector<double> & lap;

unsigned long int count;

};

//......... More in namespace........

Code:`bool grid_t::iterate_region(region & reg, func2d & f)`

{

for(unsigned int i = 0; i < reg.size(); ++i) //over all spans

{

for(int j = 0; j < reg[i].dy; ++j) //for each span

{

f(reg[i].x, reg[i].y + j, *this);

}

}

return true;

}