Skip to content

Commit

Permalink
[Fix] Use OpenCL images for rotating grids on GPU.
Browse files Browse the repository at this point in the history
  • Loading branch information
Gydo committed Sep 6, 2016
1 parent a26d69f commit 290012d
Show file tree
Hide file tree
Showing 3 changed files with 306 additions and 40 deletions.
96 changes: 84 additions & 12 deletions powerfit/kernels.cl
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#define SQUARE(a) ((a) * (a))
#define IMAGE_OFFSET 0.5f

// To be defined on compile time
#define SHAPE_X $shape_x
Expand All @@ -17,7 +18,7 @@ void rotate_grid3d(
)
{
// Rotate grid around the origin. Only grid points within LLENGTH of the
// origin are rotated. Nearest neighbour interpolation.
// origin are rotated.

int zid = get_global_id(0);
int yid = get_global_id(1);
Expand All @@ -28,17 +29,14 @@ void rotate_grid3d(

int z, y, x, x0, y0, z0, x1, y1, z1, offset0, offset1, grid_ind;
float dx, dy, dz, dx1, dy1, dz1, c00, c10, c01, c11, c0, c1, c;
float3 dist2, coor_z, coor_zy, coor_zyx;
int3 out_ind;

float4 dist2, coor_z, coor_zy, coor_zyx;
int4 out_ind;

for (z = zid - LLENGTH; z <= LLENGTH; z += zstride) {
dist2.s2 = SQUARE(z);
if (dist2.s2 > LLENGTH2)
continue;

coor_z.s0 = rotmat.s2 * z;
coor_z.s1 = rotmat.s5 * z;
coor_z.s0 = rotmat.s6 * z;
coor_z.s1 = rotmat.s7 * z;
coor_z.s2 = rotmat.s8 * z;

out_ind.s0 = z * SLICE;
Expand All @@ -50,9 +48,9 @@ void rotate_grid3d(
if (dist2.s1 > LLENGTH2)
continue;

coor_zy.s0 = rotmat.s1 * y + coor_z.s0;
coor_zy.s0 = rotmat.s3 * y + coor_z.s0;
coor_zy.s1 = rotmat.s4 * y + coor_z.s1;
coor_zy.s2 = rotmat.s7 * y + coor_z.s2;
coor_zy.s2 = rotmat.s5 * y + coor_z.s2;

out_ind.s1 = out_ind.s0 + y * SHAPE_X;
if (y < 0)
Expand All @@ -63,8 +61,8 @@ void rotate_grid3d(
if (dist2.s0 > LLENGTH2)
continue;
coor_zyx.s0 = rotmat.s0 * x + coor_zy.s0;
coor_zyx.s1 = rotmat.s3 * x + coor_zy.s1;
coor_zyx.s2 = rotmat.s6 * x + coor_zy.s2;
coor_zyx.s1 = rotmat.s1 * x + coor_zy.s1;
coor_zyx.s2 = rotmat.s2 * x + coor_zy.s2;

out_ind.s2 = out_ind.s1 + x;
if (x < 0)
Expand Down Expand Up @@ -103,6 +101,13 @@ void rotate_grid3d(
if (z0 < 0)
grid_ind += SIZE;

dx = coor_zyx.s0 - x0;
dy = coor_zyx.s1 - y0;
dz = coor_zyx.s2 - z0;
dx1 = 1 - dx;
dy1 = 1 - dx;
dz1 = 1 - dx;

offset1 = 1;
if (x1 == 0)
offset1 -= SHAPE_X;
Expand Down Expand Up @@ -149,3 +154,70 @@ void rotate_grid3d(
}
}
}


kernel
void rotate_image3d(
read_only image3d_t image, sampler_t sampler, float16 rotmat,
global float *out
)
{
// Rotate grid around the origin. Only grid points within LLENGTH of the
// origin are rotated.

int zid = get_global_id(0);
int yid = get_global_id(1);
int xid = get_global_id(2);
int zstride = get_global_size(0);
int ystride = get_global_size(1);
int xstride = get_global_size(2);

int z, y, x;
float4 dist2, coor_z, coor_zy, coor_zyx, fshape;
int4 out_ind;
fshape.s2 = (float) SHAPE_X;
fshape.s1 = (float) SHAPE_Y;
fshape.s0 = (float) SHAPE_Z;

for (z = zid - LLENGTH; z <= LLENGTH; z += zstride) {
dist2.s2 = SQUARE(z);

coor_z.s0 = rotmat.s6 * z + IMAGE_OFFSET;
coor_z.s1 = rotmat.s7 * z + IMAGE_OFFSET;
coor_z.s2 = rotmat.s8 * z + IMAGE_OFFSET;

out_ind.s0 = z * SLICE;
if (z < 0)
out_ind.s0 += SIZE;

for (y = yid - LLENGTH; y <= LLENGTH; y += ystride) {
dist2.s1 = SQUARE(y) + dist2.s2;
if (dist2.s1 > LLENGTH2)
continue;

coor_zy.s0 = rotmat.s3 * y + coor_z.s0;
coor_zy.s1 = rotmat.s4 * y + coor_z.s1;
coor_zy.s2 = rotmat.s5 * y + coor_z.s2;

out_ind.s1 = out_ind.s0 + y * SHAPE_X;
if (y < 0)
out_ind.s1 += SLICE;

for (x = xid - LLENGTH; x <= LLENGTH; x += xstride) {
dist2.s0 = SQUARE(x) + dist2.s1;
if (dist2.s0 > LLENGTH2)
continue;
// Normalize coordinates
coor_zyx.s0 = (rotmat.s0 * x + coor_zy.s0) / fshape.s2;
coor_zyx.s1 = (rotmat.s1 * x + coor_zy.s1) / fshape.s1;
coor_zyx.s2 = (rotmat.s2 * x + coor_zy.s2) / fshape.s0;

out_ind.s2 = out_ind.s1 + x;
if (x < 0)
out_ind.s2 += SHAPE_X;

out[out_ind.s2] = read_imagef(image, sampler, coor_zyx).s0;
}
}
}
}
81 changes: 53 additions & 28 deletions powerfit/powerfitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,6 @@ def mask(self, mask):
self._normalize_template(ind)
# multiply again for core-weighted correlation score
self._template *= self._mask
# calculate the maximum radius

@staticmethod
def _laplace_filter(array):
Expand Down Expand Up @@ -411,7 +410,8 @@ def __init__(self, target, queue, laplace=False):
target = self._laplace_filter(self._target)
# move some arrays to the GPU
self._gtarget = cl_array.to_device(self._queue, target.astype(np.float32))
self._lcc_mask = cl_array.to_device(self._queue, self._lcc_mask.astype(np.int32))
self._lcc_mask = cl_array.to_device(self._queue,
self._lcc_mask.astype(np.int32))
# Do some one-time precalculations
self._rfftn(self._gtarget, self._ft_target)
self._k.multiply(self._gtarget, self._gtarget, self._target2)
Expand Down Expand Up @@ -466,12 +466,10 @@ def mask(self, mask):
BaseCorrelator.mask.fset(self, mask)
self._norm_factor = np.float32(self._norm_factor)
self._rmax = np.int32(self._rmax)
self._gtemplate = cl_array.to_device(
self._queue, self._template.astype(np.float32)
)
self._gmask = cl_array.to_device(
self._queue, self._mask.astype(np.float32)
)
self._gtemplate = cl.image_from_array(self._queue.context,
self._template.astype(np.float32))
self._gmask = cl.image_from_array(self._queue.context,
self._mask.astype(np.float32))

@property
def rotations(self):
Expand All @@ -480,9 +478,36 @@ def rotations(self):
@rotations.setter
def rotations(self, rotations):
BaseCorrelator.rotations.fset(self, rotations)
self._cl_rotations = np.zeros((self._rotations.shape[0], 16), dtype=np.float32)
self._cl_rotations = np.zeros((self._rotations.shape[0], 16),
dtype=np.float32)
self._cl_rotations[:, :9] = self._rotations.reshape(-1, 9)

def _cl_rotate_grids(self, rotmat):
self._k.rotate_image3d(self._queue, self._gtemplate, rotmat,
self._rot_template)
self._k.rotate_image3d(self._queue, self._gmask, rotmat,
self._rot_mask, nearest=True)
self._queue.finish()

def _cl_get_gcc(self):
self._rfftn(self._rot_template, self._ft_template)
self._k.conj_multiply(self._ft_template, self._ft_target, self._ft_gcc)
self._irfftn(self._ft_gcc, self._gcc)
self._queue.finish()

def _cl_get_ave(self):
self._rfftn(self._rot_mask, self._ft_mask)
self._k.conj_multiply(self._ft_mask, self._ft_target, self._ft_ave)
self._irfftn(self._ft_ave, self._ave)
self._queue.finish()

def _cl_get_ave2(self):
self._k.multiply(self._rot_mask, self._rot_mask, self._rot_mask2)
self._rfftn(self._rot_mask2, self._ft_mask2)
self._k.conj_multiply(self._ft_mask2, self._ft_target2, self._ft_ave2)
self._irfftn(self._ft_ave2, self._ave2)
self._queue.finish()

def scan(self):
super(GPUCorrelator, self).scan()

Expand All @@ -493,24 +518,16 @@ def scan(self):

rotmat = self._cl_rotations[n]

self._k.rotate_grid3d(self._queue, self._gtemplate, rotmat, self._rot_template)
self._k.rotate_grid3d(self._queue, self._gmask, rotmat,
self._rot_mask, nearest=True)
self._k.multiply(self._rot_mask, self._rot_mask, self._rot_mask2)
self._cl_rotate_grids(rotmat)

self._rfftn(self._rot_template, self._ft_template)
self._rfftn(self._rot_mask, self._ft_mask)
self._rfftn(self._rot_mask2, self._ft_mask2)
self._cl_get_gcc()
self._cl_get_ave()
self._cl_get_ave2()

self._k.conj_multiply(self._ft_template, self._ft_target, self._ft_gcc)
self._k.conj_multiply(self._ft_mask, self._ft_target, self._ft_ave)
self._k.conj_multiply(self._ft_mask2, self._ft_target2, self._ft_ave2)
self._k.calc_lcc_and_take_best(self._gcc, self._ave,
self._ave2, self._lcc_mask, self._norm_factor,
np.int32(n), self._glcc, self._grot)

self._irfftn(self._ft_gcc, self._gcc)
self._irfftn(self._ft_ave, self._ave)
self._irfftn(self._ft_ave2, self._ave2)
self._k.calc_lcc_and_take_best(self._gcc, self._ave, self._ave2, self._lcc_mask,
self._norm_factor, np.int32(n), self._glcc, self._grot)
self._queue.finish()

self._print_progress(n, self._rotations.shape[0], time0)
Expand Down Expand Up @@ -538,6 +555,10 @@ def _generate_kernels(self):

class CLKernels(object):
def __init__(self, ctx, values):
self.sampler_nearest = cl.Sampler(ctx, True,
cl.addressing_mode.REPEAT, cl.filter_mode.NEAREST)
self.sampler_linear = cl.Sampler(ctx, True,
cl.addressing_mode.REPEAT, cl.filter_mode.LINEAR)
self.multiply = ElementwiseKernel(ctx,
"float *x, float *y, float *z",
"z[i] = x[i] * y[i];"
Expand Down Expand Up @@ -568,12 +589,16 @@ def __init__(self, ctx, values):
self._gws_rotate_grid3d = (96, 64, 1)

def rotate_grid3d(self, queue, grid, rotmat, out, nearest=False):
_nearest = np.int32(0)
if nearest:
_nearest = np.int32(1)
args = (grid.data, rotmat, out.data, _nearest)
args = (grid.data, rotmat, out.data, np.int32(nearest))
self._program.rotate_grid3d(queue, self._gws_rotate_grid3d, None, *args)

def rotate_image3d(self, queue, image, rotmat, out, nearest=False):
if nearest:
args = (image, self.sampler_nearest, rotmat, out.data)
else:
args = (image, self.sampler_linear, rotmat, out.data)
self._program.rotate_image3d(queue, self._gws_rotate_grid3d, None, *args)


class grfftn_builder(object):
_G = GpyFFT()
Expand Down
Loading

0 comments on commit 290012d

Please sign in to comment.