Commit 843c4896 authored by Ryan Gutenkunst

Add 4D integration CUDA code

parent 51d90205
@@ -32,6 +32,27 @@ def _inject_mutations_3D_valcalc(dt, xx, yy, zz, theta0, frozen1, frozen2, froze
val001 = dt/zz[1] * theta0/2 * 8/((zz[2] - zz[0]) * xx[1] * yy[1])
return np.float64(val100), np.float64(val010), np.float64(val001)
def _inject_mutations_4D_valcalc(dt, xx, yy, zz, aa, theta0, frozen1, frozen2, frozen3, frozen4):
"""
Calculate mutations that need to be injected.
"""
# Population 1
# Normalization based on the multi-dimensional trapezoid rule is
# implemented here.
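# The factor of 16 = 2**4 comes from the 4D trapezoid rule: each density is
# injected one grid point off the corner, where the integration weight is
# (xx[2]-xx[0])/2 * yy[1]/2 * zz[1]/2 * aa[1]/2.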
val1000, val0100, val0010, val0001 = 0, 0, 0, 0
if not frozen1:
val1000 = dt/xx[1] * theta0/2 * 16/((xx[2] - xx[0]) * yy[1] * zz[1] * aa[1])
# Population 2
if not frozen2:
val0100 = dt/yy[1] * theta0/2 * 16/((yy[2] - yy[0]) * xx[1] * zz[1] * aa[1])
# Population 3
if not frozen3:
val0010 = dt/zz[1] * theta0/2 * 16/((zz[2] - zz[0]) * xx[1] * yy[1] * aa[1])
# Population 4
if not frozen4:
val0001 = dt/aa[1] * theta0/2 * 16/((aa[2] - aa[0]) * xx[1] * yy[1] * zz[1])
return np.float64(val1000), np.float64(val0100), np.float64(val0010), np.float64(val0001)
import pycuda
import pycuda.gpuarray as gpuarray
from skcuda.cusparse import cusparseDgtsvInterleavedBatch_bufferSizeExt, cusparseDgtsvInterleavedBatch
@@ -493,3 +514,207 @@ def _three_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f,
current_t += this_dt
return phi_gpu.get().reshape(L,M,N)
def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f,
m12_f, m13_f, m14_f, m21_f, m23_f, m24_f, m31_f, m32_f, m34_f,
m41_f, m42_f, m43_f, gamma1_f, gamma2_f, gamma3_f, gamma4_f,
h1_f, h2_f, h3_f, h4_f, theta0_f, frozen1, frozen2, frozen3, frozen4):
if dadi.Integration.use_delj_trick:
raise ValueError("delj trick not currently supported in CUDA execution")
t = current_t = initial_t
nu1, nu2, nu3, nu4 = nu1_f(current_t), nu2_f(current_t), nu3_f(current_t), nu4_f(current_t)
gamma1, gamma2, gamma3, gamma4 = gamma1_f(current_t), gamma2_f(current_t), gamma3_f(current_t), gamma4_f(current_t)
h1, h2, h3, h4 = h1_f(current_t), h2_f(current_t), h3_f(current_t), h4_f(current_t)
m12, m13, m14 = m12_f(current_t), m13_f(current_t), m14_f(current_t)
m21, m23, m24 = m21_f(current_t), m23_f(current_t), m24_f(current_t)
m31, m32, m34 = m31_f(current_t), m32_f(current_t), m34_f(current_t)
m41, m42, m43 = m41_f(current_t), m42_f(current_t), m43_f(current_t)
L = M = N = O = np.int32(len(xx))
phi_gpu = gpuarray.to_gpu(phi.reshape(L,M*N*O))
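# phi is stored 2D as (L, M*N*O): row-major layout puts element (i, j) at
# i*M*N*O + j, which matches the interleaved format that
# cusparseDgtsvInterleavedBatch expects for L-long systems in a batch of M*N*O.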
aa = yy = zz = xx
da = dx = dy = dz = np.diff(xx)
dfactor = dadi.Integration._compute_dfactor(dx)
xInt = (xx[:-1] + xx[1:])*0.5
xx_gpu = gpuarray.to_gpu(xx)
dx_gpu = gpuarray.to_gpu(dx)
dfactor_gpu = gpuarray.to_gpu(dfactor)
xInt_gpu = gpuarray.to_gpu(xInt)
V_gpu = gpuarray.empty(L, np.float64)
VInt_gpu = gpuarray.empty(L-1, np.float64)
a_gpu = gpuarray.empty((L,L*L*L), np.float64)
b_gpu = gpuarray.empty((L,L*L*L), np.float64)
c_gpu = gpuarray.empty((L,L*L*L), np.float64)
bsize_int = cusparseDgtsvInterleavedBatch_bufferSizeExt(
cusparse_handle, 0, L, a_gpu.gpudata, b_gpu.gpudata,
c_gpu.gpudata, phi_gpu.gpudata, L**3)
pBuffer = pycuda.driver.mem_alloc(bsize_int)
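# pBuffer is scratch space for the batched tridiagonal solver; its size comes
# from the bufferSizeExt query above and it is reused on every solve in the
# time loop.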
while current_t < T:
dt = min(dadi.Integration._compute_dt(dx, nu1, [m12, m13, m14], gamma1, h1),
dadi.Integration._compute_dt(dy, nu2, [m21, m23, m24], gamma2, h2),
dadi.Integration._compute_dt(dz, nu3, [m31, m32, m34], gamma3, h3),
dadi.Integration._compute_dt(da, nu4, [m41, m42, m43], gamma4, h4))
this_dt = np.float64(min(dt, T - current_t))
next_t = current_t + this_dt
nu1, nu2, nu3, nu4 = nu1_f(next_t), nu2_f(next_t), nu3_f(next_t), nu4_f(next_t)
gamma1, gamma2, gamma3, gamma4 = gamma1_f(next_t), gamma2_f(next_t), gamma3_f(next_t), gamma4_f(next_t)
h1, h2, h3, h4 = h1_f(next_t), h2_f(next_t), h3_f(next_t), h4_f(next_t)
m12, m13, m14 = m12_f(next_t), m13_f(next_t), m14_f(next_t)
m21, m23, m24 = m21_f(next_t), m23_f(next_t), m24_f(next_t)
m31, m32, m34 = m31_f(next_t), m32_f(next_t), m34_f(next_t)
m41, m42, m43 = m41_f(next_t), m42_f(next_t), m43_f(next_t)
theta0 = theta0_f(next_t)
if np.any(np.less([T,nu1,nu2,nu3,nu4,m12,m13,m14,m21,m23,m24,m31,m32,m34,m41,m42,m43,theta0], 0)):
raise ValueError('A time, population size, migration rate, or '
'theta0 is < 0. Has the model been mis-specified?')
if np.any(np.equal([nu1,nu2,nu3,nu4], 0)):
raise ValueError('A population size is 0. Has the model been '
'mis-specified?')
val1000, val0100, val0010, val0001 = \
_inject_mutations_4D_valcalc(this_dt, xx, yy, zz, aa, theta0,
frozen1, frozen2, frozen3, frozen4)
kernels._inject_mutations_4D_vals(phi_gpu, L,
val0001, val0010, val0100, val1000, block=(1,1,1))
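# The val arguments are passed in reverse order because in the flattened
# array phi[1], phi[L], phi[L*L], and phi[L*L*L] are the grid points one
# step from the corner along the a, z, y, and x axes, respectively.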
# I can use the c_gpu buffer for the MInt_gpu buffer, to save GPU memory.
# Note that I have to reassign this after each transpose operation I do.
MInt_gpu = c_gpu
if not frozen1:
kernels._Vfunc(xx_gpu, nu1, L, V_gpu,
grid=_grid(L), block=_block())
kernels._Vfunc(xInt_gpu, nu1, np.int32(L-1), VInt_gpu,
grid=_grid(L-1), block=_block())
kernels._Mfunc4D(xInt_gpu, xx_gpu, xx_gpu, xx_gpu, m12, m13, m14, gamma1, h1,
np.int32(L-1), M, N, O, MInt_gpu,
grid=_grid((L-1)*M*N*O), block=_block())
b_gpu.fill(1./this_dt)
kernels._compute_ab_nobc(dx_gpu, dfactor_gpu,
MInt_gpu, V_gpu, this_dt, L, M*N*O,
a_gpu, b_gpu,
grid=_grid((L-1)*M*N*O), block=_block())
kernels._compute_bc_nobc(dx_gpu, dfactor_gpu,
MInt_gpu, V_gpu, this_dt, L, M*N*O,
b_gpu, c_gpu,
grid=_grid((L-1)*M*N*O), block=_block())
kernels._include_bc(dx_gpu, nu1, gamma1, h1, L, M*N*O,
b_gpu, block=(1,1,1))
kernels._cx0(c_gpu, L, M*N*O, grid=_grid(M*N*O), block=_block())
phi_gpu /= this_dt
cusparseDgtsvInterleavedBatch(cusparse_handle, 0, L,
a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata,
M*N*O, pBuffer)
transpose_gpuarray(phi_gpu, c_gpu.reshape(M*N*O,L))
phi_gpu, c_gpu = c_gpu.reshape(M,L*N*O), phi_gpu.reshape(M,L*N*O)
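# After the transpose the axis order is (x2, x3, x4, x1), so population 2
# is now the leading axis for the next set of tridiagonal solves.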
MInt_gpu = c_gpu
if not frozen2:
kernels._Vfunc(xx_gpu, nu2, M, V_gpu,
grid=_grid(M), block=_block())
kernels._Vfunc(xInt_gpu, nu2, np.int32(M-1), VInt_gpu,
grid=_grid(M-1), block=_block())
# Note the order of the m arguments here.
kernels._Mfunc4D(xInt_gpu, xx_gpu, xx_gpu, xx_gpu, m23, m24, m21, gamma2, h2,
np.int32(M-1), N, O, L, MInt_gpu,
grid=_grid((M-1)*L*N*O), block=_block())
b_gpu.fill(1./this_dt)
kernels._compute_ab_nobc(dx_gpu, dfactor_gpu,
MInt_gpu, V_gpu, this_dt, M, L*N*O,
a_gpu, b_gpu,
grid=_grid((M-1)*L*N*O), block=_block())
kernels._compute_bc_nobc(dx_gpu, dfactor_gpu,
MInt_gpu, V_gpu, this_dt, M, L*N*O,
b_gpu, c_gpu,
grid=_grid((M-1)*L*N*O), block=_block())
kernels._include_bc(dx_gpu, nu2, gamma2, h2, M, L*N*O,
b_gpu, block=(1,1,1))
kernels._cx0(c_gpu, M, L*N*O, grid=_grid(L*N*O), block=_block())
phi_gpu /= this_dt
cusparseDgtsvInterleavedBatch(cusparse_handle, 0, M,
a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata,
L*N*O, pBuffer)
transpose_gpuarray(phi_gpu, c_gpu.reshape(L*N*O,M))
phi_gpu, c_gpu = c_gpu.reshape(N,L*M*O), phi_gpu.reshape(N,L*M*O)
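# Axis order is now (x3, x4, x1, x2), putting population 3 first.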
MInt_gpu = c_gpu
if not frozen3:
kernels._Vfunc(xx_gpu, nu3, N, V_gpu,
grid=_grid(N), block=_block())
kernels._Vfunc(xInt_gpu, nu3, np.int32(N-1), VInt_gpu,
grid=_grid(N-1), block=_block())
kernels._Mfunc4D(xInt_gpu, xx_gpu, xx_gpu, xx_gpu, m34, m31, m32, gamma3, h3,
np.int32(N-1), O, L, M, MInt_gpu,
grid=_grid((N-1)*M*L*O), block=_block())
b_gpu.fill(1./this_dt)
kernels._compute_ab_nobc(dx_gpu, dfactor_gpu,
MInt_gpu, V_gpu, this_dt, N, L*M*O,
a_gpu, b_gpu,
grid=_grid((N-1)*L*M*O), block=_block())
kernels._compute_bc_nobc(dx_gpu, dfactor_gpu,
MInt_gpu, V_gpu, this_dt, N, L*M*O,
b_gpu, c_gpu,
grid=_grid((N-1)*L*M*O), block=_block())
kernels._include_bc(dx_gpu, nu3, gamma3, h3, N, L*M*O,
b_gpu, block=(1,1,1))
kernels._cx0(c_gpu, N, L*M*O, grid=_grid(L*M*O), block=_block())
phi_gpu /= this_dt
cusparseDgtsvInterleavedBatch(cusparse_handle, 0, N,
a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata,
L*M*O, pBuffer)
transpose_gpuarray(phi_gpu, c_gpu.reshape(L*M*O,N))
phi_gpu, c_gpu = c_gpu.reshape(O,L*M*N), phi_gpu.reshape(O,L*M*N)
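# Axis order is now (x4, x1, x2, x3), putting population 4 first.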
MInt_gpu = c_gpu
if not frozen4:
kernels._Vfunc(xx_gpu, nu4, O, V_gpu,
grid=_grid(O), block=_block())
kernels._Vfunc(xInt_gpu, nu4, np.int32(O-1), VInt_gpu,
grid=_grid(O-1), block=_block())
kernels._Mfunc4D(xInt_gpu, xx_gpu, xx_gpu, xx_gpu, m41, m42, m43, gamma4, h4,
np.int32(O-1), L, M, N, MInt_gpu,
grid=_grid((O-1)*M*L*N), block=_block())
b_gpu.fill(1./this_dt)
kernels._compute_ab_nobc(dx_gpu, dfactor_gpu,
MInt_gpu, V_gpu, this_dt, O, L*M*N,
a_gpu, b_gpu,
grid=_grid((O-1)*L*M*N), block=_block())
kernels._compute_bc_nobc(dx_gpu, dfactor_gpu,
MInt_gpu, V_gpu, this_dt, O, L*M*N,
b_gpu, c_gpu,
grid=_grid((O-1)*L*M*N), block=_block())
kernels._include_bc(dx_gpu, nu4, gamma4, h4, O, L*M*N,
b_gpu, block=(1,1,1))
kernels._cx0(c_gpu, O, L*M*N, grid=_grid(L*M*N), block=_block())
phi_gpu /= this_dt
cusparseDgtsvInterleavedBatch(cusparse_handle, 0, O,
a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata,
L*M*N, pBuffer)
transpose_gpuarray(phi_gpu, c_gpu.reshape(L*M*N,O))
phi_gpu, c_gpu = c_gpu.reshape(L,M*N*O), phi_gpu.reshape(L,M*N*O)
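# The final transpose restores the original (x1, x2, x3, x4) axis order.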
current_t += this_dt
return phi_gpu.get().reshape(L,M,N,O)
@@ -9,6 +9,13 @@ __global__ void inject_mutations_3D(double *phi, int L, double val001, double va
phi[L*L] += val100;
}
__global__ void inject_mutations_4D(double *phi, int L, double val0001, double val0010, double val0100, double val1000){
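// Offsets 1, L, L*L, and L*L*L are the flattened indices of the grid
// points one step from the (0,0,0,0) corner along each axis.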
phi[1] += val0001;
phi[L] += val0010;
phi[L*L] += val0100;
phi[L*L*L] += val1000;
}
__global__ void Vfunc(double *x, double nu, int L, double *output){
int ii = blockIdx.x*blockDim.x + threadIdx.x;
if(ii < L){
@@ -42,6 +49,21 @@ __global__ void Mfunc3D(double *x, double *y, double *z, double mxy, double mxz,
}
}
__device__ double _Mfunc4D(double x, double y, double z, double a, double mxy, double mxz, double mxa,
double gamma, double h){
return mxy * (y-x) + mxz * (z-x) + mxa * (a-x) + gamma * 2*(h + (1.-2*h)*x) * x*(1.-x);
}
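// Mfunc4D recovers the 4D indices (ii, jj, kk, ll) from the flat thread
// index, assuming row-major layout with the last axis varying fastest.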
__global__ void Mfunc4D(double *x, double *y, double *z, double *a, double mxy, double mxz, double mxa, double gamma, double h, int L, int M, int N, int O, double *output){
int ii = (blockIdx.x*blockDim.x + threadIdx.x) / (M*N*O);
int jj = ((blockIdx.x*blockDim.x + threadIdx.x) / (N*O)) % M;
int kk = ((blockIdx.x*blockDim.x + threadIdx.x) / O) % N;
int ll = (blockIdx.x*blockDim.x + threadIdx.x) % O;
if(ii < L){
output[ii*M*N*O + jj*N*O + kk*O + ll] = _Mfunc4D(x[ii], y[jj], z[kk], a[ll], mxy, mxz, mxa, gamma, h);
}
}
// We need an additional simple kernel to zero out the necessary
// values of the c array, because the Interleaved tridiagonal
// solver alters the c array.
@@ -152,6 +152,43 @@ class CUDATestCase(unittest.TestCase):
frozen3)
self.assertTrue(np.allclose(phi_cpu, phi_gpu))
def test_4d_integration(self):
pts = 10
nu1 = lambda t: 0.5 + 5*t
nu2 = lambda t: 10-50*t
nu3, nu4 = 0.3, 0.9
m12, m13, m14 = 2.0, 0.1, 3.2
m21, m23, m24 = (lambda t: 0.5+3*t), 0.2, 1.2
m31, m32, m34 = 0.9, 1.7, 0.9
m41, m42, m43 = 0.3, 0.4, 1.9
gamma1 = lambda t: -2*t
gamma2, gamma3, gamma4 = 3.0, -1, 0.5
h1 = lambda t: 0.2+t
h2 = lambda t: 0.9-t
h3, h4 = 0.3, 0.5
theta0 = lambda t: 1 + 2*t
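# Mixing constants with functions of t exercises both the static and
# time-dependent parameter code paths.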
xx = dadi.Numerics.default_grid(pts)
phi = dadi.PhiManip.phi_1D(xx)
phi = dadi.PhiManip.phi_1D_to_2D(xx, phi)
phi = dadi.PhiManip.phi_2D_to_3D(phi, 0, xx,xx,xx)
phi = dadi.PhiManip.phi_3D_to_4D(phi, 0, 0, xx,xx,xx,xx)
dadi.cuda_enabled(False)
phi_cpu = dadi.Integration.four_pops(phi.copy(), xx, T=0.1, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4,
m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24,
m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43,
gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4,
h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0)
dadi.cuda_enabled(True)
phi_gpu = dadi.Integration.four_pops(phi.copy(), xx, T=0.1, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4,
m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24,
m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43,
gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4,
h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0)
self.assertTrue(np.allclose(phi_cpu, phi_gpu))
if dadi.cuda_enabled(True):
suite = unittest.TestLoader().loadTestsFromTestCase(CUDATestCase)