Commit a693e7dc authored by Ryan Gutenkunst's avatar Ryan Gutenkunst
Browse files

Complete 4D GPU integration

parent 410bb61b
......@@ -118,7 +118,7 @@ def _inject_mutations_4D(phi, dt, xx, yy, zz, aa, theta0, frozen1, frozen2, froz
phi[0,0,0,1] += dt/aa[1] * theta0/2 * 16/((aa[2] - aa[0]) * xx[1] * yy[1] * zz[1])
return phi
def _inject_mutations_5D(phi, dt, xx, yy, zz, aa, theta0, frozen1, frozen2, frozen3, frozen4):
def _inject_mutations_5D(phi, dt, xx, yy, zz, aa, bb, theta0, frozen1, frozen2, frozen3, frozen4, frozen5):
"""
Inject novel mutations for a timestep.
"""
......@@ -136,6 +136,7 @@ def _inject_mutations_5D(phi, dt, xx, yy, zz, aa, theta0, frozen1, frozen2, froz
# Population 4
if not frozen4:
phi[0,0,0,1,0] += dt/aa[1] * theta0/2 * 32/((aa[2] - aa[0]) * xx[1] * yy[1] * zz[1] * bb[1])
# Population 5
if not frozen5:
phi[0,0,0,0,1] += dt/aa[1] * theta0/2 * 32/((bb[2] - bb[0]) * xx[1] * yy[1] * zz[1] * aa[1])
return phi
......@@ -736,7 +737,7 @@ def five_pops(phi, xx, T, nu1=1, nu2=1, nu3=1, nu4=1, nu5=1,
_compute_dt(dy,nu2,[m21,m23,m24,m25],gamma2,h2),
_compute_dt(dz,nu3,[m31,m32,m34,m35],gamma3,h3),
_compute_dt(da,nu4,[m41,m42,m43,m45],gamma4,h4),
_compute_dt(da,nu4,[m51,m52,m53,m54],gamma5,h5))
_compute_dt(db,nu5,[m51,m52,m53,m54],gamma5,h5))
this_dt = min(dt, T - current_t)
next_t = current_t + this_dt
......
......@@ -39,7 +39,7 @@ def _inject_mutations_4D_valcalc(dt, xx, yy, zz, aa, theta0, frozen1, frozen2, f
# Population 1
# Normalization based on the multi-dimensional trapezoid rule is
# implemented ************** here ***************
val1000, val0100, val0010, val0001 = 0, 0, 0
val1000, val0100, val0010, val0001 = 0, 0, 0, 0
if not frozen1:
val1000 = dt/xx[1] * theta0/2 * 16/((xx[2] - xx[0]) * yy[1] * zz[1] * aa[1])
# Population 2
......@@ -548,20 +548,20 @@ def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f
V_gpu = gpuarray.empty(L, np.float64)
VInt_gpu = gpuarray.empty(L-1, np.float64)
a_gpu = gpuarray.empty((L,L*L), np.float64)
b_gpu = gpuarray.empty((L,L*L), np.float64)
c_gpu = gpuarray.empty((L,L*L), np.float64)
a_gpu = gpuarray.empty((L,M*N*O), np.float64)
b_gpu = gpuarray.empty((L,M*N*O), np.float64)
c_gpu = gpuarray.empty((L,M*N*O), np.float64)
bsize_int = cusparseDgtsvInterleavedBatch_bufferSizeExt(
cusparse_handle, 0, L, a_gpu.gpudata, b_gpu.gpudata,
c_gpu.gpudata, phi_gpu.gpudata, L**2)
c_gpu.gpudata, phi_gpu.gpudata, M*N*O)
pBuffer = pycuda.driver.mem_alloc(bsize_int)
while current_t < T:
dt = min(dadi.Integration._compute_dt(dx, nu1, [m12, m13, m14], gamma1, h1),
dadi.Integration._compute_dt(dy, nu2, [m21, m23, m24], gamma2, h2),
dadi.Integration._compute_dt(dz, nu3, [m31, m32, m34], gamma3, h3),
dadi.Integration._compute_dt(dz, nu4, [m41, m42, m43], gamma4, h4))
dadi.Integration._compute_dt(da, nu4, [m41, m42, m43], gamma4, h4))
this_dt = np.float64(min(dt, T - current_t))
next_t = current_t + this_dt
......@@ -658,7 +658,7 @@ def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f
grid=_grid(N), block=_block())
kernels._Vfunc(xInt_gpu, nu3, np.int32(N-1), VInt_gpu,
grid=_grid(N-1), block=_block())
kernels._Mfunc3D(xInt_gpu, xx_gpu, xx_gpu, m34, m31, m32, gamma3, h3,
kernels._Mfunc4D(xInt_gpu, xx_gpu, xx_gpu, xx_gpu, m34, m31, m32, gamma3, h3,
np.int32(N-1), O, L, M, MInt_gpu,
grid=_grid((N-1)*M*L*O), block=_block())
......@@ -681,15 +681,15 @@ def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f
a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata,
L*M*O, pBuffer)
transpose_gpuarray(phi_gpu, c_gpu.reshape(L*N*O,M))
transpose_gpuarray(phi_gpu, c_gpu.reshape(L*M*O,N))
phi_gpu, c_gpu = c_gpu.reshape(O,L*M*N), phi_gpu.reshape(O,L*M*N)
MInt_gpu = c_gpu
if not frozen4:
kernels._Vfunc(xx_gpu, nu4, N, V_gpu,
kernels._Vfunc(xx_gpu, nu4, O, V_gpu,
grid=_grid(O), block=_block())
kernels._Vfunc(xInt_gpu, nu4, np.int32(O-1), VInt_gpu,
grid=_grid(O-1), block=_block())
kernels._Mfunc3D(xInt_gpu, xx_gpu, xx_gpu, m41, m42, m43, gamma4, h4,
kernels._Mfunc4D(xInt_gpu, xx_gpu, xx_gpu, xx_gpu, m41, m42, m43, gamma4, h4,
np.int32(O-1), L, M, N, MInt_gpu,
grid=_grid((O-1)*M*L*N), block=_block())
......@@ -712,7 +712,7 @@ def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f
a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata,
L*M*N, pBuffer)
transpose_gpuarray(phi_gpu, c_gpu.reshape(M*N*O,L))
transpose_gpuarray(phi_gpu, c_gpu.reshape(L*M*N,O))
phi_gpu, c_gpu = c_gpu.reshape(L,M*N*O), phi_gpu.reshape(L,M*N*O)
current_t += this_dt
......
......@@ -7,9 +7,11 @@ mod = SourceModule(open(sourcefile).read())
_inject_mutations_2D_vals = mod.get_function("inject_mutations_2D")
_inject_mutations_3D_vals = mod.get_function("inject_mutations_3D")
_inject_mutations_4D_vals = mod.get_function("inject_mutations_4D")
_Vfunc = mod.get_function("Vfunc")
_Mfunc2D = mod.get_function("Mfunc2D")
_Mfunc3D = mod.get_function("Mfunc3D")
_Mfunc4D = mod.get_function("Mfunc4D")
_cx0 = mod.get_function("cx0")
_compute_ab_nobc = mod.get_function("compute_ab_nobc")
_compute_bc_nobc = mod.get_function("compute_bc_nobc")
......
......@@ -63,7 +63,7 @@ void implicit_4Dx(double *phi, double *xx, double *yy, double *zz, double *aa,
if((yy[jj]==0) && (zz[kk]==0) && (aa[ll]==0) && (Mfirst <= 0))
b[0] += (0.5/nu1 - Mfirst)*2./dx[0];
if((yy[jj]==1) && (zz[kk]==1) && (aa[ll]==0) && (Mlast >= 0))
if((yy[jj]==1) && (zz[kk]==1) && (aa[ll]==1) && (Mlast >= 0))
b[L-1] += -(-0.5/nu1 - Mlast)*2./dx[L-2];
tridiag_premalloc(a, b, c, r, temp, L);
......@@ -142,7 +142,7 @@ void implicit_4Dy(double *phi, double *xx, double *yy, double *zz, double *aa,
if((xx[ii]==0) && (zz[kk]==0) && (aa[ll]==0) && (Mfirst <= 0))
b[0] += (0.5/nu2 - Mfirst)*2./dy[0];
if((xx[ii]==1) && (zz[kk]==1) && (aa[ll]==0) && (Mlast >= 0))
if((xx[ii]==1) && (zz[kk]==1) && (aa[ll]==1) && (Mlast >= 0))
b[M-1] += -(-0.5/nu2 - Mlast)*2./dy[M-2];
tridiag_premalloc(a, b, c, r, temp, M);
......@@ -221,7 +221,7 @@ void implicit_4Dz(double *phi, double *xx, double *yy, double *zz, double *aa,
if((xx[ii]==0) && (yy[jj]==0) && (aa[ll] == 0) && (Mfirst <= 0))
b[0] += (0.5/nu3 - Mfirst)*2./dz[0];
if((xx[ii]==1) && (yy[jj]==1) && (aa[ll] == 0) && (Mlast >= 0))
if((xx[ii]==1) && (yy[jj]==1) && (aa[ll] == 1) && (Mlast >= 0))
b[N-1] += -(-0.5/nu3 - Mlast)*2./dz[N-2];
tridiag_premalloc(a, b, c, r, temp, N);
......@@ -299,7 +299,7 @@ void implicit_4Da(double *phi, double *xx, double *yy, double *zz, double *aa,
if((xx[ii]==0) && (yy[jj]==0) && (zz[kk] == 0) && (Mfirst <= 0))
b[0] += (0.5/nu4 - Mfirst)*2./da[0];
if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 0) && (Mlast >= 0))
if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 1) && (Mlast >= 0))
b[O-1] += -(-0.5/nu4 - Mlast)*2./da[O-2];
tridiag_premalloc(a, b, c, r, &phi[ii*M*N*O + jj*N*O + kk*O], O);
......
......@@ -65,7 +65,7 @@ void implicit_5Dx(double *phi, double *xx, double *yy, double *zz, double *aa, d
if((yy[jj]==0) && (zz[kk]==0) && (aa[ll]==0) && (bb[mm]==0) && (Mfirst <= 0))
b[0] += (0.5/nu1 - Mfirst)*2./dx[0];
if((yy[jj]==1) && (zz[kk]==1) && (aa[ll]==0) && (bb[mm]==0) && (Mlast >= 0))
if((yy[jj]==1) && (zz[kk]==1) && (aa[ll]==1) && (bb[mm]==1) && (Mlast >= 0))
b[L-1] += -(-0.5/nu1 - Mlast)*2./dx[L-2];
tridiag_premalloc(a, b, c, r, temp, L);
......@@ -147,7 +147,7 @@ void implicit_5Dy(double *phi, double *xx, double *yy, double *zz, double *aa, d
if((xx[ii]==0) && (zz[kk]==0) && (aa[ll]==0) && (bb[mm]==0) && (Mfirst <= 0))
b[0] += (0.5/nu2 - Mfirst)*2./dy[0];
if((xx[ii]==1) && (zz[kk]==1) && (aa[ll]==0) && (bb[mm]==0) && (Mlast >= 0))
if((xx[ii]==1) && (zz[kk]==1) && (aa[ll]==1) && (bb[mm]==1) && (Mlast >= 0))
b[M-1] += -(-0.5/nu2 - Mlast)*2./dy[M-2];
tridiag_premalloc(a, b, c, r, temp, M);
......@@ -229,7 +229,7 @@ void implicit_5Dz(double *phi, double *xx, double *yy, double *zz, double *aa, d
if((xx[ii]==0) && (yy[jj]==0) && (aa[ll] == 0) && (bb[mm]==0) && (Mfirst <= 0))
b[0] += (0.5/nu3 - Mfirst)*2./dz[0];
if((xx[ii]==1) && (yy[jj]==1) && (aa[ll] == 0) && (bb[mm]==0) && (Mlast >= 0))
if((xx[ii]==1) && (yy[jj]==1) && (aa[ll] == 1) && (bb[mm]==1) && (Mlast >= 0))
b[N-1] += -(-0.5/nu3 - Mlast)*2./dz[N-2];
tridiag_premalloc(a, b, c, r, temp, N);
......@@ -311,7 +311,7 @@ void implicit_5Da(double *phi, double *xx, double *yy, double *zz, double *aa, d
if((xx[ii]==0) && (yy[jj]==0) && (zz[kk] == 0) && (bb[mm] == 0) && (Mfirst <= 0))
b[0] += (0.5/nu4 - Mfirst)*2./da[0];
if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 0) && (bb[mm] == 0) && (Mlast >= 0))
if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 1) && (bb[mm] == 1) && (Mlast >= 0))
b[O-1] += -(-0.5/nu4 - Mlast)*2./da[O-2];
tridiag_premalloc(a, b, c, r, temp, O);
......@@ -392,7 +392,7 @@ void implicit_5Db(double *phi, double *xx, double *yy, double *zz, double *aa, d
if((xx[ii]==0) && (yy[jj]==0) && (zz[kk] == 0) && (aa[ll] == 0) && (Mfirst <= 0))
b[0] += (0.5/nu5 - Mfirst)*2./db[0];
if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 0) && (aa[ll] == 0) && (Mlast >= 0))
if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 1) && (aa[ll] == 1) && (Mlast >= 0))
b[P-1] += -(-0.5/nu5 - Mlast)*2./db[O-2];
tridiag_premalloc(a, b, c, r, &phi[ii*M*N*O*P + jj*N*O*P + kk*O*P + ll], P);
......
......@@ -156,7 +156,7 @@ class CUDATestCase(unittest.TestCase):
def test_4d_integration(self):
pts = 10
nu1 = lambda t: 0.5 + 5*t
nu2 = lambda t: 10-50*t
nu2 = lambda t: 10-20*t
nu3, nu4 = 0.3, 0.9
m12, m13, m14 = 2.0, 0.1, 3.2
m21, m23, m24 = lambda t: 0.5+3*t, 0.2, 1.2
......@@ -168,6 +168,8 @@ class CUDATestCase(unittest.TestCase):
h2 = lambda t: 0.9-t
h3, h4 = 0.3, 0.5
theta0 = lambda t: 1 + 2*t
f1,f2,f3,f4 = False, False, False, False
T = 0.1
xx = dadi.Numerics.default_grid(pts)
phi = dadi.PhiManip.phi_1D(xx)
......@@ -175,20 +177,42 @@ class CUDATestCase(unittest.TestCase):
phi = dadi.PhiManip.phi_2D_to_3D(phi, 0, xx,xx,xx)
phi = dadi.PhiManip.phi_3D_to_4D(phi, 0, 0, xx,xx,xx,xx)
dadi.cuda_enabled(True)
phi_gpu = dadi.Integration.four_pops(phi.copy(), xx, T=T, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4,
m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24,
m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43,
gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4,
h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0,
frozen1=f1, frozen2=f2, frozen3=f3, frozen4=f4)
dadi.cuda_enabled(False)
phi_cpu = dadi.Integration.four_pops(phi.copy(), xx, T=0.1, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4,
phi_cpu = dadi.Integration.four_pops(phi.copy(), xx, T=T, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4,
m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24,
m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43,
gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4,
h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0)
h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0,
frozen1=f1, frozen2=f2, frozen3=f3, frozen4=f4)
self.assertTrue(np.allclose(phi_cpu, phi_gpu))
m12, m13, m14, m21, m23, m24, m31, m32, m34, m41, m42, m43 = [0]*12
for f1 in [True, False]:
for f2 in [True, False]:
for f3 in [True, False]:
for f4 in [True, False]:
dadi.cuda_enabled(True)
phi_gpu = dadi.Integration.four_pops(phi.copy(), xx, T=0.1, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4,
phi_gpu = dadi.Integration.four_pops(phi.copy(), xx, T=T, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4,
m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24,
m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43,
gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4,
h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0)
self.assertTrue(np.allclose(phi_cpu, phi_gpu))
h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0,
frozen1=f1, frozen2=f2, frozen3=f3, frozen4=f4)
dadi.cuda_enabled(False)
phi_cpu = dadi.Integration.four_pops(phi.copy(), xx, T=T, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4,
m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24,
m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43,
gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4,
h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0,
frozen1=f1, frozen2=f2, frozen3=f3, frozen4=f4)
if dadi.cuda_enabled(True):
suite = unittest.TestLoader().loadTestsFromTestCase(CUDATestCase)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment