Commit a693e7dc by Ryan Gutenkunst

### Complete 4D GPU integration

parent 410bb61b
 ... ... @@ -118,7 +118,7 @@ def _inject_mutations_4D(phi, dt, xx, yy, zz, aa, theta0, frozen1, frozen2, froz phi[0,0,0,1] += dt/aa[1] * theta0/2 * 16/((aa[2] - aa[0]) * xx[1] * yy[1] * zz[1]) return phi def _inject_mutations_5D(phi, dt, xx, yy, zz, aa, theta0, frozen1, frozen2, frozen3, frozen4): def _inject_mutations_5D(phi, dt, xx, yy, zz, aa, bb, theta0, frozen1, frozen2, frozen3, frozen4, frozen5): """ Inject novel mutations for a timestep. """ ... ... @@ -136,6 +136,7 @@ def _inject_mutations_5D(phi, dt, xx, yy, zz, aa, theta0, frozen1, frozen2, froz # Population 4 if not frozen4: phi[0,0,0,1,0] += dt/aa[1] * theta0/2 * 32/((aa[2] - aa[0]) * xx[1] * yy[1] * zz[1] * bb[1]) # Population 5 if not frozen5: phi[0,0,0,0,1] += dt/aa[1] * theta0/2 * 32/((bb[2] - bb[0]) * xx[1] * yy[1] * zz[1] * aa[1]) return phi ... ... @@ -736,7 +737,7 @@ def five_pops(phi, xx, T, nu1=1, nu2=1, nu3=1, nu4=1, nu5=1, _compute_dt(dy,nu2,[m21,m23,m24,m25],gamma2,h2), _compute_dt(dz,nu3,[m31,m32,m34,m35],gamma3,h3), _compute_dt(da,nu4,[m41,m42,m43,m45],gamma4,h4), _compute_dt(da,nu4,[m51,m52,m53,m54],gamma5,h5)) _compute_dt(db,nu5,[m51,m52,m53,m54],gamma5,h5)) this_dt = min(dt, T - current_t) next_t = current_t + this_dt ... ...
 ... ... @@ -39,7 +39,7 @@ def _inject_mutations_4D_valcalc(dt, xx, yy, zz, aa, theta0, frozen1, frozen2, f # Population 1 # Normalization based on the multi-dimensional trapezoid rule is # implemented ************** here *************** val1000, val0100, val0010, val0001 = 0, 0, 0 val1000, val0100, val0010, val0001 = 0, 0, 0, 0 if not frozen1: val1000 = dt/xx[1] * theta0/2 * 16/((xx[2] - xx[0]) * yy[1] * zz[1] * aa[1]) # Population 2 ... ... @@ -548,20 +548,20 @@ def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f V_gpu = gpuarray.empty(L, np.float64) VInt_gpu = gpuarray.empty(L-1, np.float64) a_gpu = gpuarray.empty((L,L*L), np.float64) b_gpu = gpuarray.empty((L,L*L), np.float64) c_gpu = gpuarray.empty((L,L*L), np.float64) a_gpu = gpuarray.empty((L,M*N*O), np.float64) b_gpu = gpuarray.empty((L,M*N*O), np.float64) c_gpu = gpuarray.empty((L,M*N*O), np.float64) bsize_int = cusparseDgtsvInterleavedBatch_bufferSizeExt( cusparse_handle, 0, L, a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata, L**2) c_gpu.gpudata, phi_gpu.gpudata, M*N*O) pBuffer = pycuda.driver.mem_alloc(bsize_int) while current_t < T: dt = min(dadi.Integration._compute_dt(dx, nu1, [m12, m13, m14], gamma1, h1), dadi.Integration._compute_dt(dy, nu2, [m21, m23, m24], gamma2, h2), dadi.Integration._compute_dt(dz, nu3, [m31, m32, m34], gamma3, h3), dadi.Integration._compute_dt(dz, nu4, [m41, m42, m43], gamma4, h4)) dadi.Integration._compute_dt(da, nu4, [m41, m42, m43], gamma4, h4)) this_dt = np.float64(min(dt, T - current_t)) next_t = current_t + this_dt ... ... @@ -658,7 +658,7 @@ def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f grid=_grid(N), block=_block()) kernels._Vfunc(xInt_gpu, nu3, np.int32(N-1), VInt_gpu, grid=_grid(N-1), block=_block()) kernels._Mfunc3D(xInt_gpu, xx_gpu, xx_gpu, m34, m31, m32, gamma3, h3, kernels._Mfunc4D(xInt_gpu, xx_gpu, xx_gpu, xx_gpu, m34, m31, m32, gamma3, h3, np.int32(N-1), O, L, M, MInt_gpu, grid=_grid((N-1)*M*L*O), block=_block()) ... ... @@ -681,15 +681,15 @@ def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata, L*M*O, pBuffer) transpose_gpuarray(phi_gpu, c_gpu.reshape(L*N*O,M)) transpose_gpuarray(phi_gpu, c_gpu.reshape(L*M*O,N)) phi_gpu, c_gpu = c_gpu.reshape(O,L*M*N), phi_gpu.reshape(O,L*M*N) MInt_gpu = c_gpu if not frozen4: kernels._Vfunc(xx_gpu, nu4, N, V_gpu, kernels._Vfunc(xx_gpu, nu4, O, V_gpu, grid=_grid(O), block=_block()) kernels._Vfunc(xInt_gpu, nu4, np.int32(O-1), VInt_gpu, grid=_grid(O-1), block=_block()) kernels._Mfunc3D(xInt_gpu, xx_gpu, xx_gpu, m41, m42, m43, gamma4, h4, kernels._Mfunc4D(xInt_gpu, xx_gpu, xx_gpu, xx_gpu, m41, m42, m43, gamma4, h4, np.int32(O-1), L, M, N, MInt_gpu, grid=_grid((O-1)*M*L*N), block=_block()) ... ... @@ -712,7 +712,7 @@ def _four_pops_temporal_params(phi, xx, T, initial_t, nu1_f, nu2_f, nu3_f, nu4_f a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, phi_gpu.gpudata, L*M*N, pBuffer) transpose_gpuarray(phi_gpu, c_gpu.reshape(M*N*O,L)) transpose_gpuarray(phi_gpu, c_gpu.reshape(L*M*N,O)) phi_gpu, c_gpu = c_gpu.reshape(L,M*N*O), phi_gpu.reshape(L,M*N*O) current_t += this_dt ... ...
 ... ... @@ -7,9 +7,11 @@ mod = SourceModule(open(sourcefile).read()) _inject_mutations_2D_vals = mod.get_function("inject_mutations_2D") _inject_mutations_3D_vals = mod.get_function("inject_mutations_3D") _inject_mutations_4D_vals = mod.get_function("inject_mutations_4D") _Vfunc = mod.get_function("Vfunc") _Mfunc2D = mod.get_function("Mfunc2D") _Mfunc3D = mod.get_function("Mfunc3D") _Mfunc4D = mod.get_function("Mfunc4D") _cx0 = mod.get_function("cx0") _compute_ab_nobc = mod.get_function("compute_ab_nobc") _compute_bc_nobc = mod.get_function("compute_bc_nobc") ... ...
 ... ... @@ -63,7 +63,7 @@ void implicit_4Dx(double *phi, double *xx, double *yy, double *zz, double *aa, if((yy[jj]==0) && (zz[kk]==0) && (aa[ll]==0) && (Mfirst <= 0)) b[0] += (0.5/nu1 - Mfirst)*2./dx[0]; if((yy[jj]==1) && (zz[kk]==1) && (aa[ll]==0) && (Mlast >= 0)) if((yy[jj]==1) && (zz[kk]==1) && (aa[ll]==1) && (Mlast >= 0)) b[L-1] += -(-0.5/nu1 - Mlast)*2./dx[L-2]; tridiag_premalloc(a, b, c, r, temp, L); ... ... @@ -142,7 +142,7 @@ void implicit_4Dy(double *phi, double *xx, double *yy, double *zz, double *aa, if((xx[ii]==0) && (zz[kk]==0) && (aa[ll]==0) && (Mfirst <= 0)) b[0] += (0.5/nu2 - Mfirst)*2./dy[0]; if((xx[ii]==1) && (zz[kk]==1) && (aa[ll]==0) && (Mlast >= 0)) if((xx[ii]==1) && (zz[kk]==1) && (aa[ll]==1) && (Mlast >= 0)) b[M-1] += -(-0.5/nu2 - Mlast)*2./dy[M-2]; tridiag_premalloc(a, b, c, r, temp, M); ... ... @@ -221,7 +221,7 @@ void implicit_4Dz(double *phi, double *xx, double *yy, double *zz, double *aa, if((xx[ii]==0) && (yy[jj]==0) && (aa[ll] == 0) && (Mfirst <= 0)) b[0] += (0.5/nu3 - Mfirst)*2./dz[0]; if((xx[ii]==1) && (yy[jj]==1) && (aa[ll] == 0) && (Mlast >= 0)) if((xx[ii]==1) && (yy[jj]==1) && (aa[ll] == 1) && (Mlast >= 0)) b[N-1] += -(-0.5/nu3 - Mlast)*2./dz[N-2]; tridiag_premalloc(a, b, c, r, temp, N); ... ... @@ -299,7 +299,7 @@ void implicit_4Da(double *phi, double *xx, double *yy, double *zz, double *aa, if((xx[ii]==0) && (yy[jj]==0) && (zz[kk] == 0) && (Mfirst <= 0)) b[0] += (0.5/nu4 - Mfirst)*2./da[0]; if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 0) && (Mlast >= 0)) if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 1) && (Mlast >= 0)) b[O-1] += -(-0.5/nu4 - Mlast)*2./da[O-2]; tridiag_premalloc(a, b, c, r, &phi[ii*M*N*O + jj*N*O + kk*O], O); ... ...
 ... ... @@ -65,7 +65,7 @@ void implicit_5Dx(double *phi, double *xx, double *yy, double *zz, double *aa, d if((yy[jj]==0) && (zz[kk]==0) && (aa[ll]==0) && (bb[mm]==0) && (Mfirst <= 0)) b[0] += (0.5/nu1 - Mfirst)*2./dx[0]; if((yy[jj]==1) && (zz[kk]==1) && (aa[ll]==0) && (bb[mm]==0) && (Mlast >= 0)) if((yy[jj]==1) && (zz[kk]==1) && (aa[ll]==1) && (bb[mm]==1) && (Mlast >= 0)) b[L-1] += -(-0.5/nu1 - Mlast)*2./dx[L-2]; tridiag_premalloc(a, b, c, r, temp, L); ... ... @@ -147,7 +147,7 @@ void implicit_5Dy(double *phi, double *xx, double *yy, double *zz, double *aa, d if((xx[ii]==0) && (zz[kk]==0) && (aa[ll]==0) && (bb[mm]==0) && (Mfirst <= 0)) b[0] += (0.5/nu2 - Mfirst)*2./dy[0]; if((xx[ii]==1) && (zz[kk]==1) && (aa[ll]==0) && (bb[mm]==0) && (Mlast >= 0)) if((xx[ii]==1) && (zz[kk]==1) && (aa[ll]==1) && (bb[mm]==1) && (Mlast >= 0)) b[M-1] += -(-0.5/nu2 - Mlast)*2./dy[M-2]; tridiag_premalloc(a, b, c, r, temp, M); ... ... @@ -229,7 +229,7 @@ void implicit_5Dz(double *phi, double *xx, double *yy, double *zz, double *aa, d if((xx[ii]==0) && (yy[jj]==0) && (aa[ll] == 0) && (bb[mm]==0) && (Mfirst <= 0)) b[0] += (0.5/nu3 - Mfirst)*2./dz[0]; if((xx[ii]==1) && (yy[jj]==1) && (aa[ll] == 0) && (bb[mm]==0) && (Mlast >= 0)) if((xx[ii]==1) && (yy[jj]==1) && (aa[ll] == 1) && (bb[mm]==1) && (Mlast >= 0)) b[N-1] += -(-0.5/nu3 - Mlast)*2./dz[N-2]; tridiag_premalloc(a, b, c, r, temp, N); ... ... @@ -311,7 +311,7 @@ void implicit_5Da(double *phi, double *xx, double *yy, double *zz, double *aa, d if((xx[ii]==0) && (yy[jj]==0) && (zz[kk] == 0) && (bb[mm] == 0) && (Mfirst <= 0)) b[0] += (0.5/nu4 - Mfirst)*2./da[0]; if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 0) && (bb[mm] == 0) && (Mlast >= 0)) if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 1) && (bb[mm] == 1) && (Mlast >= 0)) b[O-1] += -(-0.5/nu4 - Mlast)*2./da[O-2]; tridiag_premalloc(a, b, c, r, temp, O); ... ... @@ -392,7 +392,7 @@ void implicit_5Db(double *phi, double *xx, double *yy, double *zz, double *aa, d if((xx[ii]==0) && (yy[jj]==0) && (zz[kk] == 0) && (aa[ll] == 0) && (Mfirst <= 0)) b[0] += (0.5/nu5 - Mfirst)*2./db[0]; if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 0) && (aa[ll] == 0) && (Mlast >= 0)) if((xx[ii]==1) && (yy[jj]==1) && (zz[kk] == 1) && (aa[ll] == 1) && (Mlast >= 0)) b[P-1] += -(-0.5/nu5 - Mlast)*2./db[O-2]; tridiag_premalloc(a, b, c, r, &phi[ii*M*N*O*P + jj*N*O*P + kk*O*P + ll], P); ... ...
 ... ... @@ -156,7 +156,7 @@ class CUDATestCase(unittest.TestCase): def test_4d_integration(self): pts = 10 nu1 = lambda t: 0.5 + 5*t nu2 = lambda t: 10-50*t nu2 = lambda t: 10-20*t nu3, nu4 = 0.3, 0.9 m12, m13, m14 = 2.0, 0.1, 3.2 m21, m23, m24 = lambda t: 0.5+3*t, 0.2, 1.2 ... ... @@ -168,6 +168,8 @@ class CUDATestCase(unittest.TestCase): h2 = lambda t: 0.9-t h3, h4 = 0.3, 0.5 theta0 = lambda t: 1 + 2*t f1,f2,f3,f4 = False, False, False, False T = 0.1 xx = dadi.Numerics.default_grid(pts) phi = dadi.PhiManip.phi_1D(xx) ... ... @@ -175,21 +177,43 @@ class CUDATestCase(unittest.TestCase): phi = dadi.PhiManip.phi_2D_to_3D(phi, 0, xx,xx,xx) phi = dadi.PhiManip.phi_3D_to_4D(phi, 0, 0, xx,xx,xx,xx) dadi.cuda_enabled(False) phi_cpu = dadi.Integration.four_pops(phi.copy(), xx, T=0.1, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4, dadi.cuda_enabled(True) phi_gpu = dadi.Integration.four_pops(phi.copy(), xx, T=T, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4, m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24, m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43, gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4, h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0) dadi.cuda_enabled(True) phi_gpu = dadi.Integration.four_pops(phi.copy(), xx, T=0.1, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4, h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0, frozen1=f1, frozen2=f2, frozen3=f3, frozen4=f4) dadi.cuda_enabled(False) phi_cpu = dadi.Integration.four_pops(phi.copy(), xx, T=T, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4, m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24, m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43, gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4, h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0) h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0, frozen1=f1, frozen2=f2, frozen3=f3, frozen4=f4) self.assertTrue(np.allclose(phi_cpu, phi_gpu)) m12, m13, m14, m21, m23, m24, m31, m32, m34, m41, m42, m43 = [0]*12 for f1 in [True, False]: for f2 in [True, False]: for f3 in [True, False]: for f4 in [True, False]: dadi.cuda_enabled(True) phi_gpu = dadi.Integration.four_pops(phi.copy(), xx, T=T, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4, m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24, m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43, gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4, h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0, frozen1=f1, frozen2=f2, frozen3=f3, frozen4=f4) dadi.cuda_enabled(False) phi_cpu = dadi.Integration.four_pops(phi.copy(), xx, T=T, nu1=nu1, nu2=nu2, nu3=nu3, nu4=nu4, m12=m12, m13=m13, m14=m14, m21=m21, m23=m23, m24=m24, m31=m31, m32=m32, m34=m34, m41=m41, m42=m42, m43=m43, gamma1=gamma1, gamma2=gamma2, gamma3=gamma3, gamma4=gamma4, h1=h1, h2=h2, h3=h3, h4=h4, theta0=theta0, frozen1=f1, frozen2=f2, frozen3=f3, frozen4=f4) if dadi.cuda_enabled(True): suite = unittest.TestLoader().loadTestsFromTestCase(CUDATestCase) else: ... ...
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment