Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
khalid
Dadi
Commits
e5c0317c
Commit
e5c0317c
authored
Jul 15, 2020
by
Ryan Gutenkunst
Browse files
Code for CUDA from_phi_2D_linalg
parent
549a9783
Changes
3
Hide whitespace changes
Inline
Side-by-side
dadi/cuda/FromPhi.py
View file @
e5c0317c
import
numpy
import
numpy
as
np
from
scipy.special
import
betainc
...
...
@@ -30,70 +31,179 @@ def drop_last_col_2D(input):
grid
=
_grid
(
L
*
M
),
block
=
_block
())
return
output
from
dadi.Spectrum_mod
import
cached_dbeta
#def _from_phi_2D_cuda(nx, ny, xx, yy, phi):
# L,M = phi.shape
#
# # Test for xx == yy, since we can make that assumption in the code
#
# # Can I just leave these on the GPU, to save transfer costs?
# dbeta1_xx, dbeta2_xx = cached_dbeta(nx, xx)
# dbeta1_yy, dbeta2_yy = cached_dbeta(ny, xx)
#
# # Can I just leave these on the GPU in between evaluations?
# dbeta1_xx_gpu = gpuarray.to_gpu(dbeta1_xx)
# dbeta2_xx_gpu = gpuarray.to_gpu(dbeta2_xx)
# dbeta1_yy_gpu = gpuarray.to_gpu(dbeta1_yy)
# dbeta2_yy_gpu = gpuarray.to_gpu(dbeta2_yy)
#
# phi_gpu = gpuarray.to_gpu(phi)
#
# xx_gpu = yy_gpu = gpuarray.to_gpu(xx)
# dy_gpu = dx_gpu = misc.subtract(xx_gpu[1:], xx_gpu[:-1])
#
# phi_dy_gpu = diff_iny_2D(phi_gpu)
# s_yy_gpu = misc.divide(phi_dy_gpu, dy_gpu)
#
# c1_yy_gpu = drop_last_col_2D(phi_gpu) - misc.multiply(s_yy_gpu, yy_gpu[:-1])
# c1_yy_gpu /= ny+1
#
# term1_yy_gpu = linalg.dot(dbeta1_yy_gpu, c1_yy_gpu, transb='T')
#
# term2_yy_gpu = linalg.dot(dbeta2_yy_gpu, s_yy_gpu, transb='T')
# _ = gpuarray.to_gpu(np.arange(1, ny+2).reshape(ny+1,1)/((ny+1)*(ny+2)))
# term2_yy_gpu = misc.multiply(term2_yy_gpu, _)
#
# over_y_all_gpu = term1_yy_gpu + term2_yy_gpu
#
# _ = diff_iny_2D(over_y_all_gpu)
# s_xx_all_gpu = misc.divide(_, dx_gpu)
#
# _2 = misc.multiply(s_xx_all_gpu, xx_gpu[:-1])
# c1_xx_all_gpu = (drop_last_col_2D(over_y_all_gpu) - _2)/(nx+1.)
#
# term1_all_gpu = linalg.dot(dbeta1_xx_gpu, c1_xx_all_gpu, transb='T')
# term2_all_gpu = linalg.dot(dbeta2_xx_gpu, s_xx_all_gpu, transb='T')
#
# _ = gpuarray.to_gpu(np.arange(1, nx+2).reshape(nx+1,1)/((nx+1)*(nx+2)))
# _ = misc.multiply(term2_all_gpu, _)
# data = term1_all_gpu + _
#
# return data.get()
# Cache of beta-integral tables and GPU work buffers, reused across calls.
# Typical use is many evaluations with constant nx, ny, xx, yy (e.g. inside
# an optimization), so re-uploading these every call would dominate runtime.
_cache = {'key': None}
def _from_phi_2D_cuda(nx, ny, xx, yy, phi):
    """GPU evaluation of the expected 2D frequency spectrum from a density phi.

    nx, ny: sample sizes along each axis of the output spectrum
    xx, yy: grid coordinates for the two axes of phi (assumed equal length;
            phi is expected square — TODO confirm against callers)
    phi:    2D float64 density array of shape (len(xx), len(yy))

    Returns a plain numpy array of shape (nx+1, ny+1).
    """
    pts, nx, ny = np.int32(phi.shape[0]), np.int32(nx), np.int32(ny)
    # Rebuild cached tables and buffers when the problem size or grids change.
    # On the first call _cache['key'] is None, so the short-circuiting `or`
    # never reads the not-yet-present 'xx'/'yy' entries.
    if (nx, ny) != _cache['key'] or not np.all(_cache['xx'] == xx)\
            or not np.all(_cache['yy'] == yy):
        dbeta1_xx, dbeta2_xx = cached_dbeta(nx, xx)
        # BUGFIX: previously called cached_dbeta(ny, xx), silently assuming
        # xx == yy; results were wrong whenever distinct grids were passed.
        dbeta1_yy, dbeta2_yy = cached_dbeta(ny, yy)
        # Leave these on the GPU in between evaluations.
        dbeta1_xx_gpu = gpuarray.to_gpu(dbeta1_xx)
        dbeta2_xx_gpu = gpuarray.to_gpu(dbeta2_xx)
        dbeta1_yy_gpu = gpuarray.to_gpu(dbeta1_yy)
        dbeta2_yy_gpu = gpuarray.to_gpu(dbeta2_yy)
        xx_gpu = gpuarray.to_gpu(xx)
        # Share one device array when the grids are identical (the common
        # case), but upload yy separately when they differ.
        yy_gpu = xx_gpu if np.array_equal(xx, yy) else gpuarray.to_gpu(yy)
        # Scratch buffers for the interpolation slope/intercept kernels.
        s_yy_gpu = gpuarray.empty((pts, pts-1), dtype=np.float64)
        c1_yy_gpu = gpuarray.empty((pts, pts-1), dtype=np.float64)
        s_xx_all_gpu = gpuarray.empty((ny+1, pts-1), dtype=np.float64)
        c1_xx_all_gpu = gpuarray.empty((ny+1, pts-1), dtype=np.float64)
        _cache['mem'] = (dbeta1_xx_gpu, dbeta2_xx_gpu,
                         dbeta1_yy_gpu, dbeta2_yy_gpu,
                         xx_gpu, yy_gpu, s_yy_gpu, c1_yy_gpu,
                         s_xx_all_gpu, c1_xx_all_gpu)
        _cache['key'] = (nx, ny)
        _cache['xx'] = xx
        _cache['yy'] = yy
    (dbeta1_xx_gpu, dbeta2_xx_gpu, dbeta1_yy_gpu, dbeta2_yy_gpu,
     xx_gpu, yy_gpu, s_yy_gpu, c1_yy_gpu,
     s_xx_all_gpu, c1_xx_all_gpu) = _cache['mem']

    phi_gpu = gpuarray.to_gpu(phi)

    # Piecewise-linear interpolation of phi along y: slope s and scaled
    # intercept c1 (see calc_s/calc_c1 in kernels.cu).
    kernels.calc_s(phi_gpu, yy_gpu, pts, pts, s_yy_gpu,
                   grid=_grid(pts*(pts-1)), block=_block())
    kernels.calc_c1(phi_gpu, yy_gpu, s_yy_gpu, ny, pts, pts, c1_yy_gpu,
                    grid=_grid(pts*(pts-1)), block=_block())

    # Integrate against the beta-increment tables along y.
    term1_yy_gpu = linalg.dot(dbeta1_yy_gpu, c1_yy_gpu, transb='T')
    term2_yy_gpu = linalg.dot(dbeta2_yy_gpu, s_yy_gpu, transb='T')
    # To save a memory allocation, combine into term1_yy_gpu in place;
    # combine_terms applies the (row+1)/((ny+1)*(ny+2)) weighting to term2.
    kernels.combine_terms(term1_yy_gpu, term2_yy_gpu, ny, np.int32(ny+1), pts,
                          term1_yy_gpu,
                          grid=_grid((ny+1)*pts), block=_block())
    over_y_all_gpu = term1_yy_gpu

    # Repeat the interpolate-and-integrate step along x.
    kernels.calc_s(over_y_all_gpu, xx_gpu, np.int32(ny+1), pts, s_xx_all_gpu,
                   grid=_grid((pts-1)*(ny+1)), block=_block())
    kernels.calc_c1(over_y_all_gpu, xx_gpu, s_xx_all_gpu, nx, np.int32(ny+1),
                    pts, c1_xx_all_gpu,
                    grid=_grid((pts-1)*(ny+1)), block=_block())
    term1_all_gpu = linalg.dot(dbeta1_xx_gpu, c1_xx_all_gpu, transb='T')
    term2_all_gpu = linalg.dot(dbeta2_xx_gpu, s_xx_all_gpu, transb='T')
    # To save a memory allocation, modify term1_all_gpu in place to hold data.
    kernels.combine_terms(term1_all_gpu, term2_all_gpu, nx, np.int32(nx+1),
                          np.int32(ny+1), term1_all_gpu,
                          grid=_grid((nx+1)*(ny+1)), block=_block())
    return term1_all_gpu.get()
def _from_phi_3D_cuda(nx, ny, nz, xx, yy, zz, phi, mask_corners=True,
                      raw=False):
    """Expected 3D frequency spectrum from density phi, z axis on CPU and
    the remaining 2D slices on the GPU via _from_phi_2D_cuda.

    nx, ny, nz: sample sizes for each output axis
    xx, yy, zz: grid coordinates for the three axes of phi
    phi:        3D density array
    mask_corners: if True, mask the fixed corners of the returned Spectrum
    raw:        if True, return the bare numpy array instead of a Spectrum

    Returns an (nx+1, ny+1, nz+1) array (raw=True) or dadi.Spectrum.
    """
    data = numpy.zeros((nx+1, ny+1, nz+1))

    # Beta-increment tables along z (shape presumably (nz+1, len(zz)-1) —
    # consistent with the dbeta1_zz[kk] dot products below; confirm against
    # cached_dbeta in dadi.Spectrum_mod).
    dbeta1_zz, dbeta2_zz = cached_dbeta(nz, zz)

    # Quick testing suggests that doing the x direction first for better
    # memory alignment isn't worth much.
    # Slope and scaled intercept of the piecewise-linear interpolant of phi
    # along its last (z) axis.
    s_zz = (phi[:,:,1:] - phi[:,:,:-1])\
            / (zz[nuax,nuax,1:] - zz[nuax,nuax,:-1])
    c1_zz = (phi[:,:,:-1] - s_zz*zz[nuax,nuax,:-1])/(nz+1)

    # These calculations can be done without this for loop, but the
    # four-dimensional intermediate results consume massive amounts of RAM,
    # which makes the for loop faster for large systems.
    for kk in range(0, nz+1):
        # In testing, these two np.dot lines occupy 2/3 the time, so further
        # speedup will be difficult.
        term1 = np.dot(c1_zz, dbeta1_zz[kk])
        term2 = np.dot(s_zz, dbeta2_zz[kk])
        # Weight term2 by (kk+1)/((nz+1)(nz+2)), matching the 2D pipeline.
        term2 *= (kk+1)/((nz+1)*(nz+2))
        over_z = term1 + term2
        # Each z slice reduces to a 2D problem, solved on the GPU.
        sub_fs = _from_phi_2D_cuda(nx, ny, xx, yy, over_z)
        data[:,:,kk] = sub_fs

    if raw:
        return data
    else:
        return dadi.Spectrum(data, mask_corners=mask_corners)
import
pycuda
import
dadi
import
numpy
as
np
from
numpy
import
newaxis
as
nuax
import
time
if __name__ == "__main__":
    # Benchmark harness: compare the CPU linalg implementations against the
    # CUDA ones. Each variant is timed twice — the first pass pays one-time
    # setup (cache fill, kernel compilation), the second shows steady state.
    print('2D test')
    pts = 400
    nx, ny = 20, 20
    phi = np.random.uniform(size=(pts, pts))
    xx = np.linspace(0, 1, pts)

    for _ in range(2):
        tic = time.time()
        cpu = dadi.Spectrum._from_phi_2D_linalg(nx, ny, xx, xx, phi, raw=True)
        print("CPU: {0:.3f}".format(time.time() - tic))
    for _ in range(2):
        tic = time.time()
        gpu = _from_phi_2D_cuda(nx, ny, xx, xx, phi)
        print("GPU: {0:.3f}".format(time.time() - tic))

    print('3D test')
    pts = 400
    nx, ny, nz = 20, 20, 20
    phi = np.random.uniform(size=(pts, pts, pts))
    xx = np.linspace(0, 1, pts)

    for _ in range(2):
        tic = time.time()
        cpu = dadi.Spectrum._from_phi_3D_linalg(nx, ny, nz, xx, xx, xx, phi,
                                                raw=True)
        print("CPU: {0:.3f}".format(time.time() - tic))
    for _ in range(2):
        tic = time.time()
        gpu = _from_phi_3D_cuda(nx, ny, nz, xx, xx, xx, phi)
        print("GPU: {0:.3f}".format(time.time() - tic))
\ No newline at end of file
dadi/cuda/kernels.cu
View file @
e5c0317c
...
...
@@ -144,3 +144,50 @@ __global__ void compute_bc_nobc(double *dx, double *dfactor,
c
[
ii
*
M
+
jj
]
=
-
dfactor
[
ii
]
*
ctemp
;
}
}
/* Forward difference along the second axis of an L-by-M matrix:
 *   out[i][j] = in[i][j+1] - in[i][j]
 * producing an L-by-(M-1) result. One thread per output element; threads
 * past the last row fall through the guard. */
__global__ void diff_iny_2D(double *in, double *out, int L, int M){
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    const int row = tid / (M-1);
    const int col = tid % (M-1);
    if (row >= L)
        return;
    out[row*(M-1) + col] = in[row*M + col + 1] - in[row*M + col];
}
/* Copy an L-by-M matrix into an L-by-(M-1) output, discarding the final
 * column of each row. One thread per output element; threads past the
 * last row fall through the guard. */
__global__ void drop_last_col_2D(double *in, double *out, int L, int M){
    const int tid = blockIdx.x*blockDim.x + threadIdx.x;
    const int row = tid / (M-1);
    const int col = tid % (M-1);
    if (row >= L)
        return;
    out[row*(M-1) + col] = in[row*M + col];
}
/* Slope of the piecewise-linear interpolant of phi along its second axis:
 *   s[i][j] = (phi[i][j+1] - phi[i][j]) / (yy[j+1] - yy[j])
 * phi is L-by-M, yy has length M, s is L-by-(M-1).
 * One thread per element of s; surplus threads are masked by ii < L. */
__global__ void calc_s(double *phi, double *yy, int L, int M, double *s){
    int ii = (blockIdx.x*blockDim.x + threadIdx.x)/(M-1);
    int jj = (blockIdx.x*blockDim.x + threadIdx.x)%(M-1);
    if (ii < L){
        s[ii*(M-1) + jj] = (phi[ii*M + jj+1] - phi[ii*M + jj])
                            /(yy[jj+1] - yy[jj]);
    }
}
/* Scaled intercept of the piecewise-linear interpolant. Python equivalent:
 *   c1_yy = (phi[:,:-1] - s_yy*yy[nuax,:-1])/(ny+1)
 * phi is L-by-M, s_yy is the L-by-(M-1) slope from calc_s, yy has length M;
 * the result is written to s (L-by-(M-1)). One thread per output element,
 * surplus threads masked by ii < L. */
__global__ void calc_c1(double *phi, double *yy, double *s_yy, int ny,
                        int L, int M, double *s){
    int ii = (blockIdx.x*blockDim.x + threadIdx.x)/(M-1);
    int jj = (blockIdx.x*blockDim.x + threadIdx.x)%(M-1);
    if (ii < L){
        s[ii*(M-1) + jj] = (phi[ii*M + jj]
                            - s_yy[ii*(M-1) + jj]*yy[jj])/(ny + 1.);
    }
}
/* Fused weighted sum of the two integration terms:
 *   out[i][j] = term1[i][j] + term2[i][j]*(i+1)/((n+1)*(n+2))
 * i.e. the row weighting arange(1, n+2)/((n+1)*(n+2)) of term2 is folded
 * into the addition, avoiding a separate weight array and temporary.
 * term1, term2, out are all L-by-M; out may alias term1 (the caller uses
 * this to combine in place). One thread per element, masked by ii < L.
 * (Previous comment here was a stale copy-paste from calc_c1.) */
__global__ void combine_terms(double *term1, double *term2, int n,
                              int L, int M, double *out){
    int ii = (blockIdx.x*blockDim.x + threadIdx.x)/M;
    int jj = (blockIdx.x*blockDim.x + threadIdx.x)%M;
    if (ii < L){
        out[ii*M + jj] = term1[ii*M + jj]
            + term2[ii*M + jj]*(ii+1.)/((n+1)*(n+2));
    }
}
\ No newline at end of file
dadi/cuda/kernels.py
View file @
e5c0317c
...
...
@@ -17,4 +17,10 @@ _Mfunc5D = mod.get_function("Mfunc5D")
_cx0
=
mod
.
get_function
(
"cx0"
)
_compute_ab_nobc
=
mod
.
get_function
(
"compute_ab_nobc"
)
_compute_bc_nobc
=
mod
.
get_function
(
"compute_bc_nobc"
)
_include_bc
=
mod
.
get_function
(
"include_bc"
)
\ No newline at end of file
_include_bc = mod.get_function("include_bc")

# Handles to the CUDA kernels used by the from_phi pipeline; definitions
# and call signatures live in kernels.cu.
diff_iny_2D = mod.get_function("diff_iny_2D")
drop_last_col_2D = mod.get_function("drop_last_col_2D")
calc_s = mod.get_function('calc_s')
calc_c1 = mod.get_function('calc_c1')
combine_terms = mod.get_function('combine_terms')
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment