Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
mmassaviol
WAWmassaviol
Commits
053c3ad7
Commit
053c3ad7
authored
Sep 02, 2021
by
khalid
Browse files
Add W&C pairwise Fst calculation
parent
9be48b96
Changes
6
Hide whitespace changes
Inline
Side-by-side
tools/Fst_gds/Fst_gds.rule.snakemake
0 → 100755
View file @
053c3ad7
<step_name>__Fst_gds_gds_name = os.path.basename(<step_name>__Fst_gds_inputs()["gds"]).split(".")[0]
rule <step_name>__Fst_gds:
input:
**<step_name>__Fst_gds_inputs(),
output:
fst = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"] + "/" + <step_name>__Fst_gds_gds_name + "_fst.txt",
meanfst = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"] + "/" + <step_name>__Fst_gds_gds_name + "_meanfst.txt",
NJ_png = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"] + "/Fst_NJ_plot_mqc.png",
Fst_heatmap_png = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"] + "/Fst_heatmap_mqc.png",
params:
output_dir = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"]+ "/",
command = config["<step_name>__Fst_gds_command"],
threads:
config["<step_name>__Fst_gds_threads"]
log:
config["results_dir"] + "/logs/" + config["<step_name>__Fst_gds_output_dir"] + "/Fst_gds_log.txt"
script:
config["<step_name>__Fst_gds_script"]
tools/Fst_gds/Fst_gds.script.R
0 → 100644
View file @
053c3ad7
library
(
SNPRelate
);
library
(
SeqArray
);
library
(
ape
)
library
(
ggtree
)
popmap_file
=
snakemake
@
input
[[
"popmap_file"
]]
gdsFile
=
snakemake
@
input
[[
"gds"
]]
NJ_png
=
snakemake
@
output
[[
"NJ_png"
]]
Fst_heatmap_png
=
snakemake
@
output
[[
"Fst_heatmap_png"
]]
fstFile
=
snakemake
@
output
[[
"fst"
]]
meanfstFile
=
snakemake
@
output
[[
"meanfst"
]]
threads
=
snakemake
@
threads
gds
=
seqOpen
(
gdsFile
);
nbpop
=
1
if
(
popmap_file
!=
''
)
{
pop_map
=
read.table
(
popmap_file
,
header
=
F
,
sep
=
'\t'
,
stringsAsFactors
=
FALSE
)
;
colnames
(
pop_map
)[
c
(
1
,
2
)]
=
c
(
'sample.id'
,
'popcode'
)
;
# Get sample id
sample.id
<-
seqGetData
(
gds
,
"sample.id"
)
pops
=
pop_map
$
popcode
[
match
(
sample.id
,
pop_map
$
sample.id
)]
tab
<-
data.frame
(
sample.id
,
popcode
=
pops
,
stringsAsFactors
=
FALSE
);
Allpops
=
unique
(
tab
$
popcode
)
nbpops
=
length
(
Allpops
)
if
(
nbpops
>
1
)
{
fstMatrix
=
matrix
(
0
,
nbpops
,
nbpops
)
meanfstMatrix
=
matrix
(
0
,
nbpops
,
nbpops
)
for
(
i
in
seq
(
1
,
nbpops
-1
)
)
{
# Two populations:
for
(
j
in
seq
(
i
+1
,
nbpops
)
)
{
selected
<-
tab
[
tab
$
popcode
%in%
c
(
Allpops
[
i
],
Allpops
[
j
]),]
v
<-
snpgdsFst
(
gds
,
sample.id
=
selected
$
sample.id
,
population
=
as.factor
(
selected
$
popcode
),
method
=
"W&C84"
,
autosome.only
=
F
)
fstMatrix
[
i
,
j
]
=
v
$
Fst
fstMatrix
[
j
,
i
]
=
v
$
Fst
meanfstMatrix
[
i
,
j
]
=
v
$
MeanFst
meanfstMatrix
[
j
,
i
]
=
v
$
MeanFst
}
}
colnames
(
meanfstMatrix
)
=
Allpops
;
rownames
(
meanfstMatrix
)
=
Allpops
colnames
(
fstMatrix
)
=
Allpops
;
rownames
(
fstMatrix
)
=
Allpops
write.table
(
meanfstMatrix
,
file
=
meanfstFile
,
append
=
FALSE
,
quote
=
F
,
sep
=
'\t'
,
row.names
=
T
,
col.names
=
NA
);
write.table
(
fstMatrix
,
file
=
fstFile
,
append
=
FALSE
,
quote
=
F
,
sep
=
'\t'
,
row.names
=
T
,
col.names
=
NA
);
png
(
file
=
Fst_heatmap_png
,
width
=
1024
,
height
=
1024
);
heatmap
(
fstMatrix
)
;
dev.off
();
png
(
file
=
NJ_png
,
width
=
1024
,
height
=
1024
);
forme
=
"rectangular"
title
=
paste0
(
"NJ plot on pairewise Fst."
)
distance
=
fstMatrix
tree
=
nj
(
distance
)
p
<-
ggtree
(
tree
,
layout
=
forme
)
+
geom_text
(
aes
(
label
=
label
),
size
=
3
,
color
=
"purple"
,
hjust
=
-0.3
)
+
theme_tree2
(
"gray86"
)
+
ggtitle
(
title
)
plot
(
p
)
dev.off
()
}
else
{
# need more than 1 pop
}
}
else
{
# need popmap
}
seqClose
(
gds
);
tools/Fst_gds/Fst_gds.yaml
0 → 100755
View file @
053c3ad7
{
id
:
Fst_gds
,
name
:
Principal Component Analysis
,
article
:
"
"
,
website
:
"
"
,
git
:
"
"
,
description
:
"
Principal
Component
Analysis
(PCA)
on
genotypes
with
SNPRelate
R
package"
,
version
:
"
"
,
documentation
:
"
"
,
multiqc
:
"
custom"
,
commands
:
[
{
name
:
Fst_gds
,
command
:
"
"
,
category
:
"
genet_pop"
,
output_dir
:
Fst_gds
,
inputs
:
[{
name
:
gds
,
type
:
"
gds"
,
file
:
"
"
,
description
:
"
gds
file"
}
,
{
name
:
popmap_file
,
type
:
"
popmap"
,
file
:
"
"
,
description
:
"
Path
to
tsv
file
with
samples
group"
}
],
outputs
:
[
{
name
:
fst
,
type
:
"
txt"
,
file
:
"
*_fst.txt"
,
description
:
"
Weir
and
Cockerham
weighted
Fst
estimate
for
each
pop
pair"
},
{
name
:
meanfst
,
type
:
"
txt"
,
file
:
"
*_meanfst.txt"
,
description
:
"
Weir
and
Cockerham
mean
Fst
estimate
for
each
pop
pair"
},
{
name
:
NJ_png
,
type
:
"
png"
,
file
:
"
Fst_NJ_plot_mqc.png"
,
description
:
"
NJ
tree
on
pop
pairwise
Fst"
},
{
name
:
Fst_heatmap_png
,
type
:
"
png"
,
file
:
"
Fst_heatmap_mqc.png"
,
description
:
"
NJ
tree
on
pop
pairwise
Fst"
},
],
options
:
[
{
name
:
"
Fst_gds_threads"
,
prefix
:
"
--threads"
,
value
:
16
,
min
:
1
,
max
:
64
,
step
:
1
,
label
:
"
Threads
to
use"
,
type
:
"
numeric"
,
},
],
},
],
script
:
Fst_gds.script.R
,
install
:
{
SNPRelate
:
[
'
Rscript
-e
'
'
if
(!requireNamespace("BiocManager",
quietly
=
TRUE))
install.packages("BiocManager");
BiocManager::install("SNPRelate",
update
=
TRUE,
ask
=
FALSE)'
'
'
],
SeqArray
:
[
'
Rscript
-e
'
'
if
(!requireNamespace("BiocManager",
quietly
=
TRUE))
install.packages("BiocManager");
BiocManager::install("SeqArray",
update
=
TRUE,
ask
=
FALSE)'
'
'
],
ape
:
[
'
Rscript
-e
'
'
install.packages("ape")'
'
'
],
ggtree
:
[
'
Rscript
-e
'
'
if
(!requireNamespace("BiocManager",
quietly
=
TRUE))
install.packages("BiocManager");
BiocManager::install("ggtree",
update
=
TRUE,
ask
=
FALSE)'
'
'
]
},
citations
:
{
SNPRelate
:
[
"
Zheng
X,
Levine
D,
Shen
J,
Gogarten
S,
Laurie
C,
Weir
B
(2012).
A
High-performance
Computing
Toolset
for
Relatedness
and
Principal
Component
Analysis
of
SNP
Data.
Bioinformatics,
28(24),
3326-3328.
doi:
10.1093/bioinformatics/bts606.
"
],
SeqArray
:
[
"
Zheng
X,
Gogarten
S,
Lawrence
M,
Stilp
A,
Conomos
M,
Weir
B,
Laurie
C,
Levine
D
(2017).
SeqArray
-
A
storage-efficient
high-performance
data
format
for
WGS
variant
calls.
Bioinformatics.
doi:
10.1093/bioinformatics/btx145."
],
ape
:
[
"
Paradis
E,
Schliep
K
(2019).
ape
5.0:
an
environment
for
modern
phylogenetics
and
evolutionary
analyses
in
R.
Bioinformatics,
35,
526-528."
],
ggtree
:
[
"
Yu
G
(2020).
Using
ggtree
to
Visualize
Data
on
Tree-Like
Structures.
Current
Protocols
in
Bioinformatics,
69(1),
e96.
doi:
10.1002/cpbi.96,
https://currentprotocols.onlinelibrary.wiley.com/doi/abs/10.1002/cpbi.96."
]
},
}
tools/pca_gds/pca_gds.yaml
View file @
053c3ad7
...
...
@@ -45,12 +45,19 @@
SNPRelate
:
[
'
Rscript
-e
'
'
if
(!requireNamespace("BiocManager",
quietly
=
TRUE))
install.packages("BiocManager");
BiocManager::install("SNPRelate",
update
=
TRUE,
ask
=
FALSE)'
'
'
]
],
SeqArray
:
[
'
Rscript
-e
'
'
if
(!requireNamespace("BiocManager",
quietly
=
TRUE))
install.packages("BiocManager");
BiocManager::install("SeqArray",
update
=
TRUE,
ask
=
FALSE)'
'
'
],
},
citations
:
{
SNPRelate
:
[
"
Zheng
X,
Levine
D,
Shen
J,
Gogarten
S,
Laurie
C,
Weir
B
(2012).
A
High-performance
Computing
Toolset
for
Relatedness
and
Principal
Component
Analysis
of
SNP
Data.
Bioinformatics,
28(24),
3326-3328.
doi:
10.1093/bioinformatics/bts606.
"
]
],
SeqArray
:
[
"
Zheng
X,
Gogarten
S,
Lawrence
M,
Stilp
A,
Conomos
M,
Weir
B,
Laurie
C,
Levine
D
(2017).
SeqArray
-
A
storage-efficient
high-performance
data
format
for
WGS
variant
calls.
Bioinformatics.
doi:
10.1093/bioinformatics/btx145."
],
},
}
tools/vcf2gds/vcf2gds.script.R
View file @
053c3ad7
...
...
@@ -12,14 +12,14 @@ if (ldprune == FALSE)
seqVCF2GDS
(
vcffile
,
gdsfile
,
verbose
=
FALSE
);
}
else
{
ld.threshold
=
snakemake
@
params
[[
"vcf2gds_ld_threshold"
]]
maf
=
snakemake
@
params
[[
"vcf2gds_min_maf"
]]
tmpgds
=
tempfile
(
pattern
=
"tmp"
,
fileext
=
"gds"
)
ld.threshold
=
as.double
(
snakemake
@
params
[[
"vcf2gds_ld_threshold"
]]
)
maf
=
as.double
(
snakemake
@
params
[[
"vcf2gds_min_maf"
]]
)
tmpgds
=
tempfile
(
pattern
=
"tmp"
,
fileext
=
"gds"
)
seqVCF2GDS
(
vcffile
,
tmpgds
,
verbose
=
FALSE
);
gds
=
seqOpen
(
tmpgds
)
#get a list of SNP not in LD
snpset
<-
snpgdsLDpruning
(
gds
,
ld.threshold
=
ld.threshold
,
maf
=
maf
)
snpset
<-
snpgdsLDpruning
(
gds
,
ld.threshold
=
ld.threshold
,
maf
=
maf
,
autosome.only
=
FALSE
,
remove.monosnp
=
FALSE
)
snpset.id
<-
unlist
(
snpset
,
use.names
=
FALSE
)
#apply a filter
...
...
tools/vcf2gds/vcf2gds.yaml
View file @
053c3ad7
...
...
@@ -33,7 +33,7 @@
min
:
0
,
max
:
0.5
,
step
:
0.01
,
label
:
"
use
the
SNPs
with
>=
maf
only"
,
label
:
"
if
LD
pruning
:
use
the
SNPs
with
>=
maf
only"
,
},
{
name
:
vcf2gds_ld_threshold
,
...
...
@@ -43,7 +43,7 @@
min
:
0
,
max
:
1
,
step
:
0.01
,
label
:
"
skip
SNPs
with
ld
>
to
this
threshold"
,
label
:
"
if
LD
pruning
:
skip
SNPs
with
ld
>
to
this
threshold"
,
},
{
name
:
"
vcf2gds_threads"
,
...
...
@@ -64,12 +64,20 @@
SeqArray
:
[
'
Rscript
-e
'
'
if
(!requireNamespace("BiocManager",
quietly
=
TRUE))
install.packages("BiocManager");
BiocManager::install("SeqArray",
update
=
TRUE,
ask
=
FALSE)'
'
'
],
SNPRelate
:
[
'
Rscript
-e
'
'
if
(!requireNamespace("BiocManager",
quietly
=
TRUE))
install.packages("BiocManager");
BiocManager::install("SNPRelate",
update
=
TRUE,
ask
=
FALSE)'
'
'
]
},
citations
:
{
SeqArray
:
[
"
Zheng
X,
Gogarten
S,
Lawrence
M,
Stilp
A,
Conomos
M,
Weir
B,
Laurie
C,
Levine
D
(2017).
SeqArray
-
A
storage-efficient
high-performance
data
format
for
WGS
variant
calls.
Bioinformatics.
doi:
10.1093/bioinformatics/btx145."
]
],
SNPRelate
:
[
"
Zheng
X,
Levine
D,
Shen
J,
Gogarten
S,
Laurie
C,
Weir
B
(2012).
A
High-performance
Computing
Toolset
for
Relatedness
and
Principal
Component
Analysis
of
SNP
Data.
Bioinformatics,
28(24),
3326-3328.
doi:
10.1093/bioinformatics/bts606.
"
]
},
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment