Commit 053c3ad7 authored by khalid's avatar khalid
Browse files

Add W&C pairwise Fst calculation

parent 9be48b96
<step_name>__Fst_gds_gds_name = os.path.basename(<step_name>__Fst_gds_inputs()["gds"]).split(".")[0]
rule <step_name>__Fst_gds:
input:
**<step_name>__Fst_gds_inputs(),
output:
fst = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"] + "/" + <step_name>__Fst_gds_gds_name + "_fst.txt",
meanfst = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"] + "/" + <step_name>__Fst_gds_gds_name + "_meanfst.txt",
NJ_png = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"] + "/Fst_NJ_plot_mqc.png",
Fst_heatmap_png = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"] + "/Fst_heatmap_mqc.png",
params:
output_dir = config["results_dir"] + "/" + config["<step_name>__Fst_gds_output_dir"]+ "/",
command = config["<step_name>__Fst_gds_command"],
threads:
config["<step_name>__Fst_gds_threads"]
log:
config["results_dir"] + "/logs/" + config["<step_name>__Fst_gds_output_dir"] + "/Fst_gds_log.txt"
script:
config["<step_name>__Fst_gds_script"]
library(SNPRelate);
library(SeqArray);
library(ape)
library(ggtree)
popmap_file = snakemake@input[["popmap_file"]]
gdsFile = snakemake@input[["gds"]]
NJ_png = snakemake@output[["NJ_png"]]
Fst_heatmap_png = snakemake@output[["Fst_heatmap_png"]]
fstFile = snakemake@output[["fst"]]
meanfstFile = snakemake@output[["meanfst"]]
threads = snakemake@threads
gds = seqOpen(gdsFile);
nbpop = 1
if (popmap_file != '')
{
pop_map= read.table(popmap_file, header=F, sep='\t',stringsAsFactors = FALSE) ;
colnames(pop_map)[c(1,2)]=c('sample.id', 'popcode') ;
# Get sample id
sample.id <- seqGetData(gds, "sample.id")
pops = pop_map$popcode[match(sample.id, pop_map$sample.id)]
tab <- data.frame(sample.id, popcode = pops, stringsAsFactors = FALSE);
Allpops = unique(tab$popcode)
nbpops = length(Allpops)
if (nbpops > 1)
{
fstMatrix = matrix(0, nbpops, nbpops)
meanfstMatrix = matrix(0, nbpops, nbpops)
for (i in seq(1,nbpops-1) )
{
# Two populations:
for (j in seq(i+1, nbpops) )
{
selected <- tab[tab$popcode %in% c(Allpops[i], Allpops[j]),]
v <- snpgdsFst(gds, sample.id=selected$sample.id, population=as.factor(selected$popcode), method="W&C84", autosome.only =F)
fstMatrix[i,j] = v$Fst
fstMatrix[j,i] = v$Fst
meanfstMatrix[i,j] = v$MeanFst
meanfstMatrix[j,i] = v$MeanFst
}
}
colnames(meanfstMatrix) = Allpops; rownames(meanfstMatrix) = Allpops
colnames(fstMatrix) = Allpops; rownames(fstMatrix) = Allpops
write.table(meanfstMatrix, file = meanfstFile, append = FALSE, quote = F, sep = '\t', row.names = T, col.names = NA);
write.table(fstMatrix, file = fstFile, append = FALSE, quote = F, sep = '\t', row.names = T, col.names = NA);
png(file=Fst_heatmap_png, width=1024, height=1024);
heatmap(fstMatrix) ;
dev.off();
png(file=NJ_png, width=1024, height=1024);
forme = "rectangular"
title=paste0("NJ plot on pairewise Fst.")
distance = fstMatrix
tree = nj(distance)
p <- ggtree(tree,layout=forme) + geom_text(aes(label=label), size=3, color="purple", hjust=-0.3) + theme_tree2("gray86") +ggtitle(title)
plot(p)
dev.off()
}
else {
# need more than 1 pop
}
}
else{
# need popmap
}
seqClose(gds);
{
id: Fst_gds,
name: Principal Component Analysis,
article: "",
website: "",
git: "",
description: "Principal Component Analysis (PCA) on genotypes with SNPRelate R package",
version: "",
documentation: "",
multiqc: "custom",
commands:
[
{
name: Fst_gds,
command: "",
category: "genet_pop",
output_dir: Fst_gds,
inputs: [{ name: gds, type: "gds", file: "",description: "gds file"} ,
{ name: popmap_file, type: "popmap", file: "", description: "Path to tsv file with samples group"}
],
outputs: [
{ name: fst, type: "txt", file: "*_fst.txt", description: "Weir and Cockerham weighted Fst estimate for each pop pair" },
{ name: meanfst, type: "txt", file: "*_meanfst.txt", description: "Weir and Cockerham mean Fst estimate for each pop pair" },
{ name: NJ_png, type: "png", file: "Fst_NJ_plot_mqc.png", description: "NJ tree on pop pairwise Fst" },
{ name: Fst_heatmap_png, type: "png", file: "Fst_heatmap_mqc.png", description: "NJ tree on pop pairwise Fst" },
],
options: [
{
name: "Fst_gds_threads",
prefix: "--threads",
value: 16,
min: 1,
max: 64,
step: 1,
label: "Threads to use",
type: "numeric",
},
],
},
],
script: Fst_gds.script.R,
install:
{
SNPRelate: [
'Rscript -e ''if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager");
BiocManager::install("SNPRelate", update = TRUE, ask = FALSE)'' '
],
SeqArray: [
'Rscript -e ''if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager");
BiocManager::install("SeqArray", update = TRUE, ask = FALSE)'' '
],
ape: [
'Rscript -e ''install.packages("ape")'' '
],
ggtree: [
'Rscript -e ''if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager");
BiocManager::install("ggtree", update = TRUE, ask = FALSE)'' '
]
},
citations:
{
SNPRelate: [
"Zheng X, Levine D, Shen J, Gogarten S, Laurie C, Weir B (2012). A High-performance Computing Toolset for Relatedness and Principal Component Analysis of SNP Data. Bioinformatics, 28(24), 3326-3328. doi: 10.1093/bioinformatics/bts606. "
],
SeqArray: [
"Zheng X, Gogarten S, Lawrence M, Stilp A, Conomos M, Weir B, Laurie C, Levine D (2017). SeqArray - A storage-efficient high-performance data format for WGS variant calls. Bioinformatics. doi: 10.1093/bioinformatics/btx145."
],
ape: ["Paradis E, Schliep K (2019). ape 5.0: an environment for modern phylogenetics and evolutionary analyses in R. Bioinformatics, 35, 526-528."],
ggtree: ["Yu G (2020). Using ggtree to Visualize Data on Tree-Like Structures. Current Protocols in Bioinformatics, 69(1), e96. doi: 10.1002/cpbi.96, https://currentprotocols.onlinelibrary.wiley.com/doi/abs/10.1002/cpbi.96." ]
},
}
......@@ -45,12 +45,19 @@
SNPRelate: [
'Rscript -e ''if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager");
BiocManager::install("SNPRelate", update = TRUE, ask = FALSE)'' '
]
],
SeqArray: [
'Rscript -e ''if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager");
BiocManager::install("SeqArray", update = TRUE, ask = FALSE)'' '
],
},
citations:
{
SNPRelate: [
"Zheng X, Levine D, Shen J, Gogarten S, Laurie C, Weir B (2012). A High-performance Computing Toolset for Relatedness and Principal Component Analysis of SNP Data. Bioinformatics, 28(24), 3326-3328. doi: 10.1093/bioinformatics/bts606. "
]
],
SeqArray: [
"Zheng X, Gogarten S, Lawrence M, Stilp A, Conomos M, Weir B, Laurie C, Levine D (2017). SeqArray - A storage-efficient high-performance data format for WGS variant calls. Bioinformatics. doi: 10.1093/bioinformatics/btx145."
],
},
}
......@@ -12,14 +12,14 @@ if (ldprune == FALSE)
seqVCF2GDS(vcffile, gdsfile, verbose=FALSE);
} else {
ld.threshold = snakemake@params[["vcf2gds_ld_threshold"]]
maf = snakemake@params[["vcf2gds_min_maf"]]
tmpgds = tempfile(pattern = "tmp", fileext = "gds")
ld.threshold = as.double(snakemake@params[["vcf2gds_ld_threshold"]] )
maf = as.double(snakemake@params[["vcf2gds_min_maf"]] )
tmpgds = tempfile(pattern = "tmp", fileext = "gds")
seqVCF2GDS(vcffile, tmpgds, verbose=FALSE);
gds = seqOpen(tmpgds)
#get a list of SNP not in LD
snpset <- snpgdsLDpruning(gds, ld.threshold=ld.threshold, maf=maf)
snpset <- snpgdsLDpruning(gds, ld.threshold=ld.threshold, maf=maf, autosome.only = FALSE, remove.monosnp = FALSE)
snpset.id <- unlist(snpset, use.names=FALSE)
#apply a filter
......
......@@ -33,7 +33,7 @@
min: 0,
max: 0.5,
step: 0.01,
label: "use the SNPs with >= maf only",
label: "if LD pruning : use the SNPs with >= maf only",
},
{
name: vcf2gds_ld_threshold,
......@@ -43,7 +43,7 @@
min: 0,
max: 1,
step: 0.01,
label: "skip SNPs with ld > to this threshold",
label: "if LD pruning : skip SNPs with ld > to this threshold",
},
{
name: "vcf2gds_threads",
......@@ -64,12 +64,20 @@
SeqArray: [
'Rscript -e ''if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager");
BiocManager::install("SeqArray", update = TRUE, ask = FALSE)'' '
],
SNPRelate: [
'Rscript -e ''if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager");
BiocManager::install("SNPRelate", update = TRUE, ask = FALSE)'' '
]
},
citations:
{
SeqArray: [
"Zheng X, Gogarten S, Lawrence M, Stilp A, Conomos M, Weir B, Laurie C, Levine D (2017). SeqArray - A storage-efficient high-performance data format for WGS variant calls. Bioinformatics. doi: 10.1093/bioinformatics/btx145."
]
],
SNPRelate: [
"Zheng X, Levine D, Shen J, Gogarten S, Laurie C, Weir B (2012). A High-performance Computing Toolset for Relatedness and Principal Component Analysis of SNP Data. Bioinformatics, 28(24), 3326-3328. doi: 10.1093/bioinformatics/bts606. "
]
},
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment