Commit a1d681bc authored by mmassaviol's avatar mmassaviol
Browse files

Update RADseq_denovo

parent fad73026
...@@ -3,8 +3,6 @@ import re ...@@ -3,8 +3,6 @@ import re
import snakemake.utils import snakemake.utils
import csv import csv
dir_path = "/workflow"
############# #############
# Wildcards # # Wildcards #
############# #############
...@@ -14,6 +12,7 @@ STEPS = config["steps"] ...@@ -14,6 +12,7 @@ STEPS = config["steps"]
PREPARE_REPORT_OUTPUTS = config["prepare_report_outputs"] PREPARE_REPORT_OUTPUTS = config["prepare_report_outputs"]
PREPARE_REPORT_SCRIPTS = config["prepare_report_scripts"] PREPARE_REPORT_SCRIPTS = config["prepare_report_scripts"]
OUTPUTS = config["outputs"] OUTPUTS = config["outputs"]
PARAMS_INFO = config["params_info"]
config = config["params"] config = config["params"]
def get_individus(): def get_individus():
...@@ -111,8 +110,6 @@ def prepare_report_inputs(): ...@@ -111,8 +110,6 @@ def prepare_report_inputs():
inputs = list() inputs = list()
for step in STEPS: for step in STEPS:
inputs.extend(step_outputs(step["name"])) inputs.extend(step_outputs(step["name"]))
if (step["name"] == config["final_step"]):
break
return inputs return inputs
def prepare_report_scripts(): def prepare_report_scripts():
...@@ -122,8 +119,6 @@ def prepare_report_scripts(): ...@@ -122,8 +119,6 @@ def prepare_report_scripts():
script = tool+".prepare.report.R" script = tool+".prepare.report.R"
if (script in PREPARE_REPORT_SCRIPTS): if (script in PREPARE_REPORT_SCRIPTS):
scripts.append("/workflow/scripts/"+script) scripts.append("/workflow/scripts/"+script)
if (tool == config["final_step"]):
break
return scripts return scripts
def prepare_report_outputs(): def prepare_report_outputs():
...@@ -134,8 +129,6 @@ def prepare_report_outputs(): ...@@ -134,8 +129,6 @@ def prepare_report_outputs():
if (tool in PREPARE_REPORT_OUTPUTS.keys()): if (tool in PREPARE_REPORT_OUTPUTS.keys()):
for output in PREPARE_REPORT_OUTPUTS[tool]: for output in PREPARE_REPORT_OUTPUTS[tool]:
outputs.append(config["results_dir"]+"/"+tool+"/"+output) outputs.append(config["results_dir"]+"/"+tool+"/"+output)
if (tool == config["final_step"]):
break
return outputs return outputs
def multiqc_inputs(): def multiqc_inputs():
...@@ -559,16 +552,19 @@ rule prepare_report: ...@@ -559,16 +552,19 @@ rule prepare_report:
i+=1 i+=1
# Params list for Multiqc report # Params list for Multiqc report
params_list = "params_name\tvalue\n" params_list = "params_name\tdescription\tvalue\n"
head = """ head = """# description: 'This is the list of the parameters for each rule'
# description: 'This is the list of the parameters for each rule'
# section_name: 'Workflow parameters' # section_name: 'Workflow parameters'
""" """
for step in STEPS: for step in STEPS:
tool = config[step["name"]] tool = config[step["name"]]
for key, value in config.items(): for key, value in config.items():
if (tool in key and tool != "null") or (key in ["results_dir","sample_dir","sample_suffix","SeOrPe"]): if (tool in key and tool != "null") or (key in ["results_dir","sample_dir","sample_suffix","SeOrPe"]) and ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
params_list += key + "\t'" + str(value) + "'\n" if (key in PARAMS_INFO.keys() and "label" in PARAMS_INFO[key].keys()):
description = PARAMS_INFO[key]["label"]
else:
description = ''
params_list += key + "\t'" + description + "'\t'" + str(value) + "'\n"
with open(output.params_tab,"w") as out: with open(output.params_tab,"w") as out:
out.write(head) out.write(head)
...@@ -594,7 +590,7 @@ rule multiqc: ...@@ -594,7 +590,7 @@ rule multiqc:
# Final Snakemake rule waiting for outputs of the final step choosen by user (default all steps) # Final Snakemake rule waiting for outputs of the final step choosen by user (default all steps)
rule all: rule all:
input: input:
workflow_outputs(config["final_step"]) workflow_outputs("all")
output: output:
Snakefile = config["results_dir"]+"/workflow/Snakefile", Snakefile = config["results_dir"]+"/workflow/Snakefile",
get_samples = config["results_dir"]+"/workflow/get_samples.py", get_samples = config["results_dir"]+"/workflow/get_samples.py",
......
#!/usr/bin/env python3
# This script will take a directory and a parameter to tell if the reads are paired end or single end and return the sample list and the suffix
# Needs 2 arguments: reads_directory, SeOrPe
# SeOrPe is SE for single end reads and PE for paired end reads
# Usage: ./get_samples.py reads_directory SeOrPe
import os import os
import re import re
import csv import csv
import sys import sys
def sample_list(dir, SeOrPe):
def sample_list(dir, filesuffix=".fastq.gz"):
samples = list()
files = os.listdir(dir)
regex = re.compile("^(.+?)(_R1|_R2|)("+filesuffix+")")
for file in files:
res = re.match(regex, file)
if res:
if res.group(1) not in samples:
samples.append(res.group(1))
return sorted(samples)
def sample_list2(dir, SeOrPe):
samples = list() samples = list()
suffixes = list() suffixes = list()
files = os.listdir(dir) files = os.listdir(dir)
if SeOrPe == "PE": if SeOrPe == "PE":
regex = re.compile("^(.+?)(_R1|_R2)(.+)") regex = re.compile(r"^(.+?)(_R1|_R2)(.+)")
else: else:
regex = re.compile("^(.+?)(\..*)") regex = re.compile(r"^(.+?)(\..*)")
for file in files: for file in files:
res = re.match(regex, file) res = re.match(regex, file)
if res: if res:
...@@ -34,25 +27,17 @@ def sample_list2(dir, SeOrPe): ...@@ -34,25 +27,17 @@ def sample_list2(dir, SeOrPe):
suffixes.append(res.group(2)) suffixes.append(res.group(2))
if (len(set(suffixes)) == 1 ): if (len(set(suffixes)) == 1 ):
return {'samples': samples, 'suffix': set(suffixes)} return {'samples': sorted(samples), 'suffix': list(set(suffixes))[0]}
else: else:
pass exit("Files have different suffixes:" + ','.join(suffixes))
def group_list(dir, groupfile, filesuffix=".fastq.gz"): def main():
samples = sample_list(dir,filesuffix) if len(sys.argv) == 3:
if os.path.isfile(groupfile): print(sample_list(sys.argv[1],sys.argv[2]))
groups = dict()
with open(groupfile, mode="r") as infile:
reader = csv.reader(infile)
groups = {row[0]: row[1] for row in reader}
return {sample: groups[sample] for sample in samples}
else: else:
return {sample: "group" for sample in samples} exit("""Needs 2 arguments: reads_directory, SeOrPe
Usage: ./get_samples.py reads_directory SeOrPe""")
#print(sample_list2(sys.argv[1],sys.argv[2]))
if len(sys.argv) == 4: if __name__ == "__main__":
print(group_list(sys.argv[1], sys.argv[2], sys.argv[3])) # execute only if run as a script
else: main()
print(group_list(sys.argv[1], sys.argv[2]))
...@@ -2,7 +2,6 @@ pipeline: RADseq_denovo ...@@ -2,7 +2,6 @@ pipeline: RADseq_denovo
params: params:
results_dir: /Results results_dir: /Results
sample_dir: /Data sample_dir: /Data
sample_suffix: .fastq.gz
SeOrPe: PE SeOrPe: PE
quality_check: fastqc quality_check: fastqc
fastqc_SE_output_dir: fastqc_SE fastqc_SE_output_dir: fastqc_SE
...@@ -55,7 +54,6 @@ params: ...@@ -55,7 +54,6 @@ params:
populations_p: 2 populations_p: 2
samples: [] samples: []
groups: [] groups: []
final_step: all
steps: steps:
- title: Quality check - title: Quality check
name: quality_check name: quality_check
...@@ -104,14 +102,13 @@ params_info: ...@@ -104,14 +102,13 @@ params_info:
type: output_dir type: output_dir
sample_dir: sample_dir:
type: input_dir type: input_dir
sample_suffix:
type: text
SeOrPe: SeOrPe:
type: radio type: radio
fastqc_threads: fastqc_threads:
tool: fastqc tool: fastqc
rule: fastqc_PE rule: fastqc_PE
type: numeric type: numeric
label: Number of threads to use
process_radtags_barcode_file_select: process_radtags_barcode_file_select:
tool: process_radtags tool: process_radtags
rule: process_radtags_PE rule: process_radtags_PE
...@@ -120,42 +117,53 @@ params_info: ...@@ -120,42 +117,53 @@ params_info:
tool: process_radtags tool: process_radtags
rule: process_radtags_PE rule: process_radtags_PE
type: input_file type: input_file
label: Barcode file
process_radtags_barcode_type: process_radtags_barcode_type:
tool: process_radtags tool: process_radtags
rule: process_radtags_PE rule: process_radtags_PE
type: select type: select
label: Barcode position
process_radtags_enzyme_SE: process_radtags_enzyme_SE:
tool: process_radtags tool: process_radtags
rule: process_radtags_SE rule: process_radtags_SE
type: select type: select
label: Provide the restriction enzyme used
process_radtags_enzyme_1_PE: process_radtags_enzyme_1_PE:
tool: process_radtags tool: process_radtags
rule: process_radtags_PE rule: process_radtags_PE
type: select type: select
label: Provide the restriction enzyme used
process_radtags_enzyme_2_PE: process_radtags_enzyme_2_PE:
tool: process_radtags tool: process_radtags
rule: process_radtags_PE rule: process_radtags_PE
type: select type: select
label: If a double digest was used, provide the second restriction enzyme used
ustacks_threads: ustacks_threads:
tool: ustacks tool: ustacks
rule: ustacks rule: ustacks
type: numeric type: numeric
label: Number of threads to use
ustacks_M: ustacks_M:
tool: ustacks tool: ustacks
rule: ustacks rule: ustacks
type: numeric type: numeric
label: Maximum distance (in nucleotides) allowed between stacks
ustacks_m: ustacks_m:
tool: ustacks tool: ustacks
rule: ustacks rule: ustacks
type: numeric type: numeric
label: Minimum depth of coverage required to create a stack
ustacks_N: ustacks_N:
tool: ustacks tool: ustacks
rule: ustacks rule: ustacks
type: numeric type: numeric
label: 'Maximum distance allowed to align secondary reads to primary stacks (default:
M + 2)'
cstacks_threads: cstacks_threads:
tool: cstacks tool: cstacks
rule: cstacks rule: cstacks
type: numeric type: numeric
label: Number of threads to use
cstacks_population_tsv_select: cstacks_population_tsv_select:
tool: cstacks tool: cstacks
rule: cstacks rule: cstacks
...@@ -164,22 +172,27 @@ params_info: ...@@ -164,22 +172,27 @@ params_info:
tool: cstacks tool: cstacks
rule: cstacks rule: cstacks
type: input_file type: input_file
label: Path to population tsv file
cstacks_n: cstacks_n:
tool: cstacks tool: cstacks
rule: cstacks rule: cstacks
type: numeric type: numeric
label: Number of mismatches allowed between sample loci when build the catalog.
sstacks_threads: sstacks_threads:
tool: sstacks tool: sstacks
rule: sstacks rule: sstacks
type: numeric type: numeric
label: Number of threads to use
tsv2bam_threads: tsv2bam_threads:
tool: tsv2bam tool: tsv2bam
rule: tsv2bam rule: tsv2bam
type: numeric type: numeric
label: Number of threads to use
gstacks_threads: gstacks_threads:
tool: gstacks tool: gstacks
rule: gstacks rule: gstacks
type: numeric type: numeric
label: Number of threads to use
gstacks_population_tsv_select: gstacks_population_tsv_select:
tool: gstacks tool: gstacks
rule: gstacks rule: gstacks
...@@ -188,46 +201,60 @@ params_info: ...@@ -188,46 +201,60 @@ params_info:
tool: gstacks tool: gstacks
rule: gstacks rule: gstacks
type: input_file type: input_file
label: Path to population tsv file
gstacks_model: gstacks_model:
tool: gstacks tool: gstacks
rule: gstacks rule: gstacks
type: select type: select
label: Model to use to call variants and genotypes
gstacks_var_alpha: gstacks_var_alpha:
tool: gstacks tool: gstacks
rule: gstacks rule: gstacks
type: numeric type: numeric
label: Alpha threshold for discovering SNPs
gstacks_gt_alpha: gstacks_gt_alpha:
tool: gstacks tool: gstacks
rule: gstacks rule: gstacks
type: numeric type: numeric
label: Alpha threshold for calling genotypes
gstacks_min_mapq: gstacks_min_mapq:
tool: gstacks tool: gstacks
rule: gstacks rule: gstacks
type: numeric type: numeric
label: Minimum PHRED-scaled mapping quality to consider a read
gstacks_max_clipped: gstacks_max_clipped:
tool: gstacks tool: gstacks
rule: gstacks rule: gstacks
type: numeric type: numeric
label: Maximum soft-clipping level, in fraction of read length
populations_threads: populations_threads:
tool: populations tool: populations
rule: populations rule: populations
type: numeric type: numeric
label: Number of threads to use
populations_r: populations_r:
tool: populations tool: populations
rule: populations rule: populations
type: numeric type: numeric
label: Minimum percentage of individuals in a population required to process a
locus for that population
populations_max_obs_het: populations_max_obs_het:
tool: populations tool: populations
rule: populations rule: populations
type: numeric type: numeric
label: Specify a maximum observed heterozygosity required to process a nucleotide
site at a locus
populations_min_maf: populations_min_maf:
tool: populations tool: populations
rule: populations rule: populations
type: numeric type: numeric
label: Specify a minimum minor allele frequency required to process a nucleotide
site at a locus
populations_p: populations_p:
tool: populations tool: populations
rule: populations rule: populations
type: numeric type: numeric
label: Minimum number of populations a locus must be present in to process a locus
prepare_report_scripts: prepare_report_scripts:
- populations.prepare.report.R - populations.prepare.report.R
prepare_report_outputs: prepare_report_outputs:
......
...@@ -21,20 +21,19 @@ PCA1 <- snpgdsPCA(genofile, snp.id=NULL, maf=NaN, missing.rate=0.2, num.thread= ...@@ -21,20 +21,19 @@ PCA1 <- snpgdsPCA(genofile, snp.id=NULL, maf=NaN, missing.rate=0.2, num.thread=
#PCA eigenvalues #PCA eigenvalues
fic = paste(parameters$results_dir,parameters$populations_output_dir,'PCA_Eigenvalues_mqc.txt',sep = "/") fic = paste(parameters$results_dir,parameters$populations_output_dir,'PCA_Eigenvalues_mqc.txt',sep = "/")
cat(" cat("# id: custom_bargraph_tsv
# id: custom_bargraph_tsv # section_name: 'PCA eigenvalues'
# section_name: 'PCA eigenvalues' # description: 'valeurs propres (variance expliquée par chaque axe).'
# description: 'valeurs propres (variance expliquée par chaque axe).' # format: 'tsv'
# format: 'tsv' # plot_type: 'bargraph'
# plot_type: 'bargraph' # pconfig:
# pconfig: # id: 'custom_bargraph_w_header'
# id: 'custom_bargraph_w_header' # title: PCA eigenvalues
# title: PCA eigenvalues # ylab: 'Percent'\n", file=fic)
# ylab: 'Percent'\n", file=fic)
for (i in 1: 20 ) for (i in 1: 20 )
{ {
cat(i,'\t',PCA1$eigenval[i],'\n', file=fic, append=T) cat(i,'\t',PCA1$eigenval[i],'\n', file=fic, append=T)
} }
sample.id1 <- PCA1$sample.id sample.id1 <- PCA1$sample.id
...@@ -43,9 +42,9 @@ pop_code1= popmap[which(popmap[,1] %in% sample.id1), 2] ...@@ -43,9 +42,9 @@ pop_code1= popmap[which(popmap[,1] %in% sample.id1), 2]
nbpops = length(unique(pop_code1)) nbpops = length(unique(pop_code1))
if ( nbpops <= 11) { if ( nbpops <= 11) {
popc <- brewer.pal(n = nbpops, name = palette) popc <- brewer.pal(n = nbpops, name = palette)
} else { } else {
popc <- colorRampPalette(brewer.pal(11, palette))( nbpops) popc <- colorRampPalette(brewer.pal(11, palette))( nbpops)
} }
#popc <- brewer.pal(n = length(unique(pop_code1)), name = 'Spectral') #popc <- brewer.pal(n = length(unique(pop_code1)), name = 'Spectral')
...@@ -76,45 +75,45 @@ data: \n", file=fic) ...@@ -76,45 +75,45 @@ data: \n", file=fic)
for (i in 1:length(tab1$sample.id)) for (i in 1:length(tab1$sample.id))
{ {
cat(" ", i,":\n",file=fic, append=T) cat(" ", i,":\n",file=fic, append=T)
cat(" x:",tab1$EV2[i],"\n",file=fic, append=T) cat(" x:",tab1$EV2[i],"\n",file=fic, append=T)
cat(" y:",tab1$EV1[i],"\n",file=fic, append=T) cat(" y:",tab1$EV1[i],"\n",file=fic, append=T)
cat(" color: '",pop_color1[tab1$sample.id[i]],"'\n",file=fic, append=T) cat(" color: '",pop_color1[tab1$sample.id[i]],"'\n",file=fic, append=T)
} }
#FST par paire #FST par paire
cat("#plot_type: 'heatmap'\n\t",file = paste(parameters$results_dir,parameters$populations_output_dir,'Mean_Pairwise_Pop_FST_mqc.csv',sep = "/")) tryCatch({
fst_summary = read.csv2(paste(parameters$results_dir,parameters$populations_output_dir,'populations.fst_summary.tsv',sep = "/"), sep = "\t",row.names = 1, stringsAsFactors=F) cat("#plot_type: 'heatmap'\n\t",file = paste(parameters$results_dir,parameters$populations_output_dir,'Mean_Pairwise_Pop_FST_mqc.csv',sep = "/"))
if(nrow(fst_summary) > 0) fst_summary = read.csv2(paste(parameters$results_dir,parameters$populations_output_dir,'populations.fst_summary.tsv',sep = "/"), sep = "\t",row.names = 1, stringsAsFactors=F)
{ if(nrow(fst_summary) > 0){
for (i in 1: nrow(fst_summary)) for (i in 1: nrow(fst_summary))
{ {
fst_summary[i,i] = 0 fst_summary[i,i] = 0
for (j in 1: i) for (j in 1: i) {fst_summary[i,j] = fst_summary[j,i]}
{fst_summary[i,j] = fst_summary[j,i]} }
} }
} write.table(fst_summary, sep='\t',file = paste(parameters$results_dir,parameters$populations_output_dir,'Mean_Pairwise_Pop_FST_mqc.csv',sep = "/"), quote=F, row.names=T, col.names=T ,append=TRUE)
}, error = function(e) {
write.table(fst_summary, sep='\t',file = paste(parameters$results_dir,parameters$populations_output_dir,'Mean_Pairwise_Pop_FST_mqc.csv',sep = "/"), quote=F, row.names=T, col.names=T ,append=TRUE) cat("",file = paste(parameters$results_dir,parameters$populations_output_dir,'Mean_Pairwise_Pop_FST_mqc.csv',sep = "/"))
})
#IBS #IBS
tryCatch({ tryCatch({
ibs <- snpgdsIBS(genofile, sample.id=sample.id1, snp.id=NULL, maf=0.1, missing.rate=0.05, num.thread=2, verbose=FALSE, autosome.only=FALSE) ibs <- snpgdsIBS(genofile, sample.id=sample.id1, snp.id=NULL, maf=0, missing.rate=1, num.thread=2, verbose=TRUE, autosome.only=FALSE)
colnames(ibs$ibs)=ibs$sample.id colnames(ibs$ibs)=ibs$sample.id
rownames(ibs$ibs)=ibs$sample.id rownames(ibs$ibs)=ibs$sample.id
png(filename = paste(parameters$results_dir,parameters$populations_output_dir,"IBS_mqc.png",sep = "/"), height=800, width=800)
png(filename = paste(parameters$results_dir,parameters$populations_output_dir,"IBS_mqc.png",sep = "/"), height=800, width=800) heatmap.2(ibs$ibs, col=terrain.colors(20),RowSideColors = pop_color1[ibs$sample.id], ColSideColors = pop_color1[ibs$sample.id], trace="none", cexRow = 1, cexCol = 1)
heatmap.2(ibs$ibs, col=terrain.colors(20),RowSideColors = pop_color1[ibs$sample.id], ColSideColors = pop_color1[ibs$sample.id], trace="none", cexRow = 1, cexCol = 1) par(lend = 1) # square line ends for the color legend
par(lend = 1) # square line ends for the color legend legend(0.0, 1.0, ncol=2, legend = names(popc), col = popc, lty= 1,lwd = 5, bty="n")
legend(0.0, 1.0, ncol=2, legend = names(popc), col = popc, lty= 1,lwd = 5, bty="n") dev.off()
dev.off()