Commit c55fe75c authored by khalid

Add necessary files to handle comparison of a list of VCF files in a directory

parent 38bbfb19
......@@ -124,6 +124,7 @@ def generate(name, path_yaml = "", path_input = DEFAULT_PATH_INPUT, path_output
# Gérer les "from params" (ex: fasta genomes)
if "params" in step:
raw_inputs = list()
for param in step["params"]:
# get input yaml
......@@ -136,8 +137,10 @@ def generate(name, path_yaml = "", path_input = DEFAULT_PATH_INPUT, path_output
if ("list" in INPUT_YAML and INPUT_YAML["list"]):
if param["input_name"] not in inputs_list_from_yaml:
result += "\tinputs[\"" + param["input_name"] + "\"] = list()\n"
if "raw_" in param["origin_command"] :
result += "\tinputs[\"" + param["input_name"] + "\"].append(" + expand_begin + param["origin_command"] + "[\"" + param["origin_name"] + "\"]" + expand_end + ")\n"
if "raw_" in param["origin_command"]:
if not param["origin_command"] in raw_inputs:
result += "\tinputs[\"" + param["input_name"] + "\"].append(" + expand_begin + param["origin_command"] + "[\"" + param["origin_name"] + "\"]" + expand_end + ")\n"
raw_inputs.append(param["origin_command"])
else:
result += "\tinputs[\"" + param["input_name"] + "\"].append(" + expand_begin + "rules." + param["origin_step"] + "__" + param["origin_command"] + ".output." + param["origin_name"] + expand_end + ")\n"
# inputs classiques
......
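For context, the hunk above makes the generator emit the append line for a given "raw_" origin command only once, tracking what has already been handled in raw_inputs. A minimal standalone sketch of that deduplication, with hypothetical param dictionaries (not real workflow data):

params = [
    {"input_name": "vcfs", "origin_command": "raw_vcfsinDir", "origin_name": "vcfsinDir"},
    {"input_name": "vcfs", "origin_command": "raw_vcfsinDir", "origin_name": "vcfsinDir"},
]
result = ""
raw_inputs = list()
for param in params:
    if "raw_" in param["origin_command"]:
        if param["origin_command"] not in raw_inputs:
            # emit the append line only the first time this raw_ command is seen
            result += "\tinputs[\"" + param["input_name"] + "\"].append(...)\n"
            raw_inputs.append(param["origin_command"])
print(result)  # a single append line despite two params sharing the same origin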
......@@ -6,6 +6,7 @@ def raw_vcf(results_dir, sample_dir):
samples = list()
suffixes = list()
dicoSamples = dict() # sample_name: file(s)
files = [ f for f in os.listdir(sample_dir) if not f.startswith('.') ]
regex = re.compile(r"^(.+?)(\.(bcf|vcf).*)(?<!tbi)$")
for file in files:
......
......@@ -6,7 +6,7 @@
name: "sample_dir",
type: "input_dir",
value: "/Data",
label: "Directory containing the vcf files: ",
label: "vcf file: ",
volumes: [Data: "/Data", Results: "/Results"]
},
]
......
import os
import re
import sys
def raw_vcfsinDir(results_dir, sample_dir):
samples = list()
suffixes = list()
dicoSamples = dict() # sample_name: file(s)
vcfsinDir = list()
files = [ f for f in os.listdir(sample_dir) if not f.startswith('.') ]
regex = re.compile(r"^(.+?)(\.(bcf|vcf).*)(?<!tbi)$")
for file in files:
res = re.match(regex, file)
if res:
if res.group(1) not in samples:
samples.append(res.group(1))
suffixes.append(res.group(2))
if res.group(1) not in dicoSamples.keys():
dicoSamples[res.group(1)] = list()
dicoSamples[res.group(1)].append(file)
vcfsinDir.append(sample_dir +"/"+ file)
if (len(set(suffixes)) == 1 ):
suffix = list(set(suffixes))[0]
with open(results_dir+"/samples.tsv","w") as sampleTab:
sampleTab.write("sample\tvcf_file")
for sample in sorted(samples):
sampleTab.write("\n"+sample+"\t"+"\t".join(sorted(dicoSamples[sample])))
out = {'samples': sorted(samples), 'sample_suffix': suffix, 'dico': dicoSamples, 'vcfsinDir': vcfsinDir}
return out
elif len(set(suffixes)) > 1:
exit("Files have different suffixes:" + ','.join(suffixes))
else:
exit("No VCF or BCF file found")
#print(raw_vcfsinDir(sys.argv[1],sys.argv[2]))
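For a quick manual check, a small usage sketch along the lines of the commented call above; the invocation is illustrative only and takes the two directories from sys.argv:

if __name__ == "__main__":
    # e.g. python <this_file>.py /Results /Data
    out = raw_vcfsinDir(sys.argv[1], sys.argv[2])
    print(out["samples"], out["sample_suffix"])
    print(out["vcfsinDir"])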
{
name: raw_vcfsinDir,
function_call: "raw_vcfsinDir(config['results_dir'], config['sample_dir'])",
options: [
{
name: "sample_dir",
type: "input_dir",
value: "/Data",
label: "Directory containing the vcf files: ",
volumes: [Data: "/Data", Results: "/Results"]
},
]
}
......@@ -13,7 +13,7 @@ if config["SeOrPe"] == "PE":
command = config["<step_name>__bwa_mem2_PE_command"],
#indexPrefix = config["<step_name>__bwa_mem2_index_output_dir"]+"/index",
#avoid index.bwt.2bit.64 index.bwt.8bit.32
indexPrefix = lambda w, input: os.path.splitext(os.path.splitext([x for x in input.index if not 'bit' in x][0])[0],
indexPrefix = lambda w, input: os.path.splitext(os.path.splitext([x for x in input.index if not 'bit' in x][0])[0])[0],
quality0_multimapping = """| awk '{{pos = index("XA:Z",$0); if (pos>0) $5=0;print}}'""" if (config["<step_name>__bwa_mem2_quality0_multimapping"] == True) else ""
shell:
"{params.command} "
......@@ -42,7 +42,7 @@ elif config["SeOrPe"] == "SE":
params:
command = config["<step_name>__bwa_mem2_PE_command"],
#indexPrefix = config["<step_name>__bwa_mem2_index_output_dir"]+"/index",
indexPrefix = lambda w, input: os.path.splitext(os.path.splitext([x for x in input.index if not 'bit' in x][0])[0],
indexPrefix = lambda w, input: os.path.splitext(os.path.splitext([x for x in input.index if not 'bit' in x][0])[0])[0],
quality0_multimapping = """| awk '{{pos = index("XA:Z",$0); if (pos>0) $5=0;print}}'""" if (config["<step_name>__bwa_mem2_quality0_multimapping"] == True) else ""
shell:
"{params.command} "
......
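The two bwa_mem2 hunks above add the trailing [0] the indexPrefix lambda was missing: os.path.splitext returns a (root, ext) tuple, so without the final index the param would be a tuple rather than the prefix string. A small sketch of the mechanics, using a hypothetical bwa-mem2 index file list:

import os

# 'bit' files (index.bwt.2bit.64, index.bwt.8bit.32) are filtered out first,
# then two splitext calls strip up to two extensions to recover the prefix.
index_files = ["/results/index/index.bwt.2bit.64", "/results/index/index.ann"]
chosen = [x for x in index_files if not 'bit' in x][0]
prefix = os.path.splitext(os.path.splitext(chosen)[0])[0]
print(prefix)  # /results/index/index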
......@@ -8,7 +8,7 @@ rule <step_name>__compare_vcfs:
params:
command = config["<step_name>__compare_vcfs_command"],
output_dir = config["results_dir"]+"/"+config["<step_name>__compare_vcfs_output_dir"]+"/{sample}",
nb_vcfs = len(<step_name>__compare_vcfs_inputs()["vcfs"]),
nb_vcfs = len(<step_name>__compare_vcfs_inputs()["vcfsinDir"]),
truth_vcf = config["<step_name>__compare_vcfs_truth"]
log:
config["results_dir"]+"/logs/" + config["<step_name>__compare_vcfs_output_dir"]+"/{sample} + "/compare_vcfs_log.txt"
......
rule <step_name>__compare_vcfs_inDir:
input:
**<step_name>__compare_vcfs_inDir_inputs(),
truth_vcf_file = config["<step_name>__compare_vcfs_inDir_truth_file"] if config["<step_name>__compare_vcfs_inDir_truth"] else ""
output:
stats = config["results_dir"]+"/"+config["<step_name>__compare_vcfs_inDir_output_dir"]+"/{sample}"+"/stats.txt",
bench_to_truth = config["results_dir"]+"/"+config["<step_name>__compare_vcfs_inDir_output_dir"]+"/{sample}"+"/VCF_compare_to_truth_mqc.tsv" if config["<step_name>__compare_vcfs_inDir_truth"] else ""
params:
command = config["<step_name>__compare_vcfs_inDir_command"],
output_dir = config["results_dir"]+"/"+config["<step_name>__compare_vcfs_inDir_output_dir"]+"/{sample}",
nb_vcfs = len(<step_name>__compare_vcfs_inDir_inputs()["vcfs"]),
truth_vcf = config["<step_name>__compare_vcfs_inDir_truth"]
log:
config["results_dir"]+"/logs/" + config["<step_name>__compare_vcfs_inDir_output_dir"]+"/{sample} + "/compare_vcfs_log.txt"
run:
cp = ""
tmp_tocompare = list()
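# Copy each input VCF to /tmp, renamed after its parent directory (the variant caller's
# output dir), so VCFs coming from different callers keep distinct, caller-named file names.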
for vcf_file in input.vcfs:
dir = os.path.basename(os.path.dirname(vcf_file))
if os.path.splitext(vcf_file)[1] == ".gz":
suffix = ".vcf.gz"
else:
suffix = ".vcf"
shutil.copyfile(vcf_file,"/tmp/" + dir + suffix)
tmp_tocompare.append("/tmp/" + dir + suffix)
to_png = ""
if params.nb_vcfs < 4:
to_png += "pdftoppm -png venn" +str(params.nb_vcfs)+ ".positions.pdf > Venn_positions_mqc.png; "
to_png += "pdftoppm -png venn" +str(params.nb_vcfs)+ ".snps.pdf > Venn_snps_mqc.png; "
else:
to_png += "pdftoppm -png venn" +str(params.nb_vcfs)+ ".pdf > Venn_mqc.png; "
to_truth = ""
stats_truth = dict()
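# If a truth VCF was supplied, compare each caller's VCF against it separately, keep the
# per-caller stats files, and assemble them below into VCF_compare_to_truth_mqc.tsv for MultiQC.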
if params.truth_vcf:
for vcf in tmp_tocompare:
dir = os.path.basename(vcf.split('.')[0])
to_truth += "mkdir -p {params.output_dir}/" + dir + " && cd $_ &&"
to_truth += "vcftoolz compare --truth " + input.truth_vcf_file + " " + vcf + " > {params.output_dir}/" + dir + "/stats.txt; "
stats_truth[dir] = "{params.output_dir}/" + dir + "/stats.txt"
summary_bench_truth = "echo \"# id: compare_to_truth\" > {output.bench_to_truth}; "
summary_bench_truth += "echo \"# section_name: 'Compare VCFs to truth'\" >> {output.bench_to_truth}; "
#summary_bench_truth += "echo \"# description: ''\" > {output.bench_to_truth}; "
summary_bench_truth += "echo \"# format: 'tsv'\" >> {output.bench_to_truth}; "
summary_bench_truth += "echo \"# plot_type: 'table'\" >> {output.bench_to_truth}; "
summary_bench_truth += "echo -n 'Variant caller\tPrecision\tRecall\tF1 score\tTrue positive calls\tFalse negative calls\tFalse positive calls\n' >> {output.bench_to_truth}; "
for variant_caller in stats_truth.keys():
summary_bench_truth += "echo -n '" + variant_caller + "\t' >> {output.bench_to_truth}; "
summary_bench_truth += "grep 'Precision' " + stats_truth[variant_caller] + " | awk '{{printf $1\"\t\"}}' >> {output.bench_to_truth}; "
summary_bench_truth += "grep 'Recall' " + stats_truth[variant_caller] + " | awk '{{printf $1\"\t\"}}' >> {output.bench_to_truth}; "
summary_bench_truth += "grep 'F1 score' " + stats_truth[variant_caller] + " | awk '{{printf $1\"\t\"}}' >> {output.bench_to_truth}; "
summary_bench_truth += "grep 'True positive calls' " + stats_truth[variant_caller] + " | awk '{{printf $1\"\t\"}}' >> {output.bench_to_truth}; "
summary_bench_truth += "grep 'False negative calls' -m 1 " + stats_truth[variant_caller] + " | awk '{{printf $1\"\t\"}}' >> {output.bench_to_truth}; "
summary_bench_truth += "grep 'False positive calls' " + stats_truth[variant_caller] + " | awk '{{print $1}}' >> {output.bench_to_truth}; "
shell(
"cd {params.output_dir}; "
"{params.command} " +
" ".join(tmp_tocompare) +
"> {output.stats} " +
"2> >(tee {log} >&2); " +
to_png +
to_truth +
summary_bench_truth
)
{
id: compare_vcfs_inDir,
name: Compare VCFs in Dir,
description: "Compare several VCF files",
version: "1.2.0",
website: "https://github.com/CFSAN-Biostatistics/vcftoolz",
git: "https://github.com/CFSAN-Biostatistics/vcftoolz",
documentation: "https://vcftoolz.readthedocs.io/en/latest/",
article: "10.21105/joss.01144",
multiqc: "custom",
commands:
[
{
name: compare_vcfs_inDir,
command: vcftoolz compare,
category: "vcf_postprocess",
output_dir: compare_vcfs_inDir,
inputs: [{ name: vcfs, type: "vcf", list: true }],
outputs: [
{ name: "stats", type: "txt", file: "stats.txt", description: "Results of vcftoolz compare" }
],
options:
[
{
name: compare_vcfs_inDir_truth,
prefix: "",
type: checkbox,
value: FALSE,
label: "Compare VCF files to a truth VCF (case of a benchmark with a known truth)",
},
{
name: compare_vcfs_inDir_truth_file,
type: input_file,
value: "",
label: "Path to a truth VCF",
},
],
},
],
install: {
vcftoolz: [
"pip3 install vcftoolz==1.2.0",
"ENV PATH ~/.local/bin:$PATH"
],
poppler-utils: [ # pdftoppm (pdf to png)
"apt -y update && apt install -y poppler-utils"
]
},
citations: {
vcftoolz: [
"Davis, (2019). vcftoolz: a Python package for comparing and evaluating Variant Call Format files. Journal of Open Source Software, 4(35), 1144, https://doi.org/10.21105/joss.01144"
]
}
}
......@@ -16,7 +16,7 @@
command: bcftools isec,
category: "vcf_postprocess",
output_dir: compare_vcfs_isec,
inputs: [{ name: vcfs, type: "vcf", list: true }],
inputs: [{ name: vcfs, type: "vcf", list: true, description: "VCF files must be bgzipped and tabix-indexed" }],
outputs: [
{ name: sites, type: "txt" , file: "{sample}/sites.txt", description: "Results of bcftools isec" },
{ name: snps, type: "txt" , file: "{sample}/snps.txt", description: "Matrix of intersections" },
......
......@@ -62,6 +62,7 @@ data: [
{name: "bai", category: "mapping"},
{name: "sam", category: "mapping"},
{name: "vcf", category: "mapping"},
{name: "vcfsinDir", category: "mapping"},
{name: "paf.gz", category: "mapping"},
{name: "aligned_fasta", category: "alignment"},
......
......@@ -12,7 +12,10 @@ rule <step_name>__salmon_index:
config["<step_name>__bwa_mem_index_output_dir"]+"/rank.bin",
config["<step_name>__bwa_mem_index_output_dir"]+"/reflengths.bin",
config["<step_name>__bwa_mem_index_output_dir"]+"/refseq.bin",
config["<step_name>__bwa_mem_index_output_dir"]+"/seq.bin"
config["<step_name>__bwa_mem_index_output_dir"]+"/seq.bin",
config["<step_name>__bwa_mem_index_output_dir"]+"/versionInfo.json",
config["<step_name>__bwa_mem_index_output_dir"]+"/duplicate_clusters.tsv",
config["<step_name>__bwa_mem_index_output_dir"]+"/info.json"
)
log:
config["results_dir"]+'/logs/' + config["<step_name>__salmon_index_output_dir"] + '/salmon_index_log.txt'
......