Commit 219797f5 authored by mmassaviol's avatar mmassaviol
Browse files

Update workflow (add filtering and stats to lastz and outputs for mqc report)

parent c7aa5233
......@@ -40,6 +40,15 @@ RUN cd /opt/biotools \
&& cd ../.. \
&& rm -r lastz-distrib-1.04.03 lastz-1.04.03.tar.gz
RUN cd /opt/biotools \
&& wget https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2 \
&& tar -xvjf samtools-1.9.tar.bz2 \
&& cd samtools-1.9 \
&& ./configure && make \
&& cd .. \
&& mv samtools-1.9/samtools bin/samtools \
&& rm -r samtools-1.9 samtools-1.9.tar.bz2
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
......
from tools import *
from raw_reads import raw_reads
# File generate with generate_workflow_snakefile.py
workdir: config['params']['results_dir']
import os
import re
import snakemake.utils
......@@ -267,7 +266,7 @@ rule flash:
extendedFrags = config["results_dir"] + "/" + config["flash_output_dir"] + "/{sample}.extendedFrags.fastq",
notCombined_1 = config["results_dir"] + "/" + config["flash_output_dir"] + "/{sample}.notCombined_1.fastq",
notCombined_2 = config["results_dir"] + "/" + config["flash_output_dir"] + "/{sample}.notCombined_2.fastq",
hist = config["results_dir"] + "/" + config["flash_output_dir"] + "/{sample}.hist",
hist = config["results_dir"] + "/" + config["flash_output_dir"] + "/{sample}.flash.hist",
histogram = config["results_dir"] + "/" + config["flash_output_dir"] + "/{sample}.histogram",
params:
output_dir = config["results_dir"] + "/" + config["flash_output_dir"],
......@@ -275,6 +274,7 @@ rule flash:
min_overlap = config["flash_min_overlap"],
max_overlap = config["flash_max_overlap"],
max_mismatch_density = config["flash_max_mismatch_density"],
tmp_hist = config["results_dir"] + "/" + config["flash_output_dir"] + "/{sample}.hist", # output from flash to be renamed for multiqc detection
log:
config["results_dir"]+"/logs/flash/{sample}_flash_log.txt"
threads: config["flash_threads"]
......@@ -288,7 +288,8 @@ rule flash:
"-t {threads} "
"{input.read} "
"{input.read2} "
"|& tee {log}"
"|& tee {log}; "
"mv {params.tmp_hist} {output.hist}" # with flash in hist name it is detected by multiqc
......@@ -297,7 +298,7 @@ rule demultiplexing_astrid_cruaud:
**demultiplexing_astrid_cruaud_inputs()
output:
demultiplexed = expand(config["results_dir"] + "/" + config["demultiplexing_astrid_cruaud_output_dir"] + "/{sample}_fq", sample=INDIVIDUS),
stats = config["results_dir"] + "/" + config["demultiplexing_astrid_cruaud_output_dir"] + "/stat_demultiplexing.txt",
stats = config["results_dir"] + "/" + config["demultiplexing_astrid_cruaud_output_dir"] + "/stat_demultiplexing_mqc.csv",
params:
output_dir = config["results_dir"] + "/" + config["demultiplexing_astrid_cruaud_output_dir"],
barcodes = config["demultiplexing_astrid_cruaud_barcodes"]
......@@ -310,7 +311,8 @@ rule demultiplexing_astrid_cruaud:
# tail to remove header (sample barcode_P1 barcode_P2 barcode_P2_rev term)
"tail -n +2 {params.barcodes} | parallel --jobs {threads} --colsep '\\t' \"grep -B 1 --no-group-separator -i '^{{2}}.*{{4}}$' {input.extended_frags} > {{1}}'_fq'\" |& tee {log} ; " # test if --no-group-separator works with the next parts of the workflow
# nb reads/sample :
"for i in $(ls *_fq) ; do echo -e \"$i\" '\\t' $(grep -ic '@' $i) ; done > {output.stats} 2>> {log}; " # count with grep @ because no risk of @ in quality sequence as it is no more present in the file
"echo -e \"# description: 'Number of reads for each individu after demultiplexing'\\n# section_name: 'Demultiplexing stats'\\n# plot_type: 'table'\\n# pconfig:\\n#\\tformat: '{{:,.0f}}'\\nIndividu\\tNumber of reads\" > {output.stats}; "
"for i in $(ls *_fq); do echo -e $i'\\t'$(grep -ic '@' $i); done >> {output.stats} 2>> {log}; " # count with grep @ because no risk of @ in quality sequence as it is no more present in the file
# remove barcodes from beginning / end of reads
"tail -n +2 {params.barcodes} | parallel --jobs {threads} --colsep '\\t' \"sed -i 's/^{{2}}//g;s/{{4}}$//g' {{1}}'_fq'\" |& tee -a {log}"
......@@ -350,6 +352,8 @@ rule lastz:
reference_fasta = config["lastz_reference_fasta"],
output:
align_on_ref = config["results_dir"] + "/" + config["lastz_output_dir"] + "/{sample}.vs_probes_default.sam",
filtered = config["results_dir"] + "/" + config["lastz_output_dir"] + "/filtered/{sample}.vs_probes_default.filtered.sam",
stats = config["results_dir"] + "/" + config["lastz_output_dir"] + "/stats/{sample}_lastz_sam_stats.tsv",
params:
output_dir = config["results_dir"] + "/" + config["lastz_output_dir"]+ "/",
command = config["lastz_command"]
......@@ -363,11 +367,35 @@ rule lastz:
"--format=softsam "
"--ambiguous=iupac "
"--output='{output.align_on_ref}' "
"|& tee {log}"
"|& tee {log}; "
# stats
"NB_ENTRIES=$(samtools view {output.align_on_ref} | wc -l); "
"if [ $NB_ENTRIES -gt 0 ]; then "
"NB_LOCUS=$(samtools view {output.align_on_ref} | cut -f 3 | sort -u | wc -l); "
"LIST_MULTI_LOCUS_MAPPED=$(samtools view {output.align_on_ref} | cut -f 1,3 | uniq | awk 'BEGIN {{ FS=OFS=SUBSEP=\"\\t\"; RS=\"\\n\"}}{{arr[$2]+=1 }}END {{for (i in arr) if (arr[i] > 1) print i}}' -); "
"NB_MULTI_LOCUS_MAPPED=$(echo $LIST_MULTI_LOCUS_MAPPED | wc -w); "
"LIST_MULTI_CLUSTER_MAPPED=$(samtools view {output.align_on_ref} | cut -f 1,3 | uniq | awk 'BEGIN {{ FS=OFS=SUBSEP=\"\\t\"; RS=\"\\n\"}}{{arr[$1]+=1 }}END {{for (i in arr) if (arr[i] > 1) print i}}' -); "
"NB_MULTI_CLUSTER_MAPPED=$(echo $LIST_MULTI_CLUSTER_MAPPED | wc -w); "
"printf '%s\\n' \"${{LIST_MULTI_CLUSTER_MAPPED[@]}}\" > /tmp/{wildcards.sample}_lastz.tmp; "
"printf '%s\\n' \"${{LIST_MULTI_LOCUS_MAPPED[@]}}\" >> /tmp/{wildcards.sample}_lastz.tmp; "
"sed -i '/^$/d' /tmp/{wildcards.sample}_lastz.tmp; "
# filter locus and clusters multimapped
"samtools view -h {output.align_on_ref} | grep -v -f /tmp/{wildcards.sample}_lastz.tmp - > {output.filtered}; "
"else "
"NB_LOCUS=0;"
"NB_MULTI_LOCUS_MAPPED=0;"
"NB_MULTI_CLUSTER_MAPPED=0;"
"touch {output.filtered}; "
"fi;\n"
"NB_ENTRIES_AFTER_FILTER=$(samtools view {output.filtered} | wc -l); "
"echo -e NB_ENTRIES'\\t'NB_LOCUS'\\t'NB_MULTI_LOCUS_MAPPED'\\t'NB_MULTI_CLUSTER_MAPPED'\\t'NB_ENTRIES_AFTER_FILTER'\\n'$NB_ENTRIES'\\t'$NB_LOCUS'\\t'$NB_MULTI_LOCUS_MAPPED'\\t'$NB_MULTI_CLUSTER_MAPPED'\\t'$NB_ENTRIES_AFTER_FILTER > {output.stats}; "
......@@ -393,7 +421,7 @@ rule prepare_report:
if (os.path.splitext(script)[1] in [".R",".r"]):
shell("Rscript "+script+" {params.params_file} |& tee {log}")
elif (os.path.splitext(script)[1] in [".py"]):
shell("python "+script+" {params.params_file} |& tee {log}")
shell("python3 "+script+" {params.params_file} |& tee {log}")
elif (os.path.splitext(script)[1] in [".sh"]):
shell("/bin/bash "+script+" {params.params_file} |& tee {log}")
# Outputs files for Multiqc report
......
......@@ -41,3 +41,8 @@ lastz:
lastz:
- 'Harris, R. S. (2007). Improved pairwise alignment of genomic DNA. Ph.D. Thesis,
  The Pennsylvania State University'
samtools:
- "Heng Li, Bob Handsaker, Alec Wysoker, Tim Fennell, Jue Ruan, Nils Homer, Gabor\
\ Marth, Goncalo Abecasis, Richard Durbin, 1000 Genome Project Data Processing\
\ Subgroup, The Sequence Alignment/Map format and SAMtools, Bioinformatics, Volume\
\ 25, Issue 16, 15 August 2009, Pages 2078\u20132079, https://doi.org/10.1093/bioinformatics/btp352"
......@@ -4,6 +4,23 @@ from tools import read_yaml
config = read_yaml(sys.argv[1])
def files_or_dirs_to_ignore():
    """Return a YAML fragment telling MultiQC which files/dirs to skip.

    Emits the ``fn_ignore_files`` section (large sequence/alignment data
    files that MultiQC should not attempt to parse) followed by the
    ``fn_ignore_dirs`` section (the workflow directory itself).
    """
    file_patterns = [
        "*.fa", "*.fasta", "*.fa.gz", "*.fasta.gz",
        "*.fq", "*.fastq", "*.fq.gz", "*.fastq.gz",
        "*.sam", "*.bam", "*.gtf", "*.gtf.gz", "*.vcf", "*.vcf.gz",
    ]
    # NOTE: renamed from `dir` to avoid shadowing the builtin.
    dir_patterns = ["workflow"]
    res = "fn_ignore_files:\n"
    res += "".join(" - '" + pattern + "'\n" for pattern in file_patterns)
    res += "\n"
    res += "fn_ignore_dirs:\n"
    res += "".join(" - '" + pattern + "'\n" for pattern in dir_patterns)
    res += "\n"
    return res
def report_section_order():
res = "skip_generalstats: true\n\n"
res += "report_section_order:\n"
......@@ -38,10 +55,13 @@ def report_section_order():
def main():
    """Assemble the MultiQC configuration and write it to sys.argv[2].

    Concatenates the section-ordering block, the file/dir ignore block,
    and the list of report sections to remove, then writes the result.
    """
    pieces = [
        report_section_order(),
        files_or_dirs_to_ignore(),
        "\nremove_sections:\n",
        " - 'flash-bargraph'\n",
    ]
    with open(sys.argv[2], "w") as out:
        out.write("".join(pieces))
if __name__ == "__main__":
# execute only if run as a script
main()
\ No newline at end of file
main()
......@@ -153,11 +153,11 @@ params_info:
tool: velvet
rule: velvet
type: text
label: "hash_length\t: EITHER an odd integer (if even, it will be decremented)\
\ <= 31 (if above, will be reduced)\\nOR: m,M,s where m and M are odd integers\
\ (if not, they will be decremented) with m < M <= 31 (if above, will be reduced)\
\ and s is a step (even number). Velvet will then hash from k=m to k=M with\
\ a step of s"
label: 'hash_length : EITHER an odd integer (if even, it will be decremented)
<= 31 (if above, will be reduced) OR: m,M,s where m and M are odd integers (if
not, they will be decremented) with m < M <= 31 (if above, will be reduced)
and s is a step (even number). Velvet will then hash from k=m to k=M with a
step of s'
velvet_min_contig_lgth:
tool: velvet
rule: velvet
......@@ -173,8 +173,14 @@ params_info:
rule: lastz
type: input_file
label: Path to reference fasta file
prepare_report_scripts: []
prepare_report_outputs: {}
prepare_report_scripts:
- flash.prepare.report.py
- lastz.prepare.report.py
prepare_report_outputs:
flash:
- flash_combinations_mqc.csv
lastz:
- lastz_stats_mqc.csv
outputs:
fastqc:
fastqc_SE:
......
......@@ -3,7 +3,7 @@ tabassembling = fluidPage(
box(title = "Parameters :", width = 12, status = "primary", collapsible = TRUE, solidHeader = TRUE,
hidden(textInput("selectassembling", label = "", value="velvet")),box(title = "Velvet", width = 12, status = "success", collapsible = TRUE, solidHeader = TRUE,
textInput("velvet_hash_length", label = "hash_length : EITHER an odd integer (if even, it will be decremented) <= 31 (if above, will be reduced)\nOR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= 31 (if above, will be reduced) and s is a step (even number). Velvet will then hash from k=m to k=M with a step of s", value = "31", width = "auto"),
textInput("velvet_hash_length", label = "hash_length : EITHER an odd integer (if even, it will be decremented) <= 31 (if above, will be reduced) OR: m,M,s where m and M are odd integers (if not, they will be decremented) with m < M <= 31 (if above, will be reduced) and s is a step (even number). Velvet will then hash from k=m to k=M with a step of s", value = "31", width = "auto"),
numericInput("velvet_min_contig_lgth", label = "-min_contig_lgth : minimum contig length exported to contigs.fa file (default: hash length * 2)", min = 0, max = NA, step = 1, width = "auto", value = NA),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment