Commit 4cd867b0 authored by mmassaviol's avatar mmassaviol
Browse files

Add workflow Long_Read_Assembly (in tests)

parent 051ab5e0
import os
import re
import snakemake.utils
import csv
#############
# Wildcards #
#############
SAMPLES = config["samples"]
STEPS = config["steps"]
PREPARE_REPORT_OUTPUTS = config["prepare_report_outputs"]
PREPARE_REPORT_SCRIPTS = config["prepare_report_scripts"]
OUTPUTS = config["outputs"]
PARAMS_INFO = config["params_info"]
config = config["params"]
##########
# Inputs #
##########
# Generic input functions
## get raw_reads
def raw_reads():
inputs = dict()
inputs["read"] = expand(config['sample_dir']+'/{sample}'+config["sample_suffix"],sample=SAMPLES)
return inputs
# Tools inputs functions
def minimap2_overlap_self_inputs():
inputs = dict()
inputs = raw_reads()
return inputs
def miniasm_inputs():
inputs = dict()
inputs["reads"] = raw_reads()["read"]
inputs["paf"] = rules.minimap2_overlap_self.output.reads_overlaps
return inputs
def minimap2_reference_inputs():
inputs = dict()
inputs["reads"] = raw_reads()["read"]
inputs["fasta"] = rules.miniasm.output.assembly_fasta
return inputs
def racon_inputs():
inputs = dict()
inputs["reads"] = raw_reads()["read"]
inputs["assembly"] = rules.miniasm.output.assembly_fasta
inputs["overlaps"] = rules.minimap2_reference.output.reads_mapping
return inputs
def medaka_inputs():
inputs = dict()
inputs["reads"] = raw_reads()["read"]
inputs["assembly_fasta"] = rules.racon.output.assembly_corrected
return inputs
def prepare_report_inputs():
inputs = list()
for step in STEPS:
inputs.extend(step_outputs(step["name"]))
return inputs
def prepare_report_scripts():
scripts = list()
for step in STEPS:
tool = config[step["name"]]
script = tool+".prepare.report.R"
if (script in PREPARE_REPORT_SCRIPTS):
scripts.append("/workflow/scripts/"+script)
return scripts
def prepare_report_outputs():
outputs = list()
outputs.append(config["results_dir"] + "/outputs_mqc.csv")
for step in STEPS:
tool = config[step["name"]]
if (tool in PREPARE_REPORT_OUTPUTS.keys()):
for output in PREPARE_REPORT_OUTPUTS[tool]:
outputs.append(config["results_dir"]+"/"+tool+"/"+output)
return outputs
def multiqc_inputs():
# Need prepare_report inputs and outputs in case prepare_reports has no outputs
return prepare_report_outputs()
###########
# Outputs #
###########
def step_outputs(step):
outputs = list()
if (step == "find_overlaps"):
outputs = rules.minimap2_overlap_self.output
if (step == "assembly"):
outputs = rules.miniasm.output
if (step == "mapping"):
outputs = rules.minimap2_reference.output
if (step == "correction"):
outputs = rules.racon.output
if (step == "polishing"):
outputs = rules.medaka.output
if (step == "all"):
outputs = list(rules.multiqc.output)
return outputs
# get outputs for each choosen tools
def workflow_outputs(step):
outputs = list()
outputs.extend(step_outputs(step))
return outputs
#########
# Rules #
#########
rule minimap2_overlap_self:
input:
**minimap2_overlap_self_inputs(),
output:
reads_overlaps = config["results_dir"] + "/" + config["minimap2_overlap_self_output_dir"] + "/reads.paf.gz",
params:
output_dir = config["results_dir"] + "/" + config["minimap2_overlap_self_output_dir"]+ "/",
pacbio_oxfordNanopore = config["minimap2_overlap_self_pacbio_oxfordNanopore"],
log:
config["results_dir"] + "/logs/minimap2_overlap_self/minimap2_overlap_self_log.txt"
threads:
config["minimap2_overlap_self_threads"]
shell:
"minimap2 -x ava-{params.pacbio_oxfordNanopore} "
"-t {threads} "
"{input.read} "
"{input.read} "
"| gzip -1 > {output.reads_overlaps} "
"|& tee {log}"
rule miniasm:
input:
**miniasm_inputs(),
output:
assembly_gfa = config["results_dir"] + "/" + config["miniasm_output_dir"] + "/reads.gfa",
assembly_fasta = config["results_dir"] + "/" + config["miniasm_output_dir"] + "/reads.fasta",
params:
output_dir = config["results_dir"] + "/" + config["miniasm_output_dir"]+ "/",
log:
config["results_dir"] + "/logs/miniasm/miniasm_log.txt"
shell:
"miniasm "
"-f {input.reads} "
"{input.paf} "
"> {output.assembly_gfa} "
"|& tee {log} "
"&& awk '$1 ~/S/ {{print \">\"$2\"\\n\"$3}}' {output.assembly_gfa} > {output.assembly_fasta} "
rule minimap2_reference:
input:
**minimap2_reference_inputs(),
output:
reads_mapping = config["results_dir"] + "/" + config["minimap2_reference_output_dir"] + "/reads.paf.gz",
params:
output_dir = config["results_dir"] + "/" + config["minimap2_reference_output_dir"]+ "/",
pacbio_oxfordNanopore = config["minimap2_overlap_self_pacbio_oxfordNanopore"],
log:
config["results_dir"] + "/logs/minimap2_reference/minimap2_reference_log.txt"
threads:
config["minimap2_reference_threads"]
shell:
"minimap2 -x map-{params.pacbio_oxfordNanopore} "
"-t {threads} "
"{input.fasta} "
"{input.reads} "
"| gzip -1 > {output.reads_mapping} "
"|& tee {log}"
rule racon:
input:
**racon_inputs(),
output:
assembly_corrected = config["results_dir"] + "/" + config["racon_output_dir"] + "/assembly_corrected.fasta",
params:
output_dir = config["results_dir"] + "/" + config["racon_output_dir"]+ "/",
log:
config["results_dir"] + "/logs/racon/racon_log.txt"
threads:
config["racon_threads"]
shell:
"racon "
"-t {threads} "
"{input.reads} "
"{input.overlaps} "
"{input.assembly} "
"> {output.assembly_corrected} "
"|& tee {log}"
rule medaka:
input:
**medaka_inputs(),
output:
consensus_assembly = config["results_dir"] + "/" + config["medaka_output_dir"] + "/consensus_assembly.fasta",
params:
output_dir = config["results_dir"] + "/" + config["medaka_output_dir"]+ "/",
model = config["medaka_model"],
log:
config["results_dir"] + "/logs/medaka/medaka_log.txt"
threads:
config["medaka_threads"]
shell:
"medaka_consensus "
"-i {input.reads} "
"-d {input.assembly_fasta} "
"-o {params.output_dir} "
"-t {threads} "
"-m {params.model} "
"|& tee {log}"
rule prepare_report:
input:
*prepare_report_inputs(),
output:
*prepare_report_outputs(),
config_multiqc = config["results_dir"] + "/config_multiqc.yaml",
params_tab = config["results_dir"] + "/params_tab_mqc.csv"
params:
params_file = config["results_dir"]+"/params.yml",
results_dir = config["results_dir"]
log:
config["results_dir"]+"/logs/prepare_report_log.txt"
run:
# Specific scripts for each tool
for script in prepare_report_scripts():
shell("Rscript "+script+" {params.params_file} |& tee {log}")
# Outputs files for Multiqc report
outfile = config["results_dir"] + "/outputs_mqc.csv"
head = """
# description: 'This is the list of the files generated by each step of the workflow'
# section_name: 'Workflow outputs'
"""
with open(outfile,"w") as out:
out.write(head)
out.write("step\ttool\tfile\tdescription\n")#\tname
for step in STEPS:
tool = config[step["name"]]
i=1
for command in OUTPUTS[tool]:
if ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
outputs = OUTPUTS[tool][command]
for files in outputs:
path = config[command+"_output_dir"] + "/" + files["file"] #config["results_dir"] +"/"+
out.write(str(i)+"-"+step["title"]+"\t"+tool+"\t"+path+"\t"+files["description"]+"\n")#"\t"+files["name"]+
i+=1
# Params list for Multiqc report
params_list = "params_name\tdescription\tvalue\n"
head = """# description: 'This is the list of the parameters for each rule'
# section_name: 'Workflow parameters'
"""
for step in STEPS:
tool = config[step["name"]]
for key, value in config.items():
if (tool in key and tool != "null") or (key in ["results_dir","sample_dir","sample_suffix","SeOrPe"]) and ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
if (key in PARAMS_INFO.keys() and "label" in PARAMS_INFO[key].keys()):
description = PARAMS_INFO[key]["label"]
else:
description = ''
params_list += key + "\t'" + description + "'\t'" + str(value) + "'\n"
with open(output.params_tab,"w") as out:
out.write(head)
out.write(params_list)
# Config for Multiqc report
shell("python3 /workflow/generate_multiqc_config.py {params.params_file} {output.config_multiqc}")
rule multiqc:
input:
multiqc_inputs(),
config_multiqc = config["results_dir"] + "/config_multiqc.yaml"
output:
multiqc_dir = directory(config["results_dir"]+"/multiqc_data")
params:
output_dir = config["results_dir"]
log:
config["results_dir"]+'/logs/multiqc/multiqc_log.txt'
shell:
"multiqc --config {input.config_multiqc} -f {params.output_dir} "
"-o {params.output_dir} |& tee {log}"
# Final Snakemake rule waiting for outputs of the final step choosen by user (default all steps)
rule all:
input:
workflow_outputs("all")
output:
Snakefile = config["results_dir"]+"/workflow/Snakefile",
get_samples = config["results_dir"]+"/workflow/get_samples.py",
scripts = directory(config["results_dir"]+"/workflow/scripts"),
params = config["results_dir"]+"/workflow/params.yml"
params:
params_file = config["results_dir"]+"/params.yml",
shell:
"cp /workflow/Snakefile {output.Snakefile} && "
"cp /workflow/get_samples.py {output.get_samples} && "
"cp -r /workflow/scripts {output.scripts} && "
"cp {params.params_file} {output.params}"
onsuccess:
print("Workflow finished, no error")
shell("touch "+config["results_dir"]+"/logs/workflow_end.ok")
onerror:
print("An error occurred")
shell("cat {log} > "+config["results_dir"]+"/logs/workflow_end.error")
#shell("mail -s "an error occurred" youremail@provider.com < {log}")
import re
import sys
from tools import *
config = read_yaml(sys.argv[1])
def report_section_order():
res = "skip_generalstats: true\n\n"
res += "report_section_order:\n"
res += " Rule_graph:\n"
res += " order: 990\n"
res += " params_tab:\n"
res += " order: 980\n"
res += " outputs:\n"
res += " order: 970\n"
cpt = 960
for step in config["steps"]:
tool = config["params"][step["name"]]
if (config["multiqc"][tool] != "custom"):
res += " " + config["multiqc"][tool] + ":\n"
res += " " + "order: " + str(cpt) + "\n"
cpt += -10
for rule in config["outputs"][tool]:
if ((config["params"]["SeOrPe"] == "SE" and not("_PE" in rule)) or (config["params"]["SeOrPe"] == "PE" and not("_SE" in rule))):
for output in config["outputs"][tool][rule]:
if("mqc" in output["file"] and '{' not in output["file"]): # case of dynamic files ({wildcard}_mqc.png) to deal with
section = re.sub('\_mqc.*$', '', output["file"])
res += " " + section + ":\n"
res += " " + "order: " + str(cpt) + "\n"
cpt += -10
return res
def main():
res = ""
res += report_section_order()
with open(sys.argv[2],"w") as out:
out.write(res)
if __name__ == "__main__":
# execute only if run as a script
main()
\ No newline at end of file
#!/usr/bin/env python3
# This script will take a directory and a parameter to tell if the reads are paired end or single end and return the sample list and the suffix
# Needs 2 arguments: reads_directory, SeOrPe
# SeOrPe is SE for single end reads and PE for paired end reads
# Usage: ./get_samples.py reads_directory SeOrPe
import os
import re
import csv
import sys
def sample_list(dir, SeOrPe):
samples = list()
suffixes = list()
files = os.listdir(dir)
if SeOrPe == "PE":
regex = re.compile(r"^(.+?)(_R1|_R2)(.+)")
else:
regex = re.compile(r"^(.+?)(\..*)")
for file in files:
res = re.match(regex, file)
if res:
if res.group(1) not in samples:
samples.append(res.group(1))
if SeOrPe == "PE":
suffixes.append(res.group(3))
else:
suffixes.append(res.group(2))
if (len(set(suffixes)) == 1 ):
return {'samples': sorted(samples), 'suffix': list(set(suffixes))[0]}
else:
exit("Files have different suffixes:" + ','.join(suffixes))
def main():
if len(sys.argv) == 3:
print(sample_list(sys.argv[1],sys.argv[2]))
else:
exit("""Needs 2 arguments: reads_directory, SeOrPe
Usage: ./get_samples.py reads_directory SeOrPe""")
if __name__ == "__main__":
# execute only if run as a script
main()
pipeline: Long_Read_Assembly
params:
results_dir: /Results
sample_dir: /Data
SeOrPe: SE
find_overlaps: minimap2_overlap_self
minimap2_overlap_self_output_dir: minimap2_overlap_self
minimap2_overlap_self_threads: 4
minimap2_overlap_self_pacbio_oxfordNanopore: bwtsw
assembly: miniasm
miniasm_output_dir: miniasm
mapping: minimap2_reference
minimap2_reference_output_dir: minimap2_reference
minimap2_reference_threads: 4
correction: racon
racon_output_dir: racon
racon_threads: 4
polishing: medaka
medaka_output_dir: medaka
medaka_threads: 4
medaka_model: r941_min_high
samples: []
groups: []
steps:
- title: Find overlaps
name: find_overlaps
tools:
- minimap2_overlap_self
default: minimap2_overlap_self
- title: Assembly
name: assembly
tools:
- miniasm
default: miniasm
- title: Mapping
name: mapping
tools:
- minimap2_reference
default: minimap2_reference
- title: Correction
name: correction
tools:
- racon
default: racon
- title: Polishing
name: polishing
tools:
- medaka
default: medaka
params_info:
results_dir:
type: output_dir
sample_dir:
type: input_dir
SeOrPe:
type: radio
minimap2_overlap_self_threads:
tool: minimap2_overlap_self
rule: minimap2_overlap_self
type: numeric
label: Number of threads to use
minimap2_overlap_self_pacbio_oxfordNanopore:
tool: minimap2_overlap_self
rule: minimap2_overlap_self
type: radio
label: Sequencing used to produce reads
minimap2_reference_threads:
tool: minimap2_reference
rule: minimap2_reference
type: numeric
label: Number of threads to use
racon_threads:
tool: racon
rule: racon
type: numeric
label: Number of threads to use
medaka_threads:
tool: medaka
rule: medaka
type: numeric
label: Number of threads to use
medaka_model:
tool: medaka
rule: medaka
type: select
label: Medaka model
prepare_report_scripts: []
prepare_report_outputs: {}
outputs:
minimap2_overlap_self:
minimap2_overlap_self:
- name: reads_overlaps
file: reads.paf.gz
description: Reads overlaps
miniasm:
miniasm:
- name: assembly_gfa
file: reads.gfa
description: Assembly graph file
- name: assembly_fasta
file: reads.fasta
description: Assembly fasta file
minimap2_reference:
minimap2_reference:
- name: reads_mapping
file: reads.paf.gz
description: Reads mapping
racon:
racon:
- name: assembly_corrected
file: assembly_corrected.fasta
description: Assembly corrected
medaka:
medaka:
- name: consensus_assembly
file: consensus_assembly.fasta
description: Consensus assembly file
multiqc:
minimap2_overlap_self: custom
miniasm: custom
minimap2_reference: custom
racon: custom
medaka: custom
Bootstrap: localimage
From: ../base.sif
%environment
export PATH=/opt/biotools/bin:$PATH
export ROOTSYS=/opt/biotools/root
export LD_LIBRARY_PATH='$LD_LIBRARY_PATH:$ROOTSYS/lib'
%labels
Author YourName
Version v0.0.1
build_date 2018 déc. 07
%runscript
echo "This container contains two apps (UI and Snakemake)."
echo "UI is a user interface to set up the workflow and launch it."
echo "Snakemake let you provide your configfile and other parameters to the snakemake command and launch it."
echo "To get help for an app :\nsingularity help --app appName this_container.sif"
echo "To run an app :\nsingularity run --app appName this_container.sif"
%apprun UI
exec Rscript -e "shiny::runApp('/sagApp/app.R',host='$1',port=$2)"
%apphelp UI
To run the UI app you should bind data and results directories like in the following example.
You must also provide the host address and port where the shiny app will be launched
exemple : singularity run --app UI -B /path/to/data/directory:/Data -B /path/to/store/Results:/Results this_container.sif 127.0.0.1 1234
%apprun Snakemake
configfile=$1
cores=$2
shift
shift
exec snakemake -s /workflow/Snakefile all --configfile $configfile --cores $cores $@
%apphelp Snakemake
To run the Snakemake app you should bind data and results directories like in the following example.
You must also provide the configfile and the number of cores provided to snakemake command (you can add other parameters after these two)
exemple : singularity run --app Snakemake -B /path/to/data/directory:/Data -B /path/to/store/Results:/Results this_container.sif myconfig.yml 16 otherparams
%apprun getConfigfile
exec cp /workflow/params.total.yml ./params.yml
%apphelp getConfigfile
To run the getConfigfile app you dont need to bind directories. This app will only copy the default parameters file from the container to your local disk.
exemple : singularity run --app getConfigfile this_container.sif
%apprun getSamples
exec python3 /workflow/get_samples.py $1 $2
%apphelp getSamples
To run the getSamples app you need to bind the data directory. This app will give you the list of samples detected in a given directory and their file suffix.
exemple : singularity run --app getSamples -B /path/to/data/directory:/Data this_container.sif /Data PE
%help