Commit 35bf0bcd authored by mmassaviol's avatar mmassaviol
Browse files

Add RNAseq workflow and update doc

parent f7dc5af5
......@@ -23,6 +23,7 @@ Liste des workflows existants:
- RADseq_ref (stacks)
- RADseq_denovo (stacks)
- Virus_Assembler_Megahit (Megahit + Blast + BWA)
- RNAseq (pseudoalign (kallisto, Salmon) + expression différentielle (edgeR, DESeq2))
## 1 Construire un conteneur
......
import csv
import os
import re
import sys

import snakemake.utils
dir_path = "/workflow"
SAMPLES = config["samples"]
STEPS = config["steps"]
PREPARE_REPORT_OUTPUTS = config["prepare_report_outputs"]
PREPARE_REPORT_SCRIPTS = config["prepare_report_scripts"]
OUTPUTS = config["outputs"]
config = config["params"]
##########
# Inputs #
##########
# Generic input functions
## get raw_reads
def raw_reads():
    """Return wildcard-ready raw read paths for one sample.

    Returns {"read": ...} for SE data or {"read": ..., "read2": ...} for PE,
    with the '{sample}' wildcard left in place for Snakemake to expand.
    Exits the workflow when SeOrPe is misconfigured.
    """
    inputs = dict()
    if (config["SeOrPe"] == "PE"):
        # No trailing commas here: they previously turned each path into a
        # 1-tuple instead of a plain string.
        inputs["read"] = config['sample_dir']+'/{sample}_R1'+config["sample_suffix"]
        inputs["read2"] = config['sample_dir']+'/{sample}_R2'+config["sample_suffix"]
    elif (config["SeOrPe"] == "SE"):
        inputs["read"] = config['sample_dir']+'/{sample}'+config["sample_suffix"]
    else:
        sys.exit("SeOrPe should be SE or PE")
    return inputs
## get reads (trimmed or raw)
def reads():
    """Inputs for downstream tools: trimmed reads when trimming is enabled,
    otherwise the raw reads."""
    trimmer = config["trimming"]
    if trimmer == "null":
        return raw_reads()
    result = dict()
    if trimmer == "trimmomatic":
        if config["SeOrPe"] == "SE":
            result["read"] = rules.trimmomatic_SE.output.read
        elif config["SeOrPe"] == "PE":
            result = {
                "read": rules.trimmomatic_PE.output.readFP,
                "read2": rules.trimmomatic_PE.output.readRP,
            }
    return result
def get_groups():
    """Read the tab-separated group file into a {sample: group} dict."""
    groups = dict()
    with open(config["group_file"], mode="r") as infile:
        for row in csv.reader(infile, delimiter='\t'):
            groups[row[0]] = row[1]
    return groups
# Cache the sample->group mapping once at parse time for the DE scripts.
config["groups"] = get_groups()
# Tools inputs functions
# Per-tool input resolvers.
def fastqc_inputs():
    # FastQC always inspects the raw (untrimmed) reads.
    return raw_reads()
def trimmomatic_inputs():
    # Trimmomatic consumes the raw reads.
    return raw_reads()
def kallisto_inputs():
    # Quantifiers take trimmed reads when trimming is enabled, raw otherwise.
    return reads()
def salmon_inputs():
    # Quantifiers take trimmed reads when trimming is enabled, raw otherwise.
    return reads()
def edger_inputs():
    """Collect the per-sample count files produced by the selected quantifier."""
    quant = config["quantification"]
    paired = (config["SeOrPe"] == "PE")
    inputs = dict()
    if quant == 'kallisto':
        source = rules.kallisto_quant_PE.output.counts if paired else rules.kallisto_quant_SE.output.counts
        inputs["counts"] = expand(source, sample=SAMPLES)
    elif quant == 'salmon':
        source = rules.salmon_quant_PE.output.counts if paired else rules.salmon_quant_SE.output.counts
        inputs["counts"] = expand(source, sample=SAMPLES)
    return inputs
def deseq2_inputs():
    # DESeq2 consumes the same per-sample count files as edgeR.
    return edger_inputs()
def prepare_report_inputs():
    """All step outputs, in step order, up to and including the
    user-selected final step."""
    collected = list()
    for step in STEPS:
        collected += step_outputs(step["name"])
        if step["name"] == config["final_step"]:
            break
    return collected
def prepare_report_scripts():
    """R scripts to run before MultiQC, one per selected tool, up to the
    user-selected final step."""
    scripts = list()
    for step in STEPS:
        tool = config[step["name"]]
        script = tool+".prepare.report.R"
        if (script in PREPARE_REPORT_SCRIPTS):
            scripts.append("/workflow/scripts/"+script)
        # final_step holds a *step* name (cf. prepare_report_inputs and
        # workflow_outputs), so break on step["name"], not on the tool name.
        if (step["name"] == config["final_step"]):
            break
    return scripts
def prepare_report_outputs():
    """Files the report-preparation step must produce, up to the final step.

    Always includes the global outputs_mqc.csv summary, plus each selected
    tool's declared MultiQC artifacts.
    """
    outputs = list()
    outputs.append(config["results_dir"] + "/outputs_mqc.csv")
    for step in STEPS:
        tool = config[step["name"]]
        if (tool in PREPARE_REPORT_OUTPUTS.keys()):
            for output in PREPARE_REPORT_OUTPUTS[tool]:
                outputs.append(config["results_dir"]+"/"+tool+"/"+output)
        # final_step holds a *step* name (cf. prepare_report_inputs and
        # workflow_outputs), so break on step["name"], not on the tool name.
        if (step["name"] == config["final_step"]):
            break
    return outputs
def multiqc_inputs():
    # Need prepare_report inputs and outputs in case prepare_reports has no outputs
    # (MultiQC then aggregates whatever the report-preparation step declared).
    return prepare_report_outputs()
###########
# Outputs #
###########
def step_outputs(step):
    """Outputs of the tool selected (via config) for the given step name.

    Unknown steps or unselected tools yield an empty list; "all" yields the
    final MultiQC output.
    """
    paired = (config["SeOrPe"] == "PE")
    if step == "trimming" and config[step] == "trimmomatic":
        chosen = rules.trimmomatic_PE if paired else rules.trimmomatic_SE
        return expand(chosen.output, sample=SAMPLES)
    if step == "quality_check" and config[step] == "fastqc":
        chosen = rules.fastqc_PE if paired else rules.fastqc_SE
        return expand(chosen.output, sample=SAMPLES)
    if step == "quantification":
        if config[step] == "kallisto":
            chosen = rules.kallisto_quant_PE if paired else rules.kallisto_quant_SE
            return expand(chosen.output, sample=SAMPLES)
        if config[step] == "salmon":
            chosen = rules.salmon_quant_PE if paired else rules.salmon_quant_SE
            return expand(chosen.output, sample=SAMPLES)
    if step == "DE":
        if config[step] == "deseq2":
            return rules.deseq2.output
        if config[step] == "edger":
            return rules.edger.output
    if step == "all":
        return list(rules.multiqc.output)
    return list()
# get outputs for each choosen tools
def workflow_outputs(step):
    """Outputs required for *step* to be considered complete (used by rule all)."""
    return list(step_outputs(step))
#########
# Rules #
#########
# Read trimming with Trimmomatic. {input} and {output} expand positionally,
# matching Trimmomatic's argument order (PE: in1 in2 out1P out1U out2P out2U).
# NOTE(review): the PE and SE rules write to the same log path — confirm intended.
rule trimmomatic_PE:
    input:
        **trimmomatic_inputs()
    output:
        readFP = config["results_dir"]+"/"+config["trimmomatic_PE_output_dir"]+"/{sample}_forward_paired.fq.gz",
        readFU = config["results_dir"]+"/"+config["trimmomatic_PE_output_dir"]+"/{sample}_forward_unpaired.fq.gz",
        readRP = config["results_dir"]+"/"+config["trimmomatic_PE_output_dir"]+"/{sample}_reverse_paired.fq.gz",
        readRU = config["results_dir"]+"/"+config["trimmomatic_PE_output_dir"]+"/{sample}_reverse_unpaired.fq.gz",
    log:
        config["results_dir"]+'/logs/trimmomatic/{sample}_trimmomatic_log.txt'
    params:
        # Quality-score flag, e.g. -phred33/-phred64 (see params.yml).
        trimmomatic_qc_score = config["trimmomatic_qc_score"]
    threads:
        config["trimmomatic_threads"]
    shell:
        "trimmomatic PE "
        "{params.trimmomatic_qc_score} "
        "-threads {threads} "
        "{input} "
        "{output} "
        "|& tee {log}"
rule trimmomatic_SE:
    input:
        **trimmomatic_inputs()
    output:
        read = config["results_dir"]+"/"+config["trimmomatic_SE_output_dir"]+"/{sample}_trimmed.fq.gz",
    log:
        config["results_dir"]+'/logs/trimmomatic/{sample}_trimmomatic_log.txt'
    params:
        trimmomatic_qc_score = config["trimmomatic_qc_score"]
    threads:
        config["trimmomatic_threads"]
    shell:
        "trimmomatic SE "
        "{params.trimmomatic_qc_score} "
        "-threads {threads} "
        "{input} "
        "{output} "
        "|& tee {log}"
# Prefer the PE rule when both could produce the same targets.
ruleorder: trimmomatic_PE > trimmomatic_SE
# Raw-read quality control with FastQC. Output names follow FastQC's
# "<basename>_fastqc.html/zip" convention, hence the suffix stem split.
rule fastqc_SE:
    input:
        **fastqc_inputs()
    output:
        html = config["results_dir"]+'/'+config["fastqc_SE_output_dir"]+'/{sample}'+config["sample_suffix"].split(".")[0]+'_fastqc.html',
        zip = config["results_dir"]+'/'+config["fastqc_SE_output_dir"]+'/{sample}'+config["sample_suffix"].split(".")[0]+'_fastqc.zip',
    log: config["results_dir"]+'/logs/fastqc/{sample}_fastqc_log.txt'
    threads: 1
    params:
        output_dir = config["results_dir"]+'/'+config["fastqc_SE_output_dir"]
    shell:
        "fastqc {input.read} -t {threads} -o {params.output_dir} |& tee {log}"
rule fastqc_PE:
    input:
        **fastqc_inputs()
    output:
        html1 = config["results_dir"]+'/'+config["fastqc_PE_output_dir"]+'/{sample}_R1'+config["sample_suffix"].split(".")[0]+'_fastqc.html',
        zip1 = config["results_dir"]+'/'+config["fastqc_PE_output_dir"]+'/{sample}_R1'+config["sample_suffix"].split(".")[0]+'_fastqc.zip',
        html2 = config["results_dir"]+'/'+config["fastqc_PE_output_dir"]+'/{sample}_R2'+config["sample_suffix"].split(".")[0]+'_fastqc.html',
        zip2 = config["results_dir"]+'/'+config["fastqc_PE_output_dir"]+'/{sample}_R2'+config["sample_suffix"].split(".")[0]+'_fastqc.zip',
    log: config["results_dir"]+'/logs/fastqc/{sample}_fastqc_log.txt'
    threads: 2
    params:
        output_dir = config["results_dir"]+'/'+config["fastqc_PE_output_dir"]
    shell:
        "fastqc {input.read} {input.read2} -t {threads} -o {params.output_dir} |& tee {log}"
# Prefer the PE rule when both could produce the same targets.
ruleorder: fastqc_PE > fastqc_SE
# Build the kallisto index once from the reference transcriptome FASTA.
rule kallisto_index:
    input:
        fasta = config["kallisto_index_input"]
    output:
        index = config["results_dir"]+"/"+config["kallisto_index_output_dir"]+"/index.kidx"
    log:
        config["results_dir"]+'/logs/kallisto/index/kallisto_index_log.txt'
    shell:
        "kallisto index -i {output} {input} |& tee {log}"
# Kallisto pseudoalignment/quantification, one output directory per sample.
rule kallisto_quant_SE:
    input:
        **kallisto_inputs(),
        index = config["results_dir"]+"/"+config["kallisto_index_output_dir"]+"/index.kidx"
    output:
        h5 = config["results_dir"]+"/"+config["kallisto_quant_SE_output_dir"]+"/{sample}/abundance.h5",
        counts = config["results_dir"]+"/"+config["kallisto_quant_SE_output_dir"]+"/{sample}/abundance.tsv",
        run_info = config["results_dir"]+"/"+config["kallisto_quant_SE_output_dir"]+"/{sample}/run_info.json"
    log:
        config["results_dir"]+'/logs/kallisto/quant/{sample}_kallisto_quant_log.txt'
    threads:
        config["kallisto_quant_threads"]
    params:
        # Single-end mode requires an explicit fragment-length distribution.
        fragment_length = config["kallisto_quant_fragment_length_SE"],
        standard_deviation = config["kallisto_quant_standard_deviation_SE"],
        output_dir = config["results_dir"]+"/"+config["kallisto_quant_SE_output_dir"]+"/{sample}"
    shell:
        "kallisto quant "
        "-t {threads} "
        "-i {input.index} "
        "-o {params.output_dir} "
        "--single "
        "-l {params.fragment_length} "
        "-s {params.standard_deviation} "
        "{input.read} |& tee {log}"
rule kallisto_quant_PE:
    input:
        **kallisto_inputs(),
        index = config["results_dir"]+"/"+config["kallisto_index_output_dir"]+"/index.kidx"
    output:
        h5 = config["results_dir"]+"/"+config["kallisto_quant_PE_output_dir"]+"/{sample}/abundance.h5",
        counts = config["results_dir"]+"/"+config["kallisto_quant_PE_output_dir"]+"/{sample}/abundance.tsv",
        run_info = config["results_dir"]+"/"+config["kallisto_quant_PE_output_dir"]+"/{sample}/run_info.json"
    log:
        config["results_dir"]+'/logs/kallisto/quant/{sample}_kallisto_quant_log.txt'
    threads:
        config["kallisto_quant_threads"]
    params:
        # NOTE(review): declared but never referenced in the shell command — confirm.
        stranded = config["kallisto_quant_stranded_PE"],
        output_dir = config["results_dir"]+"/"+config["kallisto_quant_PE_output_dir"]+"/{sample}"
    shell:
        "kallisto quant "
        "-t {threads} "
        "-i {input.index} "
        "-o {params.output_dir} "
        "{input.read} {input.read2} |& tee {log}"
# Prefer the PE rule when both could produce the same targets.
ruleorder: kallisto_quant_PE > kallisto_quant_SE
# Build the salmon index once from the reference transcriptome.
rule salmon_index:
    input: config["salmon_index_input"]
    output: directory(config["results_dir"]+"/"+config["salmon_index_output_dir"]+"/indexDir")
    log: config["results_dir"]+'/logs/salmon/index/salmon_index_log.txt'
    shell:
        "salmon index -t {input} -i {output} |& tee {log}"
# Salmon quantification (paired-end), one output directory per sample.
# Fix: declared outputs previously used salmon_quant_SE_output_dir while the
# -o flag used the PE directory, so Snakemake looked for outputs where salmon
# never wrote them (masked only when both dirs are configured identically).
rule salmon_quant_PE:
    input:
        **salmon_inputs(),
        index = config["results_dir"]+"/"+config["salmon_index_output_dir"]+"/indexDir"
    output:
        counts = config["results_dir"]+"/"+config["salmon_quant_PE_output_dir"]+"/{sample}/quant.sf",
        lib_counts = config["results_dir"]+"/"+config["salmon_quant_PE_output_dir"]+"/{sample}/lib_format_counts.json",
        run_info = config["results_dir"]+"/"+config["salmon_quant_PE_output_dir"]+"/{sample}/cmd_info.json"
    log: config["results_dir"]+'/logs/salmon/quant/{sample}_salmon_quant_log.txt'
    params:
        output_dir = config["results_dir"]+"/"+config["salmon_quant_PE_output_dir"]+'/{sample}'
    threads: config["salmon_quant_threads"]
    shell:
        "salmon quant "
        "-i {input.index} "
        "-l A "  # auto-detect library type
        "-1 {input.read} "
        "-2 {input.read2} "
        "-o {params.output_dir} "
        "-p {threads} |& tee {log}"
# Salmon quantification (single-end), one output directory per sample.
rule salmon_quant_SE:
    input:
        **salmon_inputs(),
        index = config["results_dir"]+"/"+config["salmon_index_output_dir"]+"/indexDir"
    output:
        counts = config["results_dir"]+"/"+config["salmon_quant_SE_output_dir"]+"/{sample}/quant.sf",
        lib_counts = config["results_dir"]+"/"+config["salmon_quant_SE_output_dir"]+"/{sample}/lib_format_counts.json",
        run_info = config["results_dir"]+"/"+config["salmon_quant_SE_output_dir"]+"/{sample}/cmd_info.json"
    log:
        config["results_dir"]+'/logs/salmon/quant/{sample}_salmon_quant_log.txt'
    params:
        output_dir = config["results_dir"]+"/"+config["salmon_quant_SE_output_dir"]+'/{sample}'
    threads:
        config["salmon_quant_threads"]
    shell:
        "salmon quant "
        "-i {input.index} "
        "-l A "
        "-r {input.read} "
        "-o {params.output_dir} "
        "-p {threads} |& tee {log}"
# Prefer the PE rule when both could produce the same targets.
ruleorder: salmon_quant_PE > salmon_quant_SE
# Differential expression with edgeR; the R script receives all rule
# attributes (input/output/params) through Snakemake's `script:` mechanism.
rule edger:
    input:
        **edger_inputs()
    output:
        de_table = config["results_dir"]+'/'+config["edger_output_dir"]+'/de_table.csv',
        r_data = config["results_dir"]+'/'+config["edger_output_dir"]+'/data.RData',
        # *_mqc files are picked up by MultiQC as custom-content sections.
        PCA = config["results_dir"]+'/'+config["edger_output_dir"]+'/PCA_mqc.png',
        Top_genes = config["results_dir"]+'/'+config["edger_output_dir"]+'/Top_genes_mqc.tsv',
        Heatmap = config["results_dir"]+'/'+config["edger_output_dir"]+'/Heatmap_mqc.png',
    log: config["results_dir"]+'/logs/edger/edger_log.txt'
    script:
        "scripts/edger.script.R"
# Differential expression with DESeq2 (same count inputs as the edgeR rule).
rule deseq2:
    input:
        **deseq2_inputs()
    output:
        de_table = config["results_dir"]+'/'+config["deseq2_output_dir"]+'/de_table.csv',
        r_data = config["results_dir"]+'/'+config["deseq2_output_dir"]+'/data.RData',
        # *_mqc files are picked up by MultiQC as custom-content sections.
        PCA = config["results_dir"]+'/'+config["deseq2_output_dir"]+'/PCA_mqc.png',
        DE_Table = config["results_dir"]+'/'+config["deseq2_output_dir"]+'/DE_Table_mqc.tsv',
        MA_plot = config["results_dir"]+'/'+config["deseq2_output_dir"]+'/MA_plot_mqc.png',
        Heatmap = config["results_dir"]+'/'+config["deseq2_output_dir"]+'/Heatmap_mqc.png',
    log: config["results_dir"]+'/logs/deseq2/deseq2_log.txt'
    script:
        "scripts/deseq2.script.R"
# Run per-tool report scripts, then generate the MultiQC custom-content
# tables (workflow outputs, workflow parameters) and the MultiQC config.
rule prepare_report:
    input:
        *prepare_report_inputs(),
    output:
        *prepare_report_outputs(),
        config_multiqc = config["results_dir"] + "/config_multiqc.yaml",
        params_tab = config["results_dir"] + "/params_tab_mqc.csv"
    params:
        params_file = config["results_dir"]+"/params.yml",
        results_dir = config["results_dir"]
    log:
        config["results_dir"]+"/logs/prepare_report_log.txt"
    run:
        # Specific scripts for each tool
        for script in prepare_report_scripts():
            shell("Rscript "+script+" {params.params_file} |& tee {log}")
        # Outputs files for Multiqc report
        outfile = config["results_dir"] + "/outputs_mqc.csv"
        head = """
# description: 'This is the list of the files generated by each step of the workflow'
# section_name: 'Workflow outputs'
"""
        with open(outfile,"w") as out:
            out.write(head)
            out.write("step\ttool\tfile\tdescription\n")#\tname
            for step in STEPS:
                tool = config[step["name"]]
                i=1
                for command in OUTPUTS[tool]:
                    # Keep only the rule variant matching the library layout.
                    if ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
                        outputs = OUTPUTS[tool][command]
                        for files in outputs:
                            path = config[command+"_output_dir"] + "/" + files["file"] #config["results_dir"] +"/"+
                            out.write(str(i)+"-"+step["title"]+"\t"+tool+"\t"+path+"\t"+files["description"]+"\n")#"\t"+files["name"]+
                    # NOTE(review): indentation was lost in the diff; assumed the
                    # counter advances once per rule within a step — confirm upstream.
                    i+=1
        # Params list for Multiqc report
        params_list = "params_name\tvalue\n"
        head = """
# description: 'This is the list of the parameters for each rule'
# section_name: 'Workflow parameters'
"""
        for step in STEPS:
            tool = config[step["name"]]
            for key, value in config.items():
                # Keep tool-specific params plus a few global ones.
                if (tool in key and tool != "null") or (key in ["results_dir","sample_dir","sample_suffix","SeOrPe"]):
                    params_list += key + "\t'" + str(value) + "'\n"
        with open(output.params_tab,"w") as out:
            out.write(head)
            out.write(params_list)
        # Config for Multiqc report
        shell("python3 /workflow/generate_multiqc_config.py {params.params_file} {output.config_multiqc}")
# Aggregate all tool reports and custom-content tables into one MultiQC report.
rule multiqc:
    input:
        multiqc_inputs(),
        config_multiqc = config["results_dir"] + "/config_multiqc.yaml"
    output:
        multiqc_dir = directory(config["results_dir"]+"/multiqc_data")
    params:
        output_dir = config["results_dir"]
    log:
        config["results_dir"]+'/logs/multiqc/multiqc_log.txt'
    shell:
        "multiqc --config {input.config_multiqc} -f {params.output_dir} "
        "-o {params.output_dir} |& tee {log}"
# Final Snakemake rule waiting for outputs of the final step chosen by the user
# (default: all steps). Also copies the workflow sources (Snakefile, helper
# script, scripts dir, params) next to the results for reproducibility.
rule all:
    input:
        workflow_outputs(config["final_step"])
    output:
        Snakefile = config["results_dir"]+"/workflow/Snakefile",
        get_samples = config["results_dir"]+"/workflow/get_samples.py",
        scripts = directory(config["results_dir"]+"/workflow/scripts"),
        params = config["results_dir"]+"/workflow/params.yml"
    params:
        params_file = config["results_dir"]+"/params.yml",
    shell:
        "cp /workflow/Snakefile {output.Snakefile} && "
        "cp /workflow/get_samples.py {output.get_samples} && "
        "cp -r /workflow/scripts {output.scripts} && "
        "cp {params.params_file} {output.params}"
# End-of-run hooks: drop a marker file so callers can poll for completion state.
onsuccess:
    print("Workflow finished, no error")
    shell("touch "+config["results_dir"]+"/logs/workflow_end.ok")
onerror:
    print("An error occurred")
    shell("cat {log} > "+config["results_dir"]+"/logs/workflow_end.error")
    #shell("mail -s "an error occurred" youremail@provider.com < {log}")
import re
import sys
from tools import read_yaml
# Load the workflow parameters YAML passed as the first CLI argument.
config = read_yaml(sys.argv[1])
def report_section_order():
    """Return the YAML snippet ordering the MultiQC report sections.

    Fixed sections first (rule graph, params table, outputs table), then, in
    step order, each selected tool's MultiQC section and/or one section per
    static *_mqc output file.
    """
    res = "skip_generalstats: true\n\n"
    res += "report_section_order:\n"
    res += " Rule_graph:\n"
    res += " order: 990\n"
    res += " params_tab:\n"
    res += " order: 980\n"
    res += " outputs:\n"
    res += " order: 970\n"
    cpt = 960
    for step in config["steps"]:
        tool = config["params"][step["name"]]
        # "custom" tools have no native MultiQC module; their sections come
        # only from the *_mqc files handled below.
        if (config["multiqc"][tool] != "custom"):
            res += " " + config["multiqc"][tool] + ":\n"
            res += " " + "order: " + str(cpt) + "\n"
            cpt += -10
        for rule in config["outputs"][tool]:
            # Keep only the rule variant matching the library layout (SE/PE).
            if ((config["params"]["SeOrPe"] == "SE" and not("_PE" in rule)) or (config["params"]["SeOrPe"] == "PE" and not("_SE" in rule))):
                for output in config["outputs"][tool][rule]:
                    if("mqc" in output["file"] and '{' not in output["file"]): # case of dynamic files ({wildcard}_mqc.png) to deal with
                        # Raw string: '\_' was an invalid escape sequence.
                        section = re.sub(r'_mqc.*$', '', output["file"])
                        res += " " + section + ":\n"
                        res += " " + "order: " + str(cpt) + "\n"
                        cpt += -10
    return res
def main():
    """Write the generated MultiQC section-order config to the target path."""
    content = report_section_order()
    with open(sys.argv[2], "w") as out:
        out.write(content)
if __name__ == "__main__":
    # Execute only when run as a script (not on import).
    main()
\ No newline at end of file
import os
import re
import csv
import sys
def sample_list(dir, filesuffix=".fastq.gz"):
    """Return the sorted sample basenames found in *dir*.

    A file named ``<sample>[_R1|_R2]<filesuffix>`` contributes ``<sample>``;
    paired files (_R1/_R2) collapse to a single entry.
    """
    samples = list()
    files = os.listdir(dir)
    # re.escape the suffix: its dots would otherwise match any character,
    # letting e.g. "s3Xfastq.gz" pass as a sample.
    regex = re.compile("^(.+?)(_R1|_R2|)(" + re.escape(filesuffix) + ")")
    for file in files:
        res = re.match(regex, file)
        if res:
            if res.group(1) not in samples:
                samples.append(res.group(1))
    return sorted(samples)
def sample_list2(dir, SeOrPe):
    """Detect sample names and their shared filename suffix in *dir*.

    Returns {'samples': [...], 'suffix': {<suffix>}} when every matching file
    shares one suffix; implicitly returns None otherwise (callers must handle).
    """
    samples = list()
    suffixes = list()
    files = os.listdir(dir)
    if SeOrPe == "PE":
        regex = re.compile("^(.+?)(_R1|_R2)(.+)")
    else:
        regex = re.compile("^(.+?)(\..*)")
    for file in files:
        res = re.match(regex, file)
        if res:
            if res.group(1) not in samples:
                samples.append(res.group(1))
            # NOTE(review): indentation was lost in the diff view; assumed the
            # suffix is recorded for every matching file (so inconsistent
            # naming is detected below) — confirm against upstream source.
            if SeOrPe == "PE":
                suffixes.append(res.group(3))
            else:
                suffixes.append(res.group(2))
    if (len(set(suffixes)) == 1 ):
        return {'samples': samples, 'suffix': set(suffixes)}
    else:
        # Inconsistent suffixes: fall through and return None.
        pass
def group_list(dir, groupfile, filesuffix=".fastq.gz"):
    """Map each detected sample to its group.

    Reads a comma-separated ``sample,group`` file when *groupfile* exists.
    Samples missing from the file — and all samples when there is no file —
    fall back to the default group "group" instead of raising KeyError.
    """
    samples = sample_list(dir, filesuffix)
    if os.path.isfile(groupfile):
        with open(groupfile, mode="r") as infile:
            reader = csv.reader(infile)
            groups = {row[0]: row[1] for row in reader}
        return {sample: groups.get(sample, "group") for sample in samples}
    else:
        return {sample: "group" for sample in samples}
#print(sample_list2(sys.argv[1],sys.argv[2]))
# CLI entry: <sample_dir> <group_file> [<file_suffix>] -> prints {sample: group}.
if len(sys.argv) == 4:
    print(group_list(sys.argv[1], sys.argv[2], sys.argv[3]))
else:
    print(group_list(sys.argv[1], sys.argv[2]))
pipeline: RNAseq
params:
results_dir: /Results
sample_dir: /Data
sample_suffix: .fastq.gz
group_file_select: server
group_file: ''
SeOrPe: PE
trimming: 'null'
trimmomatic_PE_output_dir: trimmomatic PE
trimmomatic_threads: 4
trimmomatic_qc_score: -phred64
trimmomatic_SE_output_dir: trimmomatic
null_output_dir: ''
quality_check: fastqc
fastqc_SE_output_dir: fastqc_SE
fastqc_PE_output_dir: fastqc_PE
fastqc_threads: 4
quantification: kallisto
kallisto_index_output_dir: kallisto/index
kallisto_index_input: ''
kallisto_index_input_select: server
kallisto_quant_SE_output_dir: kallisto/quant
kallisto_quant_index: kallisto/index/index.kidx
kallisto_quant_threads: 4
kallisto_quant_fragment_length_SE: 300
kallisto_quant_standard_deviation_SE: 2
kallisto_quant_PE_output_dir: kallisto/quant
kallisto_quant_stranded_PE: ''
salmon_index_output_dir: salmon/index
salmon_index_input_select: server
salmon_index_input: ''
salmon_index_threads: 4
salmon_quant_SE_output_dir: salmon/quant
index: salmon/index/indexDir
salmon_quant_threads: 4
salmon_quant_PE_output_dir: salmon/quant
DE: edger
edger_output_dir: edger
edger_threads: 4