Commit 04c57930 authored by mmassaviol's avatar mmassaviol
Browse files

Add workflow Genome_Profile

parent 8bbabb2b
......@@ -31,6 +31,7 @@ Liste des workflows existants:
- RNAseq (pseudoalign (kallisto, Salmon) + expression différentielle (Edger, DESeq2))
- Variant_calling (bwa or bowtie and gatk or bcftools)
- Interop_report (Illumina InterOp)
- Genome_Profile (Jellyfish + GenomeScope)
## 1 Construire un conteneur
......
import csv
import os
import re
import sys

import snakemake.utils
#############
# Wildcards #
#############
# Top-level bindings pulled out of the snakemake config before "config"
# itself is narrowed to the tool-parameter sub-dictionary below.
SAMPLES = config["samples"]
STEPS = config["steps"]
PREPARE_REPORT_OUTPUTS = config["prepare_report_outputs"]
PREPARE_REPORT_SCRIPTS = config["prepare_report_scripts"]
OUTPUTS = config["outputs"]
PARAMS_INFO = config["params_info"]
# NOTE: from this point on, "config" refers ONLY to the params sub-dict.
config = config["params"]
##########
# Inputs #
##########
# Generic input functions
## get raw_reads
def raw_reads():
    """Return the raw FASTQ path pattern(s) for one sample.

    Returns a dict with key "read" (and "read2" for paired-end data) whose
    values are path patterns containing the {sample} wildcard, built from
    the "sample_dir" and "sample_suffix" config entries.

    Exits the workflow if config["SeOrPe"] is neither "SE" nor "PE".
    """
    inputs = dict()
    se_or_pe = config["SeOrPe"]
    if se_or_pe == "PE":
        inputs["read"] = config['sample_dir']+'/{sample}_R1'+config["sample_suffix"]
        inputs["read2"] = config['sample_dir']+'/{sample}_R2'+config["sample_suffix"]
    elif se_or_pe == "SE":
        inputs["read"] = config['sample_dir']+'/{sample}'+config["sample_suffix"]
    else:
        # BUG FIX: "sys" was never imported in this file, so this line raised
        # NameError instead of cleanly aborting with the intended message
        # (import added to the top-of-file import block).
        sys.exit("SeOrPe should be SE or PE")
    return inputs
## get reads (trimmed or raw)
def reads():
    """Resolve the reads consumed downstream (currently always the raw reads)."""
    selected = raw_reads()
    return selected
# Tools inputs functions
def fastp_inputs():
    """fastp always trims the raw reads, so its inputs are the raw read patterns."""
    return dict(raw_reads())
def jellyfish_count_inputs():
    """Collect the read files for jellyfish count, expanded over all samples.

    When fastp preprocessing is selected, the trimmed outputs of the matching
    fastp rule are used; otherwise the raw read patterns are expanded directly.
    """
    paired = (config["SeOrPe"] == "PE")
    inputs = dict()
    if config["preprocessing"] == "fastp":
        if paired:
            inputs["read"] = expand(rules.fastp_PE.output.R1, sample=SAMPLES)
            inputs["read2"] = expand(rules.fastp_PE.output.R2, sample=SAMPLES)
        else:
            inputs["read"] = expand(rules.fastp_SE.output.read, sample=SAMPLES)
    else:
        raw = raw_reads()
        inputs["read"] = expand(raw["read"], sample=SAMPLES)
        if paired:
            inputs["read2"] = expand(raw["read2"], sample=SAMPLES)
    return inputs
def genomescope_inputs():
    """GenomeScope consumes the k-mer histogram produced by jellyfish histo."""
    return {"kmer_histo": rules.jellyfish_histo.output.kmer_histo}
def prepare_report_inputs():
    """Flatten the outputs of every configured step into one input list."""
    return [out for step in STEPS for out in step_outputs(step["name"])]
def prepare_report_scripts():
    """Paths of the per-tool report-preparation R scripts that are available.

    For each step, the chosen tool name is turned into a candidate script
    name; only candidates registered in PREPARE_REPORT_SCRIPTS are kept.
    """
    scripts = []
    for step in STEPS:
        candidate = config[step["name"]] + ".prepare.report.R"
        if candidate in PREPARE_REPORT_SCRIPTS:
            scripts.append("/workflow/scripts/" + candidate)
    return scripts
def prepare_report_outputs():
    """List the report files each chosen tool is expected to produce.

    Always includes the workflow-level outputs_mqc.csv; per-tool files are
    looked up in PREPARE_REPORT_OUTPUTS and rooted under results_dir/<tool>/.
    """
    expected = [config["results_dir"] + "/outputs_mqc.csv"]
    for step in STEPS:
        tool = config[step["name"]]
        for output in PREPARE_REPORT_OUTPUTS.get(tool, []):
            expected.append(config["results_dir"] + "/" + tool + "/" + output)
    return expected
def multiqc_inputs():
    """Inputs for MultiQC: the files prepare_report is expected to write.

    Depending on prepare_report's *outputs* (rather than its inputs)
    guarantees that rule has run even when a tool contributes no
    custom-content file of its own.
    """
    return prepare_report_outputs()
###########
# Outputs #
###########
def step_outputs(step):
    """Return the output files associated with one step name.

    "all" maps to the final multiqc outputs; step names whose configured
    tool does not match (or unknown steps) yield an empty list.
    """
    if step == "kmer_counting" and config[step] == "jellyfish":
        return rules.jellyfish_histo.output
    if step == "kmer_analysis" and config[step] == "genomescope":
        return rules.genomescope.output
    if step == "all":
        return list(rules.multiqc.output)
    return []
# get outputs for each choosen tools
# get outputs for each choosen tools
def workflow_outputs(step):
    """Outputs required for *step* to be considered complete."""
    return list(step_outputs(step))
#########
# Rules #
#########
# Trim/filter paired-end reads with fastp; emits trimmed R1/R2 FASTQ files
# plus per-sample HTML/JSON QC reports.
rule fastp_PE:
    input:
        **fastp_inputs()
    output:
        report_html = config["results_dir"]+"/"+config["fastp_PE_output_dir"]+"/fastp_report_{sample}.html",
        report_json = config["results_dir"]+"/"+config["fastp_PE_output_dir"]+"/fastp_report_{sample}.json",
        R1 = config["results_dir"]+"/"+config["fastp_PE_output_dir"]+"/{sample}_R1.fq.gz",
        R2 = config["results_dir"]+"/"+config["fastp_PE_output_dir"]+"/{sample}_R2.fq.gz"
    params:
        complexity_threshold = config["fastp_complexity_threshold"],
        report_title = config["fastp_report_title"],
        adapter_sequence = config["fastp_adapter_sequence"],
        adapter_sequence_R2 = config["fastp_adapter_sequence_R2_PE"],
        P = config["fastp_P"],
        output_dir = config["fastp_PE_output_dir"],
        # Boolean config switches are pre-rendered as CLI flags (or empty strings).
        correction = "--correction " if config["fastp_correction_PE"] == True else "",
        low_complexity_filter = "--low_complexity_filter " if config["fastp_low_complexity_filter"] == True else "",
        overrepresentation_analysis = "--overrepresentation_analysis " if config["fastp_overrepresentation_analysis"] == True else "",
    log:
        config["results_dir"]+"/logs/fastp/{sample}_fastp_log.txt"
    threads:
        config["fastp_threads"]
    shell:
        "fastp "
        "-i {input.read} "
        "-I {input.read2} "
        "-o {output.R1} "
        "-O {output.R2} "
        "-w {threads} "
        "{params.correction} "
        "{params.low_complexity_filter} "
        "--complexity_threshold {params.complexity_threshold} "
        "--html {output.report_html} "
        "--json {output.report_json} "
        "--report_title {params.report_title} "
        "--adapter_sequence '{params.adapter_sequence}' "
        "--adapter_sequence_r2 '{params.adapter_sequence_R2}' "
        "{params.overrepresentation_analysis} "
        "-P {params.P} "
        # |& pipes stdout AND stderr into the log (requires bash as shell).
        "|& tee {log}"
# Trim/filter single-end reads with fastp; emits the trimmed FASTQ plus
# per-sample HTML/JSON QC reports.
rule fastp_SE:
    input:
        **fastp_inputs()
    output:
        report_html = config["results_dir"]+"/"+config["fastp_SE_output_dir"]+"/fastp_report_{sample}.html",
        report_json = config["results_dir"]+"/"+config["fastp_SE_output_dir"]+"/fastp_report_{sample}.json",
        read = config["results_dir"]+"/"+config["fastp_SE_output_dir"]+"/{sample}.fq.gz",
    params:
        complexity_threshold = config["fastp_complexity_threshold"],
        report_title = config["fastp_report_title"],
        adapter_sequence = config["fastp_adapter_sequence"],
        P = config["fastp_P"],
        output_dir = config["fastp_SE_output_dir"],
        # Boolean config switches are pre-rendered as CLI flags (or empty strings).
        low_complexity_filter = "--low_complexity_filter " if config["fastp_low_complexity_filter"] == True else "",
        overrepresentation_analysis = "--overrepresentation_analysis " if config["fastp_overrepresentation_analysis"] == True else "",
    log:
        config["results_dir"]+"/logs/fastp/{sample}_fastp_log.txt"
    threads:
        config["fastp_threads"]
    shell:
        "fastp "
        "-i {input.read} "
        # BUG FIX: this rule declares its output as "read" (see output:
        # above), not "R1"; "{output.R1}" raised an AttributeError at
        # runtime for every single-end run.
        "-o {output.read} "
        "-w {threads} "
        "{params.low_complexity_filter} "
        "--complexity_threshold {params.complexity_threshold} "
        "--html {output.report_html} "
        "--json {output.report_json} "
        "--report_title {params.report_title} "
        "--adapter_sequence '{params.adapter_sequence}' "
        "{params.overrepresentation_analysis} "
        "-P {params.P} "
        "|& tee {log}"
# Count k-mers across ALL samples at once with jellyfish count, producing
# a single counts.jf database.
rule jellyfish_count:
    input:
        **jellyfish_count_inputs(),
    output:
        kmer_counts = config["results_dir"] + "/" + config["jellyfish_count_output_dir"] + "/counts.jf",
    params:
        # -C counts a k-mer and its reverse complement together.
        canonical_kmer = "-C" if config["jellyfish_count_canonical_kmer"] else "",
        kmer_len = config["jellyfish_count_kmer_len"],
        hash_size = config["jellyfish_count_hash_size"]
    threads:
        config["jellyfish_threads"]
    log:
        config["results_dir"]+'/logs/jellyfish/jellyfish_count_log.txt'
    run:
        # jellyfish reads plain FASTQ, so each gzipped file is fed through a
        # bash process substitution <(zcat ...) (requires bash as the shell).
        inputs = jellyfish_count_inputs()
        files = ""
        if (config["SeOrPe"] == "PE"):
            for r1,r2 in zip(inputs["read"],inputs["read2"]):
                files += "<(zcat "+r1+") <(zcat "+r2+") "
        else:
            for r in inputs["read"]:
                files += "<(zcat "+r+") "
        shell(
            "jellyfish count "+
            "{params.canonical_kmer} "+
            "-m {params.kmer_len} "+
            "-s {params.hash_size} "+
            "-t {threads} "+
            "-o {output.kmer_counts} "+
            files+
            "|& tee {log}"
        )
# Turn the k-mer count database into a coverage histogram, the input
# expected by GenomeScope.
rule jellyfish_histo:
    input:
        kmer_counts = rules.jellyfish_count.output.kmer_counts,
    output:
        kmer_histo = config["results_dir"] + "/" + config["jellyfish_histo_output_dir"] + "/kmer_histo_jf.hist",
    threads:
        config["jellyfish_threads"]
    shell:
        "jellyfish histo "
        "-t {threads} "
        "{input.kmer_counts} > {output.kmer_histo} "
# Estimate genome profile (size, heterozygosity, ...) from the k-mer
# histogram with GenomeScope, then rename/reshape its outputs so MultiQC
# picks them up as custom content (*_mqc.* files).
rule genomescope:
    input:
        **genomescope_inputs(),
    output:
        GenomeScope_Profile = config["results_dir"] + "/" + config["genomescope_output_dir"] + "/GenomeScope_Profile_mqc.png",
        GenomeScope_Profile_log_scale = config["results_dir"] + "/" + config["genomescope_output_dir"] + "/GenomeScope_Profile_log_scale_mqc.png",
        Summary = config["results_dir"] + "/" + config["genomescope_output_dir"] + "/GenomeScope_Summary_mqc.csv"
    params:
        output_dir = config["results_dir"] + "/" + config["genomescope_output_dir"],
        kmer_len = config["genomescope_kmer_len"],
        reads_len = config["genomescope_reads_len"]
    log:
        config["results_dir"]+'/logs/genomescope/genomescope_log.txt'
    shell:
        # genomescope.R takes positional arguments: histo, k, read length, out dir.
        "Rscript /opt/biotools/genomescope.R "
        "{input.kmer_histo} "
        "{params.kmer_len} " # kmer length
        "{params.reads_len} " # reads length
        "{params.output_dir} "
        "|& tee {log};"
        # prepare mqc custom content
        "mv {params.output_dir}/plot.png {params.output_dir}/GenomeScope_Profile_mqc.png && "
        "mv {params.output_dir}/plot.log.png {params.output_dir}/GenomeScope_Profile_log_scale_mqc.png && "
        # summary.txt: drop the 3-line header, collapse runs of >=2 spaces to
        # tabs, strip trailing tabs (braces doubled for snakemake formatting).
        "tail -n +4 {params.output_dir}/summary.txt | sed 's/ \{{2,\}}/\t/g' | sed 's/\t$//g' > {params.output_dir}/GenomeScope_Summary_mqc.csv"
# Gather every per-tool report fragment and build the MultiQC custom-content
# tables (outputs list and parameters table) plus the MultiQC config file.
rule prepare_report:
    input:
        *prepare_report_inputs(),
    output:
        *prepare_report_outputs(),
        config_multiqc = config["results_dir"] + "/config_multiqc.yaml",
        params_tab = config["results_dir"] + "/params_tab_mqc.csv"
    params:
        params_file = config["results_dir"]+"/params.yml",
        results_dir = config["results_dir"]
    log:
        config["results_dir"]+"/logs/prepare_report_log.txt"
    run:
        # Specific scripts for each tool
        for script in prepare_report_scripts():
            shell("Rscript "+script+" {params.params_file} |& tee {log}")
        # Outputs files for Multiqc report
        outfile = config["results_dir"] + "/outputs_mqc.csv"
        head = """
# description: 'This is the list of the files generated by each step of the workflow'
# section_name: 'Workflow outputs'
"""
        with open(outfile,"w") as out:
            out.write(head)
            out.write("step\ttool\tfile\tdescription\n")#\tname
            for step in STEPS:
                tool = config[step["name"]]
                i=1
                for command in OUTPUTS[tool]:
                    # Keep only the rule variant matching the library layout
                    # (skip *_PE entries in SE mode and vice versa).
                    if ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
                        outputs = OUTPUTS[tool][command]
                        for files in outputs:
                            path = config[command+"_output_dir"] + "/" + files["file"] #config["results_dir"] +"/"+
                            out.write(str(i)+"-"+step["title"]+"\t"+tool+"\t"+path+"\t"+files["description"]+"\n")#"\t"+files["name"]+
                            i+=1
        # Params list for Multiqc report
        params_list = "params_name\tdescription\tvalue\n"
        head = """# description: 'This is the list of the parameters for each rule'
# section_name: 'Workflow parameters'
"""
        for step in STEPS:
            tool = config[step["name"]]
            for key, value in config.items():
                # NOTE(review): "command" below is the stale loop variable left
                # over from the outputs loop above, so this SE/PE filter tests
                # the LAST command seen, not the current key — it probably was
                # meant to test "key"; confirm intent before changing.
                if (tool in key and tool != "null") or (key in ["results_dir","sample_dir","sample_suffix","SeOrPe"]) and ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
                    if (key in PARAMS_INFO.keys() and "label" in PARAMS_INFO[key].keys()):
                        description = PARAMS_INFO[key]["label"]
                    else:
                        description = ''
                    params_list += key + "\t'" + description + "'\t'" + str(value) + "'\n"
        with open(output.params_tab,"w") as out:
            out.write(head)
            out.write(params_list)
        # Config for Multiqc report
        shell("python3 /workflow/generate_multiqc_config.py {params.params_file} {output.config_multiqc}")
# Aggregate all QC outputs and custom-content files under results_dir into
# the final MultiQC report (-f overwrites a previous report).
rule multiqc:
    input:
        multiqc_inputs(),
        config_multiqc = config["results_dir"] + "/config_multiqc.yaml"
    output:
        multiqc_dir = directory(config["results_dir"]+"/multiqc_data")
    params:
        output_dir = config["results_dir"]
    log:
        config["results_dir"]+'/logs/multiqc/multiqc_log.txt'
    shell:
        "multiqc --config {input.config_multiqc} -f {params.output_dir} "
        "-o {params.output_dir} |& tee {log}"
# Final Snakemake rule waiting for outputs of the final step choosen by user (default all steps)
# Final rule: waits for the outputs of the last step and archives a copy of
# the workflow itself (Snakefile, helper script, scripts dir, params) next
# to the results for reproducibility.
rule all:
    input:
        workflow_outputs("all")
    output:
        Snakefile = config["results_dir"]+"/workflow/Snakefile",
        get_samples = config["results_dir"]+"/workflow/get_samples.py",
        scripts = directory(config["results_dir"]+"/workflow/scripts"),
        params = config["results_dir"]+"/workflow/params.yml"
    params:
        params_file = config["results_dir"]+"/params.yml",
    shell:
        "cp /workflow/Snakefile {output.Snakefile} && "
        "cp /workflow/get_samples.py {output.get_samples} && "
        "cp -r /workflow/scripts {output.scripts} && "
        "cp {params.params_file} {output.params}"
# Workflow-level end hooks: drop a marker file so the calling environment
# can detect success/failure without parsing snakemake's exit status.
onsuccess:
    print("Workflow finished, no error")
    shell("touch "+config["results_dir"]+"/logs/workflow_end.ok")
onerror:
    print("An error occurred")
    shell("cat {log} > "+config["results_dir"]+"/logs/workflow_end.error")
    # NOTE(review): the disabled mail command below has unescaped inner
    # double quotes; fix the quoting before re-enabling it.
    #shell("mail -s "an error occurred" youremail@provider.com < {log}")
# generate_multiqc_config.py: build the MultiQC configuration (report
# section ordering) from the workflow params YAML.
import re
import sys
from tools import *
# First CLI argument: path to the params YAML written by the workflow.
config = read_yaml(sys.argv[1])
def report_section_order():
    """Render the MultiQC 'report_section_order' YAML fragment.

    Fixed sections (rule graph, params table, outputs table) come first;
    each chosen tool's MultiQC section and each of its static *_mqc
    custom-content files is then appended with a decreasing order value.
    """
    parts = [
        "skip_generalstats: true\n\n",
        "report_section_order:\n",
        " Rule_graph:\n",
        " order: 990\n",
        " params_tab:\n",
        " order: 980\n",
        " outputs:\n",
        " order: 970\n",
    ]
    order = 960
    for step in config["steps"]:
        tool = config["params"][step["name"]]
        # Tools backed by a native MultiQC module get their own section entry.
        if (config["multiqc"][tool] != "custom"):
            parts.append(" " + config["multiqc"][tool] + ":\n")
            parts.append(" " + "order: " + str(order) + "\n")
            order -= 10
        for rule in config["outputs"][tool]:
            # Keep only the SE or PE rule variant matching the run layout.
            if ((config["params"]["SeOrPe"] == "SE" and not("_PE" in rule)) or (config["params"]["SeOrPe"] == "PE" and not("_SE" in rule))):
                for output in config["outputs"][tool][rule]:
                    # Static *_mqc files become sections named after the file
                    # stem; wildcard files ({...}_mqc.png) cannot be ordered.
                    if("mqc" in output["file"] and '{' not in output["file"]):
                        section = re.sub('\_mqc.*$', '', output["file"])
                        parts.append(" " + section + ":\n")
                        parts.append(" " + "order: " + str(order) + "\n")
                        order -= 10
    return "".join(parts)
def main():
    """Write the assembled MultiQC config to the file given as argv[2]."""
    content = report_section_order()
    with open(sys.argv[2], "w") as out:
        out.write(content)


if __name__ == "__main__":
    # execute only if run as a script
    main()
\ No newline at end of file
#!/usr/bin/env python3
# This script will take a directory and a parameter to tell if the reads are paired end or single end and return the sample list and the suffix
# Needs 2 arguments: reads_directory, SeOrPe
# SeOrPe is SE for single end reads and PE for paired end reads
# Usage: ./get_samples.py reads_directory SeOrPe
import os
import re
import csv
import sys
def sample_list(dir, SeOrPe):
    """Scan *dir* and derive the sample basenames plus their common suffix.

    For paired-end data ("PE"), file names must contain _R1/_R2 and the
    suffix is everything after that tag; for single-end data the sample
    name is everything before the first dot. Exits with an error message
    if the discovered suffixes are not all identical.
    """
    if SeOrPe == "PE":
        pattern = re.compile(r"^(.+?)(_R1|_R2)(.+)")
    else:
        pattern = re.compile(r"^(.+?)(\..*)")
    samples = []
    suffixes = []
    for entry in os.listdir(dir):
        match = pattern.match(entry)
        if not match:
            continue
        if match.group(1) not in samples:
            samples.append(match.group(1))
        # Group 3 is the post-_R1/_R2 suffix (PE); group 2 the dot-suffix (SE).
        suffixes.append(match.group(3) if SeOrPe == "PE" else match.group(2))
    if (len(set(suffixes)) == 1 ):
        return {'samples': sorted(samples), 'suffix': suffixes[0]}
    else:
        exit("Files have different suffixes:" + ','.join(suffixes))
def main():
    """CLI entry point: print the sample dict for <reads_directory> <SeOrPe>."""
    if len(sys.argv) != 3:
        exit("""Needs 2 arguments: reads_directory, SeOrPe
Usage: ./get_samples.py reads_directory SeOrPe""")
    print(sample_list(sys.argv[1], sys.argv[2]))


if __name__ == "__main__":
    # execute only if run as a script
    main()
pipeline: Genome_Profile
params:
results_dir: /Results
sample_dir: /Data
SeOrPe: PE
preprocessing: 'null'
fastp_PE_output_dir: fastp_PE
fastp_threads: 4
fastp_complexity_threshold: 30
fastp_report_title: fastp report
fastp_adapter_sequence: ''
fastp_adapter_sequence_R2_PE: ''
fastp_P: 20
fastp_correction_PE: true
fastp_low_complexity_filter: true
fastp_overrepresentation_analysis: true
fastp_SE_output_dir: fastp_SE
null_output_dir: ''
kmer_counting: jellyfish
jellyfish_count_output_dir: jellyfish
reads: ''
jellyfish_threads: 4
jellyfish_count_canonical_kmer: true
jellyfish_count_kmer_len: 21
jellyfish_count_hash_size: 100000000
jellyfish_histo_output_dir: jellyfish
kmer_counts: counts.jf
kmer_analysis: genomescope
genomescope_output_dir: genomescope
kmer_histo: kmer_histo_jf.hist
genomescope_kmer_len: 21
genomescope_reads_len: 300
samples: []
groups: []
steps:
- title: Preprocessing
name: preprocessing
tools:
- fastp
- 'null'
default: 'null'
- title: K-mer counting
name: kmer_counting
tools:
- jellyfish
default: jellyfish
- title: K-mer analysis
name: kmer_analysis
tools:
- genomescope
default: genomescope
params_info:
results_dir:
type: output_dir
sample_dir:
type: input_dir
SeOrPe:
type: radio
fastp_threads:
tool: fastp
rule: fastp_SE
type: numeric
label: Number of threads to use
fastp_complexity_threshold:
tool: fastp
rule: fastp_SE
type: numeric
label: The threshold for low complexity filter (0~100)
fastp_report_title:
tool: fastp
rule: fastp_SE
type: text
label: fastp report title
fastp_adapter_sequence:
tool: fastp
rule: fastp_SE
type: text
label: The adapter for read1. For SE data, if not specified, the adapter will
be auto-detected. For PE data, this is used if R1/R2 are found not overlapped.
fastp_adapter_sequence_R2_PE:
tool: fastp
rule: fastp_PE
type: text
label: the adapter for read2 (PE data only). This is used if R1/R2 are found not
overlapped. If not specified, it will be the same as <adapter_sequence>
fastp_P:
tool: fastp
rule: fastp_SE
type: numeric
label: One in (--overrepresentation_sampling) reads will be computed for overrepresentation
analysis (1~10000), smaller is slower.
fastp_correction_PE:
tool: fastp
rule: fastp_PE
type: checkbox
label: Enable base correction in overlapped regions
fastp_low_complexity_filter:
tool: fastp
rule: fastp_SE
type: checkbox
label: Enable low complexity filter. The complexity is defined as the percentage
of base that is different from its next base (base[i] != base[i+1]).
fastp_overrepresentation_analysis:
tool: fastp
rule: fastp_SE
type: checkbox
label: enable overrepresented sequence analysis.
jellyfish_threads:
tool: jellyfish
rule: jellyfish_histo
type: numeric
label: Number of threads to use
jellyfish_count_canonical_kmer:
tool: jellyfish
rule: jellyfish_count
type: checkbox
label: '-C : Save in the hash only canonical k-mers, while the count is the number
of occurrences of both a k-mer and it reverse complement'
jellyfish_count_kmer_len:
tool: jellyfish
rule: jellyfish_count
type: numeric
label: K-mers length
jellyfish_count_hash_size:
tool: jellyfish
rule: jellyfish_count
type: numeric
label: Kmer size estimation
genomescope_kmer_len:
tool: genomescope
rule: genomescope
type: numeric
label: K-mers length
genomescope_reads_len:
tool: genomescope
rule: genomescope
type: numeric
label: Read length
prepare_report_scripts: []
prepare_report_outputs: {}
outputs:
fastp:
fastp_PE:
- name: report_html
file: fastp_report_{sample}.html
description: "Rapport HTML du pr\xE9processing effectu\xE9"
- name: report_json
file: fastp_report_{sample}.json
description: "Rapport JSON du pr\xE9processing effectu\xE9"
- name: read
file: '{sample}_R1.fq.gz'
description: "Reads R1 pr\xE9process\xE9s"
- name: read2
file: '{sample}_R2.fq.gz'
description: "Reads R2 pr\xE9process\xE9s"
fastp_SE: