Commit 5fcbbec2 authored by khalid

Delete workflows directories

parent 5192ea01
FROM mmassaviol/mbb_workflows_base:latest
COPY files /workflow
COPY sagApp /sagApp
RUN wget http://opengene.org/fastp/fastp \
&& chmod a+x ./fastp \
&& mv fastp /opt/biotools/bin/fastp
RUN cd /opt/biotools/bin \
&& wget -O jellyfish https://github.com/gmarcais/Jellyfish/releases/download/v2.3.0/jellyfish-linux \
&& chmod +x /opt/biotools/bin/jellyfish
RUN cd /opt/biotools \
&& wget -O genomescope.tar.gz https://github.com/schatzlab/genomescope/archive/v1.0.0.tar.gz \
&& tar -xvzf genomescope.tar.gz \
&& mv genomescope-1.0.0/genomescope.R . \
&& rm -r genomescope-1.0.0 genomescope.tar.gz
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
RUN mkdir -p /share/apps/bin \
&& mkdir /share/apps/lib \
&& mkdir /share/apps/gridengine \
&& mkdir /share/bio \
&& mkdir -p /opt/gridengine \
&& mkdir -p /export/scratch \
&& mkdir -p /usr/lib64 \
&& ln -s /bin/bash /bin/mbb_bash \
&& ln -s /bin/bash /bin/isem_bash \
&& /usr/sbin/groupadd --system --gid 400 sge \
&& /usr/sbin/useradd --system --uid 400 --gid 400 -c GridEngine --shell /bin/true --home /opt/gridengine sge
EXPOSE 3838
CMD ["Rscript", "-e", "setwd('/sagApp/'); shiny::runApp('/sagApp/app.R',port=3838 , host='0.0.0.0')"]
#!/bin/bash
# This script is executed on the virtual machine during the *Deployment* phase.
# It applies parameters specific to the current deployment.
# During a cloud deployment on IFB-Biosphere it runs second, after the *Installation* phase.
if [ $# -lt 1 ]
then
APP_IMG="mbbteam/genome_profile:latest"
else
IMG_SRC=$1
case $IMG_SRC in
ifb)
APP_IMG="gitlab-registry.in2p3.fr/ifb-biosphere/apps/genome_profile:master" ;;
docker )
APP_IMG="mbbteam/genome_profile:latest" ;;
local)
docker build . -t genome_profile:latest
APP_IMG="genome_profile:latest" ;;
mbb)
#APP_IMG="X.X.X.X:5000/genome_profile:latest"
;;
esac
fi
# Tuning depending on whether the cloud site has a proxy or not
#CLOUD_SERVICE = $(ss-get cloudservice)
#CLOUD_SERVICE="ifb-genouest-genostack"
#HOST_NAME=$( ss-get --timeout=3 hostname )
HOST_NAME="192.168.100.49"
#if [ "$CLOUD_SERVICE" == "ifb-genouest-genostack" ]; then
# Cloud site WITH a site proxy
# APP_PORT=80
# PROXIED_IP=$( echo $HOST_NAME | sed "s|\.|-|g")
# HOST_NAME="openstack-${PROXIED_IP}.genouest.org"
# HTTP_ENDP="https://$HOST_NAME"
# systemctl stop nginx
#else
# Cloud site WITHOUT a site proxy
APP_PORT=8787
HTTP_ENDP="https://$HOST_NAME"
openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/ssl/private/nginx-selfsigned.key -out /etc/ssl/certs/nginx-selfsigned.crt -subj "/C=FR/ST=AURA/L=Lyon/O=IFB/OU=IFB-biosphere/CN=myrstudio.biosphere.france-bioinformatique.fr"
openssl dhparam -out /etc/ssl/certs/dhparam.pem 2048
mkdir -p /etc/nginx/snippets
echo "ssl_certificate /etc/ssl/certs/nginx-selfsigned.crt;" > /etc/nginx/snippets/self-signed.conf
echo "ssl_certificate_key /etc/ssl/private/nginx-selfsigned.key;" >> /etc/nginx/snippets/self-signed.conf
cp system/nginx_snippets_ssl-params.conf /etc/nginx/snippets/ssl-params.conf
cp /etc/nginx/sites-available/default /etc/nginx/sites-available/default.bak
cp system/nginx_sites-available_default /etc/nginx/sites-available/default
sed -i "s|server_domain_or_IP|$HOST_NAME|" /etc/nginx/sites-available/default
useradd nginx
cp system/nginx_nginx.conf /etc/nginx/nginx.conf
cp system/nginx_conf.d_10-rstudio.conf /etc/nginx/conf.d/10-rstudio.conf
sed -i "s|example.com|$HOST_NAME|" /etc/nginx/conf.d/10-rstudio.conf
systemctl restart nginx
systemctl enable nginx
#fi
# Docker volumes
# mydatalocal: from the system disk or ephemeral one
IFB_DATADIR="/ifb/data/"
source /etc/profile.d/ifb.sh
VOL_NAME="mydatalocal"
VOL_DEV=$(readlink -f -n $IFB_DATADIR/$VOL_NAME )
DOCK_VOL=" --mount type=bind,src=$VOL_DEV,dst=$IFB_DATADIR/$VOL_NAME"
# MBB Workflows reads data from /Data and write results to /Results
mkdir -p ${VOL_DEV}/Data
mkdir -p ${VOL_DEV}/Results
DOCK_VOL+=" --mount type=bind,src=$VOL_DEV/Data,dst=/Data"
DOCK_VOL+=" --mount type=bind,src=$VOL_DEV/Results,dst=/Results"
# NFS mounts: from ifb_share configuration in autofs
IFS_ORI=$IFS
while IFS=" :" read VOL_NAME VOL_TYPE VOL_IP VOL_DEV ; do
DOCK_VOL+=" --mount type=volume,volume-driver=local,volume-opt=type=nfs,src=$VOL_NAME,dst=$IFB_DATADIR/$VOL_NAME,volume-opt=device=:$VOL_DEV,volume-opt=o=addr=$VOL_IP"
done < /etc/auto.ifb_share
IFS=$IFS_ORI
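# Illustrative /etc/auto.ifb_share entry the loop above can parse (fields are
# split on spaces and ":"); the share name and IP are hypothetical:
#   myshare -fstype=nfs 192.168.1.10:/export/myshare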
CONTAINER_ID=$( docker run -d -p $APP_PORT:3838 $DOCK_VOL $APP_IMG )
VM_IP=$(curl bot.whatismyipaddress.com)
if [ -n "$CONTAINER_ID" ]
then
echo " "
echo You have to put your Data on : ${VOL_DEV}/Data
echo " "
echo Results will be written to : ${VOL_DEV}/Results
echo " "
echo You can access the workflow interface at : https://${VM_IP}
echo " "
echo To start a Bash session inside the container : docker exec -it $CONTAINER_ID /bin/bash
echo " "
echo To run the workflow without the interface : docker exec -it $CONTAINER_ID snakemake -s /workflow/Snakefile all --configfile config --cores XX
echo " "
echo config is a configuration file that must be in a subfolder of ${VOL_DEV}/Data or ${VOL_DEV}/Results
echo " "
echo e.g. if the file is in ${VOL_DEV}/Data/run1/maconfig1.yml : docker exec -it $CONTAINER_ID snakemake -s /workflow/Snakefile all --configfile /Data/run1/maconfig1.yml --cores XX
echo " "
echo You can use the graphical interface to generate a configuration file.
echo " "
echo XX is the number of cores the workflow will use.
else
echo Failed to run the docker container !!
fi
#!/bin/bash
# This script helps deploy a docker image on an MBB bigmem machine
if [ $# -lt 1 ]
then
APP_IMG="mbbteam/genome_profile:latest"
else
IMG_SRC=$1
case $IMG_SRC in
docker )
APP_IMG="mbbteam/genome_profile:latest" ;;
local)
docker build . -t genome_profile:latest
APP_IMG="genome_profile:latest" ;;
mbb)
#APP_IMG="X.X.X.X:5000/genome_profile:latest"
;;
esac
fi
# try a port range between 8787 and 8800
#APP_PORT=$2
APP_PORT=8787
while [[ $(ss -tulw | grep $APP_PORT) != "" && $APP_PORT -lt 8800 ]]
do
APP_PORT=$(( $APP_PORT + 1))
done
if [[ $(ss -tulw | grep $APP_PORT) != "" ]]
then
echo "No tcp port available !!"
exit 1
fi
# Docker volumes
#realUSER=$(who am i | awk '{print $1}')
if [ $SUDO_USER ]; then realUSER=$SUDO_USER; else realUSER=`whoami`; fi
VOL_DEV=/media/bigvol/$realUSER
# MBB Workflows reads data from /Data and write results to /Results
mkdir -p ${VOL_DEV}/Data
mkdir -p ${VOL_DEV}/Results
DOCK_VOL+=" --mount type=bind,src=$VOL_DEV/Data,dst=/Data"
DOCK_VOL+=" --mount type=bind,src=$VOL_DEV/Results,dst=/Results"
CONTAINER_ID=$( docker run --rm -d -p $APP_PORT:3838 $DOCK_VOL $APP_IMG )
if [ -n "$CONTAINER_ID" ]
then
echo " "
echo You have to put your Data on : ${VOL_DEV}/Data
echo " "
echo Results will be written to : ${VOL_DEV}/Results
echo " "
hostname -I | awk -v port=$APP_PORT '{print "You can access the workflow interface at : http://"$1":"port}'
echo " "
echo To start a Bash session inside the container : docker exec -it $CONTAINER_ID /bin/bash
echo " "
echo To run the workflow without the interface : docker exec -it $CONTAINER_ID snakemake -s /workflow/Snakefile all --configfile config --cores XX
echo " "
echo config is a configuration file that must be in a subfolder of ${VOL_DEV}/Data or ${VOL_DEV}/Results
echo " "
echo e.g. if the file is in ${VOL_DEV}/Data/run1/maconfig1.yml : docker exec -it $CONTAINER_ID snakemake -s /workflow/Snakefile all --configfile /Data/run1/maconfig1.yml --cores XX
echo " "
echo You can use the graphical interface to generate a configuration file.
echo " "
echo XX is the number of cores the workflow will use.
else
echo Failed to run the docker container !!
fi
import os
import re
import sys
import snakemake.utils
import csv
#############
# Wildcards #
#############
SAMPLES = config["samples"]
STEPS = config["steps"]
PREPARE_REPORT_OUTPUTS = config["prepare_report_outputs"]
PREPARE_REPORT_SCRIPTS = config["prepare_report_scripts"]
OUTPUTS = config["outputs"]
PARAMS_INFO = config["params_info"]
config = config["params"]
##########
# Inputs #
##########
# Generic input functions
## get raw_reads
def raw_reads():
inputs = dict()
if (config["SeOrPe"] == "PE"):
inputs["read"] = config['sample_dir']+'/{sample}_R1'+config["sample_suffix"]
inputs["read2"] = config['sample_dir']+'/{sample}_R2'+config["sample_suffix"]
elif (config["SeOrPe"] == "SE"):
inputs["read"] = config['sample_dir']+'/{sample}'+config["sample_suffix"]
else:
sys.exit("SeOrPe should be SE or PE")
return inputs
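# Illustrative return values, assuming sample_dir="/Data/run1" and
# sample_suffix=".fastq.gz" (hypothetical values):
#   PE -> {"read": "/Data/run1/{sample}_R1.fastq.gz",
#          "read2": "/Data/run1/{sample}_R2.fastq.gz"}
#   SE -> {"read": "/Data/run1/{sample}.fastq.gz"}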
## get reads (trimmed or raw)
def reads():
return raw_reads()
# Tools inputs functions
def fastp_inputs():
return raw_reads()
def jellyfish_count_inputs():
inputs = dict()
if (config["preprocessing"] == "fastp"):
if (config["SeOrPe"] == "PE"):
inputs["read"] = expand(rules.fastp_PE.output.R1,sample=SAMPLES)
inputs["read2"] = expand(rules.fastp_PE.output.R2,sample=SAMPLES)
else:
inputs["read"] = expand(rules.fastp_SE.output.read,sample=SAMPLES)
else:
inputs["read"] = expand(raw_reads()["read"],sample=SAMPLES)
if (config["SeOrPe"] == "PE"):
inputs["read2"] = expand(raw_reads()["read2"],sample=SAMPLES)
return inputs
def prepare_report_inputs():
inputs = list()
for step in STEPS:
inputs.extend(step_outputs(step["name"]))
return inputs
def prepare_report_scripts():
scripts = list()
for step in STEPS:
tool = config[step["name"]]
script = tool+".prepare.report.R"
if (script in PREPARE_REPORT_SCRIPTS):
scripts.append("/workflow/scripts/"+script)
return scripts
def prepare_report_outputs():
outputs = list()
outputs.append(config["results_dir"] + "/outputs_mqc.csv")
for step in STEPS:
tool = config[step["name"]]
if (tool in PREPARE_REPORT_OUTPUTS.keys()):
for output in PREPARE_REPORT_OUTPUTS[tool]:
outputs.append(config["results_dir"]+"/"+tool+"/"+output)
return outputs
def multiqc_inputs():
# Depend on prepare_report outputs so multiqc always runs after prepare_report
return prepare_report_outputs()
###########
# Outputs #
###########
def step_outputs(step):
outputs = list()
if (step == "kmer_analysis"):
if (config[step] == "genomescope"):
outputs = rules.genomescope.output
elif (step == "all"):
outputs = list(rules.multiqc.output)
return outputs
# get outputs for each chosen tool
def workflow_outputs(step):
outputs = list()
outputs.extend(step_outputs(step))
return outputs
#########
# Rules #
#########
rule fastp_PE:
input:
**fastp_inputs()
output:
report_html = config["results_dir"]+"/"+config["fastp_PE_output_dir"]+"/fastp_report_{sample}.html",
report_json = config["results_dir"]+"/"+config["fastp_PE_output_dir"]+"/fastp_report_{sample}.json",
R1 = config["results_dir"]+"/"+config["fastp_PE_output_dir"]+"/{sample}_R1.fq.gz",
R2 = config["results_dir"]+"/"+config["fastp_PE_output_dir"]+"/{sample}_R2.fq.gz"
params:
complexity_threshold = config["fastp_complexity_threshold"],
report_title = config["fastp_report_title"],
adapter_sequence = config["fastp_adapter_sequence"],
adapter_sequence_R2 = config["fastp_adapter_sequence_R2_PE"],
P = config["fastp_P"],
output_dir = config["fastp_PE_output_dir"],
correction = "--correction " if config["fastp_correction_PE"] == True else "",
low_complexity_filter = "--low_complexity_filter " if config["fastp_low_complexity_filter"] == True else "",
overrepresentation_analysis = "--overrepresentation_analysis " if config["fastp_overrepresentation_analysis"] == True else "",
log:
config["results_dir"]+"/logs/fastp/{sample}_fastp_log.txt"
threads:
config["fastp_threads"]
shell:
"fastp "
"-i {input.read} "
"-I {input.read2} "
"-o {output.R1} "
"-O {output.R2} "
"-w {threads} "
"{params.correction} "
"{params.low_complexity_filter} "
"--complexity_threshold {params.complexity_threshold} "
"--html {output.report_html} "
"--json {output.report_json} "
"--report_title {params.report_title} "
"--adapter_sequence '{params.adapter_sequence}' "
"--adapter_sequence_r2 '{params.adapter_sequence_R2}' "
"{params.overrepresentation_analysis} "
"-P {params.P} "
"|& tee {log}"
rule fastp_SE:
input:
**fastp_inputs()
output:
report_html = config["results_dir"]+"/"+config["fastp_SE_output_dir"]+"/fastp_report_{sample}.html",
report_json = config["results_dir"]+"/"+config["fastp_SE_output_dir"]+"/fastp_report_{sample}.json",
read = config["results_dir"]+"/"+config["fastp_SE_output_dir"]+"/{sample}.fq.gz",
params:
complexity_threshold = config["fastp_complexity_threshold"],
report_title = config["fastp_report_title"],
adapter_sequence = config["fastp_adapter_sequence"],
P = config["fastp_P"],
output_dir = config["fastp_SE_output_dir"],
low_complexity_filter = "--low_complexity_filter " if config["fastp_low_complexity_filter"] == True else "",
overrepresentation_analysis = "--overrepresentation_analysis " if config["fastp_overrepresentation_analysis"] == True else "",
log:
config["results_dir"]+"/logs/fastp/{sample}_fastp_log.txt"
threads:
config["fastp_threads"]
shell:
"fastp "
"-i {input.read} "
"-o {output.R1} "
"-w {threads} "
"{params.low_complexity_filter} "
"--complexity_threshold {params.complexity_threshold} "
"--html {output.report_html} "
"--json {output.report_json} "
"--report_title {params.report_title} "
"--adapter_sequence '{params.adapter_sequence}' "
"{params.overrepresentation_analysis} "
"-P {params.P} "
"|& tee {log}"
rule jellyfish_count:
input:
**jellyfish_count_inputs(),
output:
kmer_counts = config["results_dir"] + "/" + config["jellyfish_count_output_dir"] + "/counts.jf",
params:
canonical_kmer = "-C" if config["jellyfish_count_canonical_kmer"] else "",
kmer_len = config["jellyfish_count_kmer_len"],
hash_size = config["jellyfish_count_hash_size"]
threads:
config["jellyfish_threads"]
log:
config["results_dir"]+'/logs/jellyfish/jellyfish_count_log.txt'
run:
inputs = jellyfish_count_inputs()
files = ""
if (config["SeOrPe"] == "PE"):
for r1,r2 in zip(inputs["read"],inputs["read2"]):
files += "<(zcat -f "+r1+") <(zcat -f "+r2+") "
else:
for r in inputs["read"]:
files += "<(zcat -f "+r+") "
shell(
"jellyfish count "+
"{params.canonical_kmer} "+
"-m {params.kmer_len} "+
"-s {params.hash_size} "+
"-t {threads} "+
"-o {output.kmer_counts} "+
files+
"|& tee {log}"
)
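# Illustrative command built by the run block above for two PE samples
# (sample names, k-mer length and hash size are hypothetical):
#   jellyfish count -C -m 21 -s 100M -t {threads} -o .../counts.jf \
#     <(zcat -f s1_R1.fq.gz) <(zcat -f s1_R2.fq.gz) \
#     <(zcat -f s2_R1.fq.gz) <(zcat -f s2_R2.fq.gz) |& tee {log}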
rule jellyfish_histo:
input:
kmer_counts = rules.jellyfish_count.output.kmer_counts,
output:
kmer_histo = config["results_dir"] + "/" + config["jellyfish_histo_output_dir"] + "/kmer_histo_jf.hist",
threads:
config["jellyfish_threads"]
shell:
"jellyfish histo "
"-t {threads} "
"{input.kmer_counts} > {output.kmer_histo} "
rule genomescope:
input:
kmer_histo = rules.jellyfish_histo.output.kmer_histo
output:
GenomeScope_Profile = config["results_dir"] + "/" + config["genomescope_output_dir"] + "/GenomeScope_Profile_mqc.png",
GenomeScope_Profile_log_scale = config["results_dir"] + "/" + config["genomescope_output_dir"] + "/GenomeScope_Profile_log_scale_mqc.png",
Summary = config["results_dir"] + "/" + config["genomescope_output_dir"] + "/GenomeScope_Summary_mqc.csv"
params:
output_dir = config["results_dir"] + "/" + config["genomescope_output_dir"],
kmer_len = config["jellyfish_count_kmer_len"],
reads_len = config["genomescope_reads_len"]
log:
config["results_dir"]+'/logs/genomescope/genomescope_log.txt'
shell:
"Rscript /opt/biotools/genomescope.R "
"{input.kmer_histo} "
"{params.kmer_len} "
"{params.reads_len} "
"{params.output_dir} "
"|& tee {log};"
# prepare mqc custom content
"mv {params.output_dir}/plot.png {params.output_dir}/GenomeScope_Profile_mqc.png && "
"mv {params.output_dir}/plot.log.png {params.output_dir}/GenomeScope_Profile_log_scale_mqc.png && "
"tail -n +4 {params.output_dir}/summary.txt | sed 's/ \{{2,\}}/\t/g' | sed 's/\t$//g' > {params.output_dir}/GenomeScope_Summary_mqc.csv"
rule prepare_report:
input:
*prepare_report_inputs(),
output:
*prepare_report_outputs(),
config_multiqc = config["results_dir"] + "/config_multiqc.yaml",
params_tab = config["results_dir"] + "/params_tab_mqc.csv"
params:
params_file = config["results_dir"]+"/params.yml",
results_dir = config["results_dir"]
log:
config["results_dir"]+"/logs/prepare_report_log.txt"
run:
# Specific scripts for each tool
for script in prepare_report_scripts():
shell("Rscript "+script+" {params.params_file} |& tee {log}")
# Outputs files for Multiqc report
outfile = config["results_dir"] + "/outputs_mqc.csv"
head = """
# description: 'This is the list of the files generated by each step of the workflow'
# section_name: 'Workflow outputs'
"""
with open(outfile,"w") as out:
out.write(head)
out.write("step\ttool\tfile\tdescription\n")#\tname
for step in STEPS:
tool = config[step["name"]]
i=1
for command in OUTPUTS[tool]:
if ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
outputs = OUTPUTS[tool][command]
for files in outputs:
name = files["file"] if 'file' in files.keys() else files["directory"]
path = config[command+"_output_dir"] + "/" + name #config["results_dir"] +"/"+
out.write(str(i)+"-"+step["title"]+"\t"+tool+"\t"+path+"\t"+files["description"]+"\n")#"\t"+files["name"]+
i+=1
# Params list for Multiqc report
params_list = "params_name\tdescription\tvalue\n"
head = """# description: 'This is the list of the parameters for each rule'
# section_name: 'Workflow parameters'
"""
for step in STEPS:
tool = config[step["name"]]
for key, value in config.items():
# keep only params of the chosen tools, filtered by SE/PE mode
if ((tool in key and tool != "null") or key in ["results_dir","sample_dir","sample_suffix","SeOrPe"]) and ((config["SeOrPe"] == "SE" and not("_PE" in key)) or (config["SeOrPe"] == "PE" and not("_SE" in key))):
if (key in PARAMS_INFO.keys() and "label" in PARAMS_INFO[key].keys()):
description = PARAMS_INFO[key]["label"]
else:
description = ''
params_list += key + "\t'" + description + "'\t'" + str(value) + "'\n"
with open(output.params_tab,"w") as out:
out.write(head)
out.write(params_list)
# Config for Multiqc report
shell("python3 /workflow/generate_multiqc_config.py {params.params_file} {output.config_multiqc}")
rule multiqc:
input:
multiqc_inputs(),
config_multiqc = config["results_dir"] + "/config_multiqc.yaml"
output:
multiqc_dir = directory(config["results_dir"]+"/multiqc_data")
params:
output_dir = config["results_dir"]
log:
config["results_dir"]+'/logs/multiqc/multiqc_log.txt'
shell:
"multiqc --config {input.config_multiqc} -f {params.output_dir} "
"-o {params.output_dir} |& tee {log}"
# Final Snakemake rule waiting for the outputs of the final step chosen by the user (default: all steps)
rule all:
input:
workflow_outputs("all")
output:
Snakefile = config["results_dir"]+"/workflow/Snakefile",
get_samples = config["results_dir"]+"/workflow/get_samples.py",
scripts = directory(config["results_dir"]+"/workflow/scripts"),
params = config["results_dir"]+"/workflow/params.yml"
params:
params_file = config["results_dir"]+"/params.yml",
shell:
"cp /workflow/Snakefile {output.Snakefile} && "
"cp /workflow/get_samples.py {output.get_samples} && "
"cp -r /workflow/scripts {output.scripts} && "
"cp {params.params_file} {output.params}"
onsuccess:
print("Workflow finished, no error")
shell("touch "+config["results_dir"]+"/logs/workflow_end.ok")
onerror:
print("An error occurred")
shell("cat {log} > "+config["results_dir"]+"/logs/workflow_end.error")
#shell("mail -s "an error occurred" youremail@provider.com < {log}")
import re
import sys
from tools import *
config = read_yaml(sys.argv[1])
def report_section_order():
res = "skip_generalstats: true\n\n"
res += "report_section_order:\n"
res += " Rule_graph:\n"
res += " order: 990\n"
res += " params_tab:\n"
res += " order: 980\n"
res += " outputs:\n"
res += " order: 970\n"
cpt = 960
for step in config["steps"]:
tool = config["params"][step["name"]]
if (config["multiqc"][tool] != "custom"):
res += " " + config["multiqc"][tool] + ":\n"
res += " " + "order: " + str(cpt) + "\n"
cpt += -10
for rule in config["outputs"][tool]:
if ((config["params"]["SeOrPe"] == "SE" and not("_PE" in rule)) or (config["params"]["SeOrPe"] == "PE" and not("_SE" in rule))):
for output in config["outputs"][tool][rule]:
if("file" in output.keys() and "mqc" in output["file"] and '{' not in output["file"]): # case of dynamic files ({wildcard}_mqc.png) to deal with
section = re.sub(r'_mqc.*$', '', output["file"])
res += " " + section + ":\n"
res += " " + "order: " + str(cpt) + "\n"
cpt += -10
return res
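# Illustrative YAML emitted by report_section_order() (section names depend on
# the configured tools; GenomeScope_Profile is one example from this workflow):
#   skip_generalstats: true
#
#   report_section_order:
#     Rule_graph:
#       order: 990
#     params_tab:
#       order: 980
#     outputs:
#       order: 970
#     GenomeScope_Profile:
#       order: 960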
def main():
res = ""
res += report_section_order()
with open(sys.argv[2],"w") as out:
out.write(res)
if __name__ == "__main__":
    main()