Commit 064f5da3 authored by mmassaviol's avatar mmassaviol
Browse files

Add the interop_report workflow as a submodule

parent 77673dc1
......@@ -40,3 +40,6 @@
[submodule "microbial_metagenome"]
path = microbial_metagenome
url = https://gitlab.mbb.univ-montp2.fr/mmassaviol/microbial_metagenome
[submodule "interop_report"]
path = interop_report
url = https://gitlab.mbb.univ-montp2.fr/mmassaviol/interop_report
\ No newline at end of file
# Build stage: MBB workflows base image plus the Illumina InterOp toolkit.
FROM mmassaviol/mbb_workflows_base:latest AS alltools

# Expose the InterOp command-line tools (summary, index-summary, dumptext, plot_*).
ENV PATH=/opt/biotools/InterOp-1.1.8-Linux-GNU/bin/:$PATH

# Fetch and unpack the pinned InterOp release; drop the archive in the same
# layer so it does not bloat the image.
RUN cd /opt/biotools \
    && wget https://github.com/Illumina/interop/releases/download/v1.1.8/InterOp-1.1.8-Linux-GNU.tar.gz \
    && tar -xvzf InterOp-1.1.8-Linux-GNU.tar.gz \
    && rm InterOp-1.1.8-Linux-GNU.tar.gz

# gnuplot renders the InterOp plot_* scripts into PNG images.
# Fix: use apt-get (apt's CLI is not script-stable), refresh the package index
# in the same layer, skip recommended packages and clean the apt lists.
RUN apt-get update \
    && apt-get install -y --no-install-recommends gnuplot \
    && rm -rf /var/lib/apt/lists/*

# UTF-8 locale so R/shiny and the report tooling handle non-ASCII text.
ENV LANG=en_US.UTF-8 \
    LANGUAGE=en_US:en \
    LC_ALL=en_US.UTF-8

# This part is necessary to run on the ISEM cluster: mount points, shells and
# the sge user/group expected by the GridEngine environment.
# NOTE(review): "/export/scrach" looks like a typo for "scratch" — confirm it
# matches the actual cluster mount point before changing it.
RUN mkdir -p /share/apps/bin \
    && mkdir -p /share/apps/lib \
    && mkdir -p /share/apps/gridengine \
    && mkdir -p /share/bio \
    && mkdir -p /opt/gridengine \
    && mkdir -p /export/scrach \
    && mkdir -p /usr/lib64 \
    && ln -s /bin/bash /bin/mbb_bash \
    && ln -s /bin/bash /bin/isem_bash \
    && /usr/sbin/groupadd --system --gid 400 sge \
    && /usr/sbin/useradd --system --uid 400 --gid 400 -c GridEngine --shell /bin/true --home /opt/gridengine sge

# Shiny application port (documentation only; publish with -p at run time).
EXPOSE 3838
CMD ["Rscript", "-e", "setwd('/sagApp/'); shiny::runApp('/sagApp/app.R',port=3838 , host='0.0.0.0')"]

# Final stage: the tools plus the workflow definition and the shiny front-end.
FROM alltools
COPY files /workflow
COPY sagApp /sagApp
#!/bin/bash
# Run an MBB workflow inside its docker image.
#
# Usage: $0 dataDir resultsDir configFile nbCores [dockerHub|local]
#   dataDir    : host directory bind-mounted as /Data in the container
#   resultsDir : host directory bind-mounted as /Results (created if missing)
#   configFile : snakemake config file path as seen from inside the container,
#                so it must live under /Data or /Results
#   nbCores    : value passed to snakemake --cores
#   5th arg    : image source: dockerHub (default), local build, or mbb registry

if [ $# -lt 4 ]
then
    echo "usage : $0 dataDir resultsDir configFile nbCores [dockerHub|local]"
    # Fix: exit non-zero on a usage error (the bare `exit` returned 0).
    exit 1
fi

# Docker volumes
# MBB Workflows read data from /Data and write results to /Results
Data=$1
Results=$2

if [ ! -d "$Data" ]; then
    echo "can't find $Data directory !"
    exit 1
fi

mkdir -p "$Results"

DOCK_VOL+=" --mount type=bind,src=$Data,dst=/Data"
DOCK_VOL+=" --mount type=bind,src=$Results,dst=/Results"

# config file must be in /Data or /Results !
config=$3
cores=$4

if [ $# -lt 5 ]
then
    APP_IMG="mbbteam/interopreport:latest"
else
    IMG_SRC=$5
    case $IMG_SRC in
        dockerHub )
            APP_IMG="mbbteam/interopreport:latest" ;;
        local)
            docker build . -t interopreport:latest
            APP_IMG="interopreport:latest" ;;
        mbb)
            #APP_IMG="X.X.X.X:5000/interopreport:latest" ;;
    esac
fi

# Fix: docker refuses to start when the cidfile already exists, so remove any
# leftover from a previous run first.
rm -f CID.txt

# DOCK_VOL is intentionally unquoted: it holds several --mount arguments that
# must be word-split.
docker run --rm $DOCK_VOL --cidfile="CID.txt" "$APP_IMG" snakemake -s /workflow/Snakefile all --configfile "$config" --cores "$cores"

CONTAINER_ID=$(cat CID.txt)
if [ "$CONTAINER_ID" ]
then
    echo " "
    echo "Results were written to : $2"
    echo " "
else
    echo "Failed to run the docker container !!"
fi
#!/bin/bash
# Deploy the workflow docker image on an MBB bigmem machine.
#
# Usage: $0 dataDir resultsDir [dockerHub|local]
#   dataDir    : host directory bind-mounted as /Data (created if missing)
#   resultsDir : host directory bind-mounted as /Results (created if missing)
#   3rd arg    : image source: dockerHub (default), local build, or mbb registry

if [ $# -lt 2 ]
then
    echo "usage : $0 dataDir resultsDir [dockerHub|local]"
    # Fix: exit non-zero on a usage error (the bare `exit` returned 0).
    exit 1
fi

#nginx
##### nginx install #####
#sudo apt-get install -y nginx
# HOST_NAME="192.168.100.49"
# HTTP_ENDP="https://$HOST_NAME"
# openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/ssl/private/nginx-selfsigned.key -out /etc/ssl/certs/nginx-selfsigned.crt -subj "/C=FR/ST=LR/L=Montpellier/O=CNRS/OU=CNRS-ISEM/CN=mbb.univ-montp2.fr"
# openssl dhparam -out /etc/ssl/certs/dhparam.pem 2048
# mkdir -p /etc/nginx/snippets
# echo "ssl_certificate /etc/ssl/certs/nginx-selfsigned.crt;" > /etc/nginx/snippets/self-signed.conf
# echo "ssl_certificate_key /etc/ssl/private/nginx-selfsigned.key;" >> /etc/nginx/snippets/self-signed.conf
# cp system/nginx_snippets_ssl-params.conf /etc/nginx/snippets/ssl-params.conf
# cp /etc/nginx/sites-available/default /etc/nginx/sites-available/default.bak
# cp system/nginx_sites-available_default /etc/nginx/sites-available/default
# sed -i "s|server_domain_or_IP|$HOST_NAME|" /etc/nginx/sites-available/default
# useradd nginx
# cp system/nginx_nginx.conf /etc/nginx/nginx.conf
# cp system/nginx_conf.d_10-rstudio.conf /etc/nginx/conf.d/10-rstudio.conf
# sed -i "s|example.com|$HOST_NAME|" /etc/nginx/conf.d/10-rstudio.conf
# systemctl restart nginx
# systemctl enable nginx

# Try a range of TCP ports between 8787 and 8800, picking the first free one.
#APP_PORT=$2
APP_PORT=8787
# Fix: compare ports numerically with -lt; the original `<` inside [[ ]] is a
# lexicographic string comparison.
while [[ $(ss -tulw | grep $APP_PORT) != "" && $APP_PORT -lt 8800 ]]
do
    APP_PORT=$(( APP_PORT + 1 ))
done

if [[ $(ss -tulw | grep $APP_PORT) != "" ]]
then
    echo "No tcp port available !!"
    exit 1
fi

# Docker volumes
# MBB Workflows read data from /Data and write results to /Results
# NOTE(review): realUSER is computed but never used in this script — confirm
# whether a later step relies on it before removing.
if [ $SUDO_USER ]; then realUSER=$SUDO_USER; else realUSER=$(whoami); fi
Data=$1
Results=$2
mkdir -p "$Data"
mkdir -p "$Results"
DOCK_VOL+=" --mount type=bind,src=$Data,dst=/Data"
DOCK_VOL+=" --mount type=bind,src=$Results,dst=/Results"

if [ $# -lt 3 ]
then
    APP_IMG="mbbteam/interopreport:latest"
else
    IMG_SRC=$3
    case $IMG_SRC in
        dockerHub )
            APP_IMG="mbbteam/interopreport:latest" ;;
        local)
            docker build . -t interopreport:latest
            APP_IMG="interopreport:latest" ;;
        mbb)
            #APP_IMG="X.X.X.X:5000/interopreport:latest" ;;
    esac
fi

# DOCK_VOL is intentionally unquoted: it holds several --mount arguments.
CONTAINER_ID=$( docker run --rm -d -p "$APP_PORT":3838 $DOCK_VOL "$APP_IMG" )
if [ "$CONTAINER_ID" ]
then
    echo " "
    echo "You have to put your Data on : $1"
    echo " "
    echo "Results will be written to : $2"
    echo " "
    hostname -I | awk -v port="$APP_PORT" '{print "You can access the workflow interface at : http://"$1":"port}'
    echo " "
    echo "To start a Bash session inside the container : docker exec -it $CONTAINER_ID /bin/bash"
else
    echo "Failed to run the docker container !!"
fi
#!/bin/bash
# This script is executed on the virtual machine during the *Deployment* phase.
# It is used to apply parameters specific to the current deployment.
# It is executed secondly during a cloud deployment in IFB-Biosphere, after the *Installation* phase.
#
# Optional 1st argument selects the image source: ifb, docker (default), local or mbb.

if [ $# -lt 1 ]
then
    APP_IMG="mbbteam/interopreport:latest"
else
    IMG_SRC=$1
    case $IMG_SRC in
        ifb)
            APP_IMG="gitlab-registry.in2p3.fr/ifb-biosphere/apps/interopreport:master" ;;
        docker )
            APP_IMG="mbbteam/interopreport:latest" ;;
        local)
            docker build . -t interopreport:latest
            APP_IMG="interopreport:latest" ;;
        mbb)
            #APP_IMG="X.X.X.X:5000/interopreport:latest" ;;
    esac
fi

# Tuning if site proxy or not
#CLOUD_SERVICE = $(ss-get cloudservice)
#CLOUD_SERVICE="ifb-genouest-genostack"
#HOST_NAME=$( ss-get --timeout=3 hostname )
HOST_NAME="192.168.100.49"
#if [ "$CLOUD_SERVICE" == "ifb-genouest-genostack" ]; then
    # Cloud site WITH a site proxy
#    APP_PORT=80
#    PROXIED_IP=$( echo $HOST_NAME | sed "s|\.|-|g")
#    HOST_NAME="openstack-${PROXIED_IP}.genouest.org"
#    HTTP_ENDP="https://$HOST_NAME"
#    systemctl stop nginx
#else
# Cloud site WITHOUT a site proxy: terminate TLS with a self-signed
# certificate behind a local nginx reverse proxy.
APP_PORT=8787
HTTP_ENDP="https://$HOST_NAME"
openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout /etc/ssl/private/nginx-selfsigned.key -out /etc/ssl/certs/nginx-selfsigned.crt -subj "/C=FR/ST=AURA/L=Lyon/O=IFB/OU=IFB-biosphere/CN=myrstudio.biosphere.france-bioinformatique.fr"
openssl dhparam -out /etc/ssl/certs/dhparam.pem 2048
mkdir -p /etc/nginx/snippets
echo "ssl_certificate /etc/ssl/certs/nginx-selfsigned.crt;" > /etc/nginx/snippets/self-signed.conf
echo "ssl_certificate_key /etc/ssl/private/nginx-selfsigned.key;" >> /etc/nginx/snippets/self-signed.conf
cp system/nginx_snippets_ssl-params.conf /etc/nginx/snippets/ssl-params.conf
cp /etc/nginx/sites-available/default /etc/nginx/sites-available/default.bak
cp system/nginx_sites-available_default /etc/nginx/sites-available/default
sed -i "s|server_domain_or_IP|$HOST_NAME|" /etc/nginx/sites-available/default
useradd nginx
cp system/nginx_nginx.conf /etc/nginx/nginx.conf
cp system/nginx_conf.d_10-rstudio.conf /etc/nginx/conf.d/10-rstudio.conf
sed -i "s|example.com|$HOST_NAME|" /etc/nginx/conf.d/10-rstudio.conf
systemctl restart nginx
systemctl enable nginx
#fi

# Docker volumes
# mydatalocal: from the system disk or ephemeral one
IFB_DATADIR="/ifb/data/"
source /etc/profile.d/ifb.sh
VOL_NAME="mydatalocal"
VOL_DEV=$(readlink -f -n $IFB_DATADIR/$VOL_NAME )
DOCK_VOL=" --mount type=bind,src=$VOL_DEV,dst=$IFB_DATADIR/$VOL_NAME"

# MBB Workflows read data from /Data and write results to /Results
# Fix: -p so a redeployment does not error when the directories already exist.
mkdir -p ${VOL_DEV}/Data
mkdir -p ${VOL_DEV}/Results
DOCK_VOL+=" --mount type=bind,src=$VOL_DEV/Data,dst=/Data"
DOCK_VOL+=" --mount type=bind,src=$VOL_DEV/Results,dst=/Results"

# NFS mounts: from ifb_share configuration in autofs
# Fix: the loop previously read into VOL_NAME/VOL_DEV, clobbering the values
# used by the messages below; use dedicated SHARE_* loop variables instead.
IFS_ORI=$IFS
while IFS=" :" read SHARE_NAME SHARE_TYPE SHARE_IP SHARE_DEV ; do
    DOCK_VOL+=" --mount type=volume,volume-driver=local,volume-opt=type=nfs,src=$SHARE_NAME,dst=$IFB_DATADIR/$SHARE_NAME,volume-opt=device=:$SHARE_DEV,volume-opt=o=addr=$SHARE_IP"
done < /etc/auto.ifb_share
IFS=$IFS_ORI

CONTAINER_ID=$( docker run -d -p $APP_PORT:3838 $DOCK_VOL $APP_IMG )
VM_IP=$(curl bot.whatismyipaddress.com)
if [ "$CONTAINER_ID" ]
then
    echo " "
    echo "You have to put your Data on : ${VOL_DEV}/Data"
    echo " "
    echo "Results will be written to : ${VOL_DEV}/Results"
    echo " "
    echo "You can access the workflow interface at : https://${VM_IP}"
    echo " "
    echo "To start a Bash session inside the container : docker exec -it $CONTAINER_ID /bin/bash"
    echo " "
    echo "To run the workflow without the interface : docker exec -it $CONTAINER_ID snakemake -s /workflow/Snakefile all --configfile config --cores XX"
    echo " "
    echo "config est un fichier de configuration qui doit être dans un sous dossier de ${VOL_DEV}/Data ou ${VOL_DEV}/Results"
    echo " "
    echo "ex. si fichier dans ${VOL_DEV}/Data/run1/maconfig1.yml : docker exec -it $CONTAINER_ID snakemake -s /workflow/Snakefile all --configfile /Data/run1/maconfig1.yml --cores XX"
    echo " "
    # Fix: the unquoted l''interface collapsed to "linterface".
    echo "Vous pouvez utiliser l'interface graphique pour générer un fichier de configuration."
    echo " "
    echo "XX étant le nombre de coeurs qui seront utilisés par le workflow."
else
    echo "Failed to run the docker container !!"
fi
import os
import re
import snakemake.utils
import csv
#############
# Wildcards #
#############
# Workflow description loaded from the yaml passed with --configfile.
STEPS = config["steps"]  # ordered step descriptors, e.g. {"name": ..., "title": ...}
PREPARE_REPORT_OUTPUTS = config["prepare_report_outputs"]  # tool -> report files its prepare script produces
PREPARE_REPORT_SCRIPTS = config["prepare_report_scripts"]  # names of the shipped <tool>.prepare.report.R scripts
OUTPUTS = config["outputs"]  # tool -> command -> list of output descriptors
PARAMS_INFO = config["params_info"]  # parameter name -> metadata (e.g. "label") for the report
# From here on, `config` refers only to the "params" sub-dictionary.
config = config["params"]
##########
# Inputs #
##########
# Tools inputs functions
def interop_inputs():
    # Inputs of the `interop` rule: the RunInfo.xml inside the analysis directory.
    return {"runinfo": config["interop_analysis_dir"] + "/RunInfo.xml"}
def prepare_report_inputs():
    # The report step waits on every output of every configured step.
    collected = []
    for step in STEPS:
        collected += step_outputs(step["name"])
    return collected
def prepare_report_scripts():
    # Absolute paths of the per-tool report-preparation R scripts, keeping
    # only those actually shipped with the workflow.
    candidates = (config[step["name"]] + ".prepare.report.R" for step in STEPS)
    return ["/workflow/scripts/" + name for name in candidates if name in PREPARE_REPORT_SCRIPTS]
def prepare_report_outputs():
    # outputs_mqc.csv is always produced; per-tool report files are added
    # for tools that have entries in PREPARE_REPORT_OUTPUTS.
    produced = [config["results_dir"] + "/outputs_mqc.csv"]
    for step in STEPS:
        tool = config[step["name"]]
        for output in PREPARE_REPORT_OUTPUTS.get(tool, []):
            produced.append(config["results_dir"] + "/" + tool + "/" + output)
    return produced
def multiqc_inputs():
    # Reuse the report outputs so multiqc still waits on prepare_report
    # even when that rule declares no outputs of its own.
    return prepare_report_outputs()
###########
# Outputs #
###########
def step_outputs(step):
    # Map a step name to the snakemake outputs it produces; unknown
    # names yield an empty list.
    if step == "all":
        return list(rules.multiqc.output)
    if step == "interop_read_metrics":
        return rules.interop.output
    return list()
# get outputs for each chosen tool
def workflow_outputs(step):
    # Final targets of the workflow: everything produced by `step`.
    return list(step_outputs(step))
#########
# Rules #
#########
# Run the Illumina InterOp command-line tools on the run folder, producing
# csv tables and gnuplot-rendered PNG plots named *_mqc* so multiqc picks
# them up. The sed calls reformat InterOp's padded text output into plain
# csv, and rewrite each gnuplot script's "set output" target so the PNGs
# land in the output directory with the _mqc suffix.
rule interop:
    input:
        **interop_inputs()
    output:
        summary = config["results_dir"]+"/"+config["interop_output_dir"]+"/summary.csv",
        index_summary = config["results_dir"]+"/"+config["interop_output_dir"]+"/index_summary.csv",
        dump_text = config["results_dir"]+"/"+config["interop_output_dir"]+"/dump_text.csv",
    log: config["results_dir"]+'/logs/interop/interop_log.txt'
    params:
        analysis_dir = config["interop_analysis_dir"],
        output_dir = config["results_dir"]+"/"+config["interop_output_dir"],
    shell:
        "cd {params.output_dir}; "
        "summary {params.analysis_dir} | sed 's/ *,/,/g' > summary.csv " #remove unneeded whitespaces
        "&& index-summary {params.analysis_dir} | sed 's/ \{{2,\}}/,/g' | sed 's/ CV/,CV/' | sed 's/^ //g' | sed 's/,$//g' > index_summary.csv " # format as a csv for multiqc
        "&& dumptext {params.analysis_dir} > dump_text.csv "
        "&& imaging_table {params.analysis_dir} | sed 's/;/,/g' > imaging_table.csv "
        "&& plot_qscore_heatmap {params.analysis_dir} | sed \"s/set output '.*'/set output 'Qscore_heatmap_mqc.png'/\" | gnuplot "
        "&& plot_qscore_histogram {params.analysis_dir} | sed \"s/set output '.*'/set output 'Qscore_histogram_mqc.png'/\" | gnuplot "
        "&& plot_by_cycle {params.analysis_dir} | sed \"s/set output '.*'/set output 'Intensity_by_cycle_mqc.png'/\" | gnuplot "
        "&& plot_by_lane {params.analysis_dir} | sed \"s/set output '.*'/set output 'Cluster_count_by_lane_mqc.png'/\" | gnuplot "
        "&& plot_flowcell {params.analysis_dir} | sed \"s/set output '.*'/set output 'Flowcell_intensity_mqc.png'/\" | gnuplot "
# Run the per-tool report-preparation R scripts, then generate the csv
# tables (output list, parameter list) and the multiqc configuration used
# to build the final report.
rule prepare_report:
    input:
        *prepare_report_inputs(),
    output:
        *prepare_report_outputs(),
        config_multiqc = config["results_dir"] + "/config_multiqc.yaml",
        params_tab = config["results_dir"] + "/params_tab_mqc.csv"
    params:
        # Path of the yaml given with --configfile (older snakemake API attribute).
        params_file = workflow.overwrite_configfile,
        results_dir = config["results_dir"]
    log:
        config["results_dir"]+"/logs/prepare_report_log.txt"
    run:
        # Specific scripts for each tool
        for script in prepare_report_scripts():
            shell("Rscript "+script+" {params.params_file} |& tee {log}")
        # Outputs files for Multiqc report
        outfile = config["results_dir"] + "/outputs_mqc.csv"
        head = """
# description: 'This is the list of the files generated by each step of the workflow'
# section_name: 'Workflow outputs'
"""
        with open(outfile,"w") as out:
            out.write(head)
            out.write("step\ttool\tfile\tdescription\n")#\tname
            for step in STEPS:
                tool = config[step["name"]]
                i=1
                # Keep only the commands matching the single/paired-end mode.
                for command in OUTPUTS[tool]:
                    if ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
                        outputs = OUTPUTS[tool][command]
                        for files in outputs:
                            # Each descriptor names either a file or a directory.
                            name = files["file"] if 'file' in files.keys() else files["directory"]
                            path = config[command+"_output_dir"] + "/" + name #config["results_dir"] +"/"+
                            out.write(str(i)+"-"+step["title"]+"\t"+tool+"\t"+path+"\t"+files["description"]+"\n")#"\t"+files["name"]+
                            i+=1
        # Params list for Multiqc report
        params_list = "params_name\tdescription\tvalue\n"
        head = """# description: 'This is the list of the parameters for each rule'
# section_name: 'Workflow parameters'
"""
        for step in STEPS:
            tool = config[step["name"]]
            for key, value in config.items():
                # NOTE(review): `command` below leaks from the outputs loop above,
                # so the SE/PE filter tests the last command of the last tool for
                # every parameter — confirm whether `key` was intended here.
                if (tool in key and tool != "null") or (key in ["results_dir","sample_dir","sample_suffix","SeOrPe"]) and ((config["SeOrPe"] == "SE" and not("_PE" in command)) or (config["SeOrPe"] == "PE" and not("_SE" in command))):
                    if (key in PARAMS_INFO.keys() and "label" in PARAMS_INFO[key].keys()):
                        description = PARAMS_INFO[key]["label"]
                    else:
                        description = ''
                    params_list += key + "\t'" + description + "'\t'" + str(value) + "'\n"
        with open(output.params_tab,"w") as out:
            out.write(head)
            out.write(params_list)
        # Config for Multiqc report
        shell("python3 /workflow/generate_multiqc_config.py {params.params_file} {output.config_multiqc}")
# Aggregate all generated *_mqc files and tables into a single html report
# with multiqc (-f forces overwrite of a previous report).
rule multiqc:
    input:
        multiqc_inputs(),
        config_multiqc = config["results_dir"] + "/config_multiqc.yaml"
    output:
        multiqc_dir = directory(config["results_dir"]+"/multiqc_data")
    params:
        output_dir = config["results_dir"]
    log:
        config["results_dir"]+'/logs/multiqc/multiqc_log.txt'
    shell:
        "multiqc --config {input.config_multiqc} -f {params.output_dir} "
        "-o {params.output_dir} |& tee {log}"
# Final Snakemake rule waiting for outputs of the final step chosen by user (default all steps).
# It also archives a copy of the workflow (Snakefile, scripts, parameters)
# next to the results for reproducibility.
rule all:
    input:
        workflow_outputs("all")
    output:
        Snakefile = config["results_dir"]+"/workflow/Snakefile",
        get_samples = config["results_dir"]+"/workflow/get_samples.py",
        scripts = directory(config["results_dir"]+"/workflow/scripts"),
        params = config["results_dir"]+"/workflow/params.yml"
    params:
        # Path of the yaml given with --configfile (older snakemake API attribute).
        params_file = workflow.overwrite_configfile,
    shell:
        "cp /workflow/Snakefile {output.Snakefile} && "
        "cp /workflow/get_samples.py {output.get_samples} && "
        "cp -r /workflow/scripts {output.scripts} && "
        "cp {params.params_file} {output.params}"
# Drop a marker file in the logs directory so callers can tell how the run ended.
onsuccess:
    print("Workflow finished, no error")
    shell("touch "+config["results_dir"]+"/logs/workflow_end.ok")

onerror:
    print("An error occurred")
    shell("cat {log} > "+config["results_dir"]+"/logs/workflow_end.error")
    #shell("mail -s "an error occurred" youremail@provider.com < {log}")
import re
import sys
from tools import *
# Load the workflow configuration (yaml path given as first CLI argument).
config = read_yaml(sys.argv[1])
def report_section_order():
    """Build the multiqc yaml snippet that pins the order of report sections.

    Fixed sections (rule graph, parameter table, output list) come first,
    then one entry per tool section and per custom *_mqc output file, in
    configured step order with decreasing `order` priority.
    """
    res = "skip_generalstats: true\n\n"
    res += "report_section_order:\n"
    res += "    Rule_graph:\n"
    res += "        order: 990\n"
    res += "    params_tab:\n"
    res += "        order: 980\n"
    res += "    outputs:\n"
    res += "        order: 970\n"
    cpt = 960  # next priority, decremented by 10 per emitted section
    for step in config["steps"]:
        tool = config["params"][step["name"]]
        if (config["multiqc"][tool] != "custom"):
            # The tool has a native multiqc module: order it as a whole section.
            res += "    " + config["multiqc"][tool] + ":\n"
            res += "        order: " + str(cpt) + "\n"
            cpt += -10
        for rule in config["outputs"][tool]:
            # Keep only the outputs matching the single/paired-end mode.
            if ((config["params"]["SeOrPe"] == "SE" and not("_PE" in rule)) or (config["params"]["SeOrPe"] == "PE" and not("_SE" in rule))):
                for output in config["outputs"][tool][rule]:
                    # Custom *_mqc files become their own section; dynamic names
                    # ({wildcard}_mqc.png) cannot be pinned, so they are skipped.
                    if ("file" in output.keys() and "mqc" in output["file"] and '{' not in output["file"]):
                        # Fix: raw string — '\_' is an invalid escape sequence in
                        # a normal string literal.
                        section = re.sub(r'_mqc.*$', '', output["file"])
                        res += "    " + section + ":\n"
                        res += "        order: " + str(cpt) + "\n"
                        cpt += -10
    return res
def main():
    # Assemble the multiqc configuration text and write it to the file
    # given as second CLI argument.
    content = report_section_order()
    with open(sys.argv[2], "w") as out:
        out.write(content)

if __name__ == "__main__":
    # execute only if run as a script
    main()
\ No newline at end of file
#!/usr/bin/env python3
# This script will take a directory and a parameter to tell if the reads are paired end or single end and return the sample list and the suffix
# Needs 2 arguments: reads_directory, SeOrPe
# SeOrPe is SE for single end reads and PE for paired end reads
# Usage: ./get_samples.py reads_directory SeOrPe
import os
import re
import csv
import sys
def sample_list(dir, SeOrPe):
    """Scan `dir` and derive the sample basenames plus their shared suffix.

    PE mode splits filenames on the first _R1/_R2 marker; SE mode splits on
    the first dot. Returns {'samples': sorted names, 'suffix': common suffix}.
    Exits with an error message when matched files carry different suffixes.
    """
    if SeOrPe == "PE":
        pattern = re.compile(r"^(.+?)(_R1|_R2)(.+)")
    else:
        pattern = re.compile(r"^(.+?)(\..*)")
    samples = []
    suffixes = []
    for entry in os.listdir(dir):
        hit = pattern.match(entry)
        if hit is None:
            continue
        if hit.group(1) not in samples:
            samples.append(hit.group(1))
        suffixes.append(hit.group(3) if SeOrPe == "PE" else hit.group(2))
    if len(set(suffixes)) == 1:
        return {'samples': sorted(samples), 'suffix': suffixes[0]}
    else:
        exit("Files have different suffixes:" + ','.join(suffixes))
def main():
    # CLI entry point: expects exactly reads_directory and SeOrPe.
    if len(sys.argv) != 3:
        exit("""Needs 2 arguments: reads_directory, SeOrPe
Usage: ./get_samples.py reads_directory SeOrPe""")
    print(sample_list(sys.argv[1], sys.argv[2]))

if __name__ == "__main__":
    # execute only if run as a script
    main()
pipeline: Interop_report
params:
results_dir: /Results
sample_dir: /Data
SeOrPe: PE
interop_read_metrics: interop
interop_output_dir: interop
interop_analysis_dir: /Data
samples: []
groups: []
steps: