Commit 53aceb58 authored by khalid's avatar khalid
Browse files

Added missing files

parent 1fc84b7e
# Workflow image: installs the command-line tools used by the pipeline and
# starts the Shiny application stored in /sagApp on port 3838.
#
# NOTE(review): the base image is referenced by the mutable `latest` tag, so
# rebuilds are not reproducible. Pin a released tag or digest once one exists.
FROM mmassaviol/mbb_workflows_base:latest

# FastQC, version-pinned. The package index is refreshed in the same layer as
# the install (a stale or missing index makes the pinned version unresolvable)
# and removed afterwards to keep the layer small.
# Recommended packages are intentionally still installed: the Java runtime
# pulled in by FastQC may rely on them (fonts) — TODO confirm before adding
# --no-install-recommends.
RUN apt-get update \
    && apt-get install -y fastqc=0.11.5+dfsg-6 \
    && rm -rf /var/lib/apt/lists/*

# Stacks 2.3b, built from source; the listed binaries are moved from the build
# tree to /opt/biotools/bin.
# NOTE(review): the tarball is fetched over plain http without a checksum —
# add a sha256 verification once the expected digest is known.
RUN cd /opt/biotools \
    && wget http://catchenlab.life.illinois.edu/stacks/source/stacks-2.3b.tar.gz \
    && tar -zxvf stacks-2.3b.tar.gz \
    && cd stacks-2.3b/ \
    && ./configure \
    && make -j "$(nproc)" \
    && make install \
    && mv -t ../bin sstacks kmer_filter gstacks tsv2bam process_shortreads populations ustacks clone_filter phasedstacks cstacks process_radtags \
    && cd .. && rm -r stacks-2.3b stacks-2.3b.tar.gz

# Bowtie 1.2.3, prebuilt linux-x86_64 binaries copied to /usr/bin.
RUN wget -O bowtie-1.2.3-linux-x86_64.zip https://sourceforge.net/projects/bowtie-bio/files/bowtie/1.2.3/bowtie-1.2.3-linux-x86_64.zip/download \
    && unzip bowtie-1.2.3-linux-x86_64.zip \
    && cp bowtie-1.2.3-linux-x86_64/bowtie* /usr/bin \
    && rm -rf bowtie-1.2.3*

# Samtools 1.9, built from source; only the `samtools` binary is kept.
RUN cd /opt/biotools \
    && wget https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2 \
    && tar -xvjf samtools-1.9.tar.bz2 \
    && cd samtools-1.9 \
    && ./configure && make \
    && cd .. \
    && mv samtools-1.9/samtools bin/samtools \
    && rm -r samtools-1.9 samtools-1.9.tar.bz2

# BWA 0.7.17, built from source; only the `bwa` binary is kept.
RUN cd /opt/biotools \
    && wget https://github.com/lh3/bwa/releases/download/v0.7.17/bwa-0.7.17.tar.bz2 \
    && tar -xvjf bwa-0.7.17.tar.bz2 \
    && cd bwa-0.7.17 \
    && make -j "$(nproc)" \
    && mv bwa ../bin/ \
    && cd .. \
    && rm -r bwa-0.7.17 bwa-0.7.17.tar.bz2

# R packages used by the report scripts.
RUN Rscript -e 'install.packages("calibrate",repos="https://cloud.r-project.org/",Ncpus=8, clean=TRUE)'
RUN Rscript -e 'BiocManager::install("SNPRelate", version = "3.8",Ncpus=8, clean=TRUE)'

# Application files are copied last: none of the build steps above read them,
# so editing the workflow or the Shiny app no longer invalidates the cached
# tool-compilation layers.
COPY files /workflow
COPY sagApp /sagApp

EXPOSE 3838
CMD ["Rscript", "-e", "setwd('/sagApp/'); shiny::runApp('/sagApp/app.R',port=3838 , host='0.0.0.0')"]
This diff is collapsed.
import re
import sys
from tools import *
# Workflow configuration, loaded once at import time from the YAML file given
# as the first command-line argument. `read_yaml` is not defined in this file;
# presumably it is provided by the `from tools import *` star import — verify.
config = read_yaml(sys.argv[1])
def report_section_order(cfg=None):
    """Build the MultiQC ``report_section_order`` configuration as YAML text.

    Three fixed sections (``Rule_graph``, ``params_tab``, ``outputs``) come
    first with orders 990, 980 and 970. Then, for every step of the workflow,
    the MultiQC section of the selected tool (unless it is ``"custom"``) and
    one section per static ``*_mqc*`` output file are appended, each with an
    order 10 lower than the previous one, starting at 960.

    Args:
        cfg: configuration mapping with the keys ``steps``, ``params``,
            ``multiqc`` and ``outputs``. Defaults to the module-level
            ``config`` so existing zero-argument callers keep working.

    Returns:
        str: the YAML document text.

    Raises:
        KeyError: if the selected tool of a step has no entry in
            ``cfg["multiqc"]`` or ``cfg["outputs"]``.
    """
    if cfg is None:
        # Fall back to the configuration loaded when the module was imported.
        cfg = config

    def section(name, order):
        # The section key is nested under `report_section_order` (2 spaces)
        # and `order` under the section key (4 spaces); without this nesting
        # the `order` value is not attached to its section.
        return "  " + name + ":\n" + "    order: " + str(order) + "\n"

    res = "skip_generalstats: true\n\n"
    res += "report_section_order:\n"
    res += section("Rule_graph", 990)
    res += section("params_tab", 980)
    res += section("outputs", 970)

    cpt = 960
    for step in cfg["steps"]:
        # The tool chosen for a step is stored in params under the step name.
        tool = cfg["params"][step["name"]]
        module = cfg["multiqc"][tool]
        if module != "custom":
            res += section(module, cpt)
            cpt -= 10
        for rule, outputs in cfg["outputs"][tool].items():
            layout = cfg["params"]["SeOrPe"]
            # Skip rules that belong to the other read layout.
            wanted = (layout == "SE" and "_PE" not in rule) or (layout == "PE" and "_SE" not in rule)
            if not wanted:
                continue
            for output in outputs:
                # TODO: dynamic files ({wildcard}_mqc.png) are not handled yet
                # and are skipped by the '{' test.
                if "file" in output and "mqc" in output["file"] and "{" not in output["file"]:
                    # Section name is the file name up to the "_mqc" marker.
                    res += section(re.sub(r"_mqc.*$", "", output["file"]), cpt)
                    cpt -= 10
    return res
def main():
    """Write the generated section-order configuration to the file named by
    the second command-line argument (the file is overwritten)."""
    content = "" + report_section_order()
    with open(sys.argv[2], "w") as handle:
        handle.write(content)


if __name__ == "__main__":
    # Only run when executed as a script, not when imported.
    main()
\ No newline at end of file
#!/usr/bin/env python3
# This script will take a directory and a parameter to tell if the reads are paired end or single end and return the sample list and the suffix
# Needs 2 arguments: reads_directory, SeOrPe
# SeOrPe is SE for single end reads and PE for paired end reads
# Usage: ./get_samples.py reads_directory SeOrPe
import os
import re
import csv
import sys
def sample_list(dir, SeOrPe):
    """List the sample names found in a reads directory and their common suffix.

    For paired-end data ("PE") a file name is split as
    ``<sample>(_R1|_R2)<suffix>``; for any other value of ``SeOrPe`` it is
    split at the first dot as ``<sample><.suffix>``. File names that do not
    match the pattern are ignored.

    Args:
        dir: path of the directory containing the read files.
        SeOrPe: "PE" for paired-end reads; any other value is treated as
            single-end.

    Returns:
        dict: ``{'samples': <sorted list of sample names>, 'suffix': <str>}``.

    Raises:
        SystemExit: if no file name matches the expected pattern, or if the
            matched files do not all share the same suffix.
    """
    if SeOrPe == "PE":
        regex = re.compile(r"^(.+?)(_R1|_R2)(.+)")
        suffix_group = 3
    else:
        regex = re.compile(r"^(.+?)(\..*)")
        suffix_group = 2

    # Sets give constant-time de-duplication of sample names and suffixes.
    samples = set()
    suffixes = set()
    for file_name in os.listdir(dir):
        found = regex.match(file_name)
        if found:
            samples.add(found.group(1))
            # Every matched file contributes its suffix, so a mate or replicate
            # with a diverging suffix is detected regardless of listing order.
            suffixes.add(found.group(suffix_group))

    if not suffixes:
        # Distinct message: an empty directory is not a suffix mismatch.
        sys.exit("No read files found in " + str(dir) + " for SeOrPe=" + str(SeOrPe))
    if len(suffixes) > 1:
        sys.exit("Files have different suffixes:" + ','.join(sorted(suffixes)))
    return {'samples': sorted(samples), 'suffix': suffixes.pop()}
def main():
    """Command-line entry point: print the result of ``sample_list`` for the
    given reads directory and SE/PE flag, or exit with a usage message when
    the argument count is wrong."""
    if len(sys.argv) != 3:
        exit("""Needs 2 arguments: reads_directory, SeOrPe
Usage: ./get_samples.py reads_directory SeOrPe""")
    reads_directory, layout = sys.argv[1], sys.argv[2]
    print(sample_list(reads_directory, layout))


if __name__ == "__main__":
    # Only run when executed as a script, not when imported.
    main()
# Default configuration of the RADseq_ref pipeline.
# NOTE(review): indentation is not visible in this view, so the nesting of the
# keys below (which ones sit under `params`) must be checked against the file.
pipeline: RADseq_ref
# Parameter values: global settings first, then one group per workflow step.
# A key named after a step (quality_check, demultiplexing, mapping, ...) holds
# the tool selected for that step.
params:
results_dir: /Results
sample_dir: /Data
# Read layout: PE or SE.
SeOrPe: PE
quality_check: fastqc
fastqc_SE_output_dir: fastqc_SE
fastqc_PE_output_dir: fastqc_PE
fastqc_threads: 4
null_output_dir: ''
demultiplexing: process_radtags
process_radtags_SE_output_dir: process_radtags/SE
process_radtags_barcode_file_select: server
process_radtags_barcode_file: ''
process_radtags_barcode_type: --inline_null
process_radtags_enzyme_SE: aciI
process_radtags_PE_output_dir: process_radtags/PE
process_radtags_enzyme_1_PE: aciI
process_radtags_enzyme_2_PE: ''
mapping: bowtie
bowtie_index_output_dir: bowtie/index
bowtie_index_genome_fasta: ''
bowtie_index_path: ''
bowtie_index_genome_fasta_select: server
bowtie_index_threads: 4
bowtie_PE_output_dir: bowtie/PE
# NOTE(review): this points at a bwa index path although `mapping` above is
# set to bowtie — confirm this is intended.
index: bwa/index/index
bowtie_threads: 4
bowtie_minins_PE: 0
bowtie_maxins_PE: 250
bowtie_orientation_PE: ''
bowtie_mult_align_limit: 1
bowtie_best: true
bowtie_strata: true
bowtie_SE_output_dir: bowtie/SE
bwa_index_output_dir: bwa/index
bwa_index_genome_fasta: ''
bwa_index_path: ''
bwa_index_genome_fasta_select: server
bwa_index_algorithm: bwtsw
bwa_mem_PE_output_dir: bwa/mem/PE
bwa_mem_threads: 4
bwa_mem_quality0_multimapping: true
bwa_mem_SE_output_dir: bwa/mem/SE
mapping_check: samtools_stats
samtools_stats_output_dir: samtools_stats
samtools_stats_threads: 4
gstacks: gstacks
gstacks_output_dir: gstacks
gstacks_threads: 4
gstacks_population_tsv_select: server
gstacks_population_tsv: ''
gstacks_model: marukilow
gstacks_var_alpha: 0.05
gstacks_gt_alpha: 0.05
gstacks_min_mapq: 20
gstacks_max_clipped: 0.2
populations: populations
populations_output_dir: populations
populations_threads: 4
populations_r: 0.7
populations_max_obs_het: 1
populations_min_maf: 0.05
populations_p: 2
# Empty by default.
samples: []
groups: []
# Ordered list of workflow steps. Each entry gives a display title, the step
# name (also the key holding the selected tool in `params`), the tools that can
# be chosen for the step and the default one. 'null' is listed as a tool for
# the quality-check and demultiplexing steps.
steps:
- title: Quality check
name: quality_check
tools:
- fastqc
- 'null'
default: fastqc
- title: Demultiplexing
name: demultiplexing
tools:
- process_radtags
- 'null'
default: process_radtags
- title: Mapping
name: mapping
tools:
- bowtie
- bwa
default: bowtie
- title: Mapping check
name: mapping_check
tools:
- samtools_stats
default: samtools_stats
- title: Gstacks
name: gstacks
tools:
- gstacks
default: gstacks
- title: Populations
name: populations
tools:
- populations
default: populations
# Metadata for the parameters: for each parameter, the tool and rule it
# belongs to, its input type (numeric, select, radio, checkbox, input_file,
# input_dir, output_dir) and, where given, a human-readable label.
# Presumably consumed by the user interface to render input widgets — verify.
params_info:
# Global parameters (no tool/rule attached).
results_dir:
type: output_dir
sample_dir:
type: input_dir
SeOrPe:
type: radio
# fastqc
fastqc_threads:
tool: fastqc
rule: fastqc_PE
type: numeric
label: Number of threads to use
# process_radtags
process_radtags_barcode_file_select:
tool: process_radtags
rule: process_radtags_PE
type: select
process_radtags_barcode_file:
tool: process_radtags
rule: process_radtags_PE
type: input_file
label: Barcode file
process_radtags_barcode_type:
tool: process_radtags
rule: process_radtags_PE
type: select
label: Barcode position
process_radtags_enzyme_SE:
tool: process_radtags
rule: process_radtags_SE
type: select
label: Provide the restriction enzyme used
process_radtags_enzyme_1_PE:
tool: process_radtags
rule: process_radtags_PE
type: select
label: Provide the restriction enzyme used
process_radtags_enzyme_2_PE:
tool: process_radtags
rule: process_radtags_PE
type: select
label: If a double digest was used, provide the second restriction enzyme used
# bowtie
bowtie_index_path:
tool: bowtie
rule: bowtie_index
type: input_dir
label: Path to an existing bowtie index (or where to save a new one)
bowtie_index_genome_fasta_select:
tool: bowtie
rule: bowtie_index
type: select
bowtie_index_genome_fasta:
tool: bowtie
rule: bowtie_index
type: input_file
label: Path to reference genome fasta file
bowtie_index_threads:
tool: bowtie
rule: bowtie_index
type: numeric
label: Number of threads to use to index genome
bowtie_threads:
tool: bowtie
rule: bowtie_SE
type: numeric
label: Number of threads to use to align reads
bowtie_minins_PE:
tool: bowtie
rule: bowtie_PE
type: numeric
label: The minimum insert size for valid paired-end alignments
bowtie_maxins_PE:
tool: bowtie
rule: bowtie_PE
type: numeric
label: The maximum insert size for valid paired-end alignments
bowtie_orientation_PE:
tool: bowtie
rule: bowtie_PE
type: radio
label: The upstream/downstream mate orientations for a valid paired-end alignment
against the forward reference strand.
bowtie_mult_align_limit:
tool: bowtie
rule: bowtie_SE
type: numeric
label: Suppress all alignments for a particular read or pair if more than 'x'
reportable alignments exist for it
bowtie_best:
tool: bowtie
rule: bowtie_SE
type: checkbox
label: '--best : Make Bowtie guarantee that reported singleton alignments are
''best'' in terms of stratum (i.e. number of mismatches, or mismatches in the
seed in the case of -n mode) and in terms of the quality values at the mismatched
position(s).'
bowtie_strata:
tool: bowtie
rule: bowtie_SE
type: checkbox
label: '--strata : If many valid alignments exist and are reportable (e.g. are
not disallowed via the -k option) and they fall into more than one alignment
''stratum'', report only those alignments that fall into the best stratum. When
--strata is specified, --best must also be specified.'
# bwa
bwa_index_path:
tool: bwa
rule: bwa_index
type: input_dir
label: Path to an existing bwa index (or where to save a new one)
bwa_index_genome_fasta_select:
tool: bwa
rule: bwa_index
type: select
bwa_index_genome_fasta:
tool: bwa
rule: bwa_index
type: input_file
label: Path to reference genome fasta file
bwa_index_algorithm:
tool: bwa
rule: bwa_index
type: radio
label: Algorithm for constructing BWT index (see documentation for details)
bwa_mem_threads:
tool: bwa
rule: bwa_mem_SE
type: numeric
label: Number of threads to use
bwa_mem_quality0_multimapping:
tool: bwa
rule: bwa_mem_SE
type: checkbox
label: Put 0 as mapping quality for multimapping reads
# samtools_stats
samtools_stats_threads:
tool: samtools_stats
rule: samtools_stats
type: numeric
label: Number of threads to use
# gstacks
gstacks_threads:
tool: gstacks
rule: gstacks
type: numeric
label: Number of threads to use
gstacks_population_tsv_select:
tool: gstacks
rule: gstacks
type: select
gstacks_population_tsv:
tool: gstacks
rule: gstacks
type: input_file
label: Path to population tsv file
gstacks_model:
tool: gstacks
rule: gstacks
type: select
label: Model to use to call variants and genotypes
gstacks_var_alpha:
tool: gstacks
rule: gstacks
type: numeric
label: Alpha threshold for discovering SNPs
gstacks_gt_alpha:
tool: gstacks
rule: gstacks
type: numeric
label: Alpha threshold for calling genotypes
gstacks_min_mapq:
tool: gstacks
rule: gstacks
type: numeric
label: Minimum PHRED-scaled mapping quality to consider a read
gstacks_max_clipped:
tool: gstacks
rule: gstacks
type: numeric
label: Maximum soft-clipping level, in fraction of read length
# populations
populations_threads:
tool: populations
rule: populations
type: numeric
label: Number of threads to use
populations_r:
tool: populations
rule: populations
type: numeric
label: Minimum percentage of individuals in a population required to process a
locus for that population
populations_max_obs_het:
tool: populations
rule: populations
type: numeric
label: Specify a maximum observed heterozygosity required to process a nucleotide
site at a locus
populations_min_maf:
tool: populations
rule: populations
type: numeric
label: Specify a minimum minor allele frequency required to process a nucleotide
site at a locus
populations_p:
tool: populations
rule: populations
type: numeric
label: Minimum number of populations a locus must be present in to process a locus
# Scripts listed for report preparation.
prepare_report_scripts:
- populations.prepare.report.R
# Files listed as report-preparation outputs, grouped by tool. The names carry
# the `_mqc` marker; presumably they are picked up by MultiQC as custom
# content — verify.
prepare_report_outputs:
populations:
- PCA_Eigenvalues_mqc.txt
- PCA_First2axis_mqc.yaml
- Mean_Pairwise_Pop_FST_mqc.csv
- IBS_mqc.png
# Output files declared per tool and per rule. Each entry has a name, a file
# name (placeholders in braces such as {sample}, {individu}, {popA}-{popB}
# stand for values that vary per file) and a description.
outputs:
# fastqc: one html report and one zip per sample (SE) or per mate (PE).
fastqc:
fastqc_SE:
- name: html
file: '{sample}_fastqc.html'
description: Rapport html fastqc
- name: zip
file: '{sample}_fastqc.zip'
description: Dossier zip fastqc
fastqc_PE:
- name: html1
file: '{sample}_R1_fastqc.html'
description: Rapport html fastqc
- name: zip1
file: '{sample}_R1_fastqc.zip'
description: Dossier zip fastqc
- name: html2
file: '{sample}_R2_fastqc.html'
description: Rapport html fastqc
- name: zip2
file: '{sample}_R2_fastqc.zip'
description: Dossier zip fastqc
# The 'null' tool declares no outputs.
'null':
'null': []
# process_radtags: demultiplexed read files per individual.
process_radtags:
process_radtags_SE:
- name: reads_demultiplexed
file: '{individu}.fastq.gz'
description: Files of reads for each individual
process_radtags_PE:
- name: reads_forward_demultiplexed
file: '{individu}_R1.fastq.gz'
description: Files of forward reads for each individual
- name: reads_reverse_demultiplexed
file: '{individu}_R2.fastq.gz'
description: Files of reverse reads for each individual
# bowtie: index plus one bam per sample.
bowtie:
bowtie_index:
- name: index
file: index
description: Index files for bowtie alignment
bowtie_PE:
- name: bam
file: '{sample}.bam'
description: Alignment files
bowtie_SE:
- name: bam
file: '{sample}.bam'
description: Alignment files
# bwa: index plus one bam per sample.
bwa:
bwa_index:
- name: index
file: index
description: Index files for bwa alignment
bwa_mem_PE:
- name: bam
file: '{sample}.bam'
description: Alignment files
bwa_mem_SE:
- name: bam
file: '{sample}.bam'
description: Alignment files
# samtools_stats: one stats text file per sample.
samtools_stats:
samtools_stats:
- name: stats
file: '{sample}_samtools_stats.txt'
description: samtools stats command output
# gstacks: catalog files and distribution statistics.
gstacks:
gstacks:
- name: catalog_calls
file: catalog.calls
description: Consensus sequence for each assembled locus in the data
- name: catalog_fa
file: catalog.fa.gz
description: Custom file that contains genotyping data
- name: stats
file: gstacks.log.distribs
description: Distributions statistics (bam stats, coverage and phasing_rates
per sample)
# populations: statistics and export formats, followed by the four *_mqc files
# also listed under prepare_report_outputs.
populations:
populations:
- name: fstats_pop
file: populations.fst_{popA}-{popB}.tsv
description: SNP and haplotype-based F statistics between each populations
- name: fstats_summary
file: populations.fst_summary.tsv
description: SNP and haplotype-based F statistics summary
- name: haplotypes_tab
file: populations.haplotypes.tsv
description: Haplotypes for each sample
- name: phistats_pop
file: populations.phistats_{popA}-{popB}.tsv
description: Phi statistics between each populations
- name: phistats_summary
file: populations.phistats_summary.tsv
description: Phi statistics summary
- name: vcf
file: populations.snps.vcf
description: SNP vcf file
- name: structure
file: populations.structure
description: Output in Structure format
- name: genepop_haplotypes
file: populations.haps.genepop
description: Haplotypes in GenePop format
- name: genepop_snps
file: populations.snps.genepop
description: SNP in GenePop format
- name: plink_map
file: populations.plink.map
description: Genotypes in PLINK format
- name: plink_ped
file: populations.plink.ped
description: Genotypes in PLINK format
- name: radpainter_haplotypes
file: populations.haps.radpainter
description: Haplotypes fineRADstructure/RADpainter format
- name: PCA_Eigenvalues_mqc
file: PCA_Eigenvalues_mqc.txt
description: PCA eigenvalues
- name: PCA_First2axis_mqc
file: PCA_First2axis_mqc.yaml
description: PCA plot on first 2 axis
- name: Mean_Pairwise_Pop_FST_mqc
file: Mean_Pairwise_Pop_FST_mqc.csv
description: Mean pairwise population Fstats
- name: IBS_mqc
file: IBS_mqc.png
description: IBS heatmap
# One entry per tool: either a section/module name (fastqc, bowtie1, samtools,
# stacks) or the value `custom`. Presumably the name is the MultiQC module
# that reports on the tool and `custom` marks tools without one — verify.
multiqc:
fastqc: fastqc
'null': custom
process_radtags: custom
bowtie: bowtie1
bwa: custom
samtools_stats: samtools
gstacks: custom
populations: stacks
library(SNPRelate)
library(RColorBrewer)
library(gplots)
library(yaml)
# Colour palette name ("Spectral" kept as a commented alternative); it is not
# used within the lines visible here.
palette = "Accent" #"Spectral"
# The first command-line argument is a YAML file; only its `params` entry is kept.
args = commandArgs(trailingOnly=TRUE)
parameters = read_yaml(args[1])$params
# Input VCF: <results_dir>/<populations_output_dir>/populations.snps.vcf.gz.
# NOTE(review): the file name ends in .vcf.gz while the commented example below
# uses an uncompressed .vcf — confirm which file the workflow actually produces.
vcf.fn = paste(parameters$results_dir,parameters$populations_output_dir,"populations.snps.vcf.gz",sep = "/")
#vcf.fn = "/results/populations/populations.snps.vcf"
# Population map: headerless, tab-separated table read from the path stored in
# the gstacks_population_tsv parameter.
popmap=read.table(parameters$gstacks_population_tsv, header=F, sep='\t', stringsAsFactors=F)
#popmap=read.table("/samples/AgneseRAD/ran_pop_map.txt",header=F, sep='\t', stringsAsFactors=F)
# Convert the VCF into a GDS file at a fixed path under /tmp
# (method="biallelic.only"), then open that file.
snpgdsVCF2GDS(vcf.fn,"/tmp/batch_1.gds", method="biallelic.only", verbose=FALSE)
genofile <- snpgdsOpen("/tmp/batch_1.gds")
# PCA on the opened GDS file: no SNP pre-selection (snp.id=NULL), maf=NaN,
# missing.rate=0.2, autosome.only=FALSE; the thread count is hard-coded to 10.
PCA1 <- snpgdsPCA(genofile, snp.id=NULL, maf=NaN, missing.rate=0.2, num.thread=10, verbose=FALSE, autosome.only=FALSE)
#PCA eigenvalues
# Path of the eigenvalues report file inside the populations output directory.
fic = paste(parameters$results_dir,parameters$populations_output_dir,'PCA_Eigenvalues_mqc.txt',sep = "/")
cat("# id: custom_bargraph_tsv
# section_name: 'PCA eigenvalues'
# description: 'valeurs propres (variance expliquée par chaque axe).'
# format: 'tsv'