Commit 2c5be00e authored by mmassaviol's avatar mmassaviol
Browse files

Big update

use new by_step version of waw
add steps (double samtools stats and sam_to_phylip)
parent 7a921950
......@@ -49,6 +49,15 @@ RUN cd /opt/biotools \
&& mv samtools-1.9/samtools bin/samtools \
&& rm -r samtools-1.9 samtools-1.9.tar.bz2
# Download the EMBOSS 6.6.0 source tarball, configure it with --without-x and
# build it with 8 parallel make jobs, then expose the built emboss/seqret
# program through a symlink in /opt/biotools/bin.
# Only the tarball is removed afterwards: the symlink points into the
# extracted source tree, so that directory has to stay in the image.
RUN cd /opt/biotools \
&& wget ftp://emboss.open-bio.org/pub/EMBOSS/EMBOSS-6.6.0.tar.gz \
&& tar -xvzf EMBOSS-6.6.0.tar.gz \
&& cd EMBOSS-6.6.0 \
&& ./configure --without-x \
&& make -j 8 \
&& cd ../bin && ln -s /opt/biotools/EMBOSS-6.6.0/emboss/seqret seqret \
&& rm ../EMBOSS-6.6.0.tar.gz
ENV LANG en_US.UTF-8
ENV LANGUAGE en_US:en
ENV LC_ALL en_US.UTF-8
......
This diff is collapsed.
......@@ -46,3 +46,15 @@ lastz:
\ Marth, Goncalo Abecasis, Richard Durbin, 1000 Genome Project Data Processing\
\ Subgroup, The Sequence Alignment/Map format and SAMtools, Bioinformatics, Volume\
\ 25, Issue 16, 15 August 2009, Pages 2078\u20132079, https://doi.org/10.1093/bioinformatics/btp352"
samtools_stats:
samtools:
- "Heng Li, Bob Handsaker, Alec Wysoker, Tim Fennell, Jue Ruan, Nils Homer, Gabor\
\ Marth, Goncalo Abecasis, Richard Durbin, 1000 Genome Project Data Processing\
\ Subgroup, The Sequence Alignment/Map format and SAMtools, Bioinformatics, Volume\
\ 25, Issue 16, 15 August 2009, Pages 2078\u20132079, https://doi.org/10.1093/bioinformatics/btp352"
sam_to_phylip:
samtools:
- "Heng Li, Bob Handsaker, Alec Wysoker, Tim Fennell, Jue Ruan, Nils Homer, Gabor\
\ Marth, Goncalo Abecasis, Richard Durbin, 1000 Genome Project Data Processing\
\ Subgroup, The Sequence Alignment/Map format and SAMtools, Bioinformatics, Volume\
\ 25, Issue 16, 15 August 2009, Pages 2078\u20132079, https://doi.org/10.1093/bioinformatics/btp352"
......@@ -21,9 +21,22 @@ def files_or_dirs_to_ignore():
return res
def module_order(cfg=None):
    """Build the ``module_order`` section of the MultiQC configuration.

    One list entry is emitted per workflow step whose selected tool maps to a
    real MultiQC module (anything other than ``"custom"``). Each entry gets a
    display name, an anchor unique to the step, and ``path_filters`` that
    restrict the module to the output and log directories of that step, so the
    same module can be listed for several steps.

    Args:
        cfg: Workflow configuration mapping providing the ``steps``,
            ``params``, ``multiqc`` and ``outputs`` keys. Defaults to the
            module-level ``config``.

    Returns:
        The YAML text of the section, terminated by a newline.

    Raises:
        KeyError: If a step, tool, rule or ``<rule>_output_dir`` parameter is
            missing from the configuration.
    """
    if cfg is None:
        cfg = config  # module-level workflow configuration

    lines = ["module_order:"]
    for step in cfg["steps"]:
        step_name = step["name"]
        tool = cfg["params"][step_name]
        module = cfg["multiqc"][tool]
        if module == "custom":
            # No MultiQC module is listed for this tool.
            continue
        # The keys of a list item must be indented deeper than its dash,
        # otherwise the generated text is not valid YAML.
        lines.append("  - " + module + ":")
        lines.append("      name: " + step["title"] + " (" + tool + ")")
        lines.append("      anchor: " + step_name + "__" + module)
        lines.append("      path_filters:")
        for rule in cfg["outputs"][step_name + "__" + tool]:
            out_dir = cfg["params"][rule + "_output_dir"]
            lines.append("        - '*" + out_dir + "/*'")  # limit search to tool output dir
            lines.append("        - '*/logs/" + out_dir + "/*'")  # and tool logs
    return "\n".join(lines) + "\n"
def report_section_order():
res = "skip_generalstats: true\n\n"
res += "report_section_order:\n"
res = "report_section_order:\n"
res += " Rule_graph:\n"
res += " order: 990\n"
res += " params_tab:\n"
......@@ -38,30 +51,40 @@ def report_section_order():
for step in config["steps"]:
tool = config["params"][step["name"]]
if (config["multiqc"][tool] != "custom"):
res += " " + config["multiqc"][tool] + ":\n"
res += " " + step["name"] + "__" + config["multiqc"][tool] + ":\n"
res += " " + "order: " + str(cpt) + "\n"
cpt += -10
for rule in config["outputs"][tool]:
for rule in config["outputs"][step["name"] + "__" + tool]:
if ("SeOrPe" not in config.keys() or (config["params"]["SeOrPe"] == "SE" and not("_PE" in rule)) or (config["params"]["SeOrPe"] == "PE" and not("_SE" in rule))):
for output in config["outputs"][tool][rule]:
for output in config["outputs"][step["name"] + "__" + tool][rule]:
if("file" in output.keys() and "mqc" in output["file"] and '{' not in output["file"]): # case of dynamic files ({wildcard}_mqc.png) to deal with
section = re.sub('\_mqc.*$', '', output["file"])
res += " " + section + ":\n"
res += " " + "order: " + str(cpt) + "\n"
cpt += -10
if step["name"] + "__" + tool in config["prepare_report_outputs"]:
if isinstance(config["prepare_report_outputs"][step["name"] + "__" + tool], list):
for output in config["prepare_report_outputs"][step["name"] + "__" + tool]:
section = re.sub('\_mqc.*$', '', output)
res += " " + section + ":\n"
res += " " + "order: " + str(cpt) + "\n"
cpt += -10
else:
section = re.sub('\_mqc.*$', '', config["prepare_report_outputs"][step["name"] + "__" + tool])
res += " " + step["name"] + "__" + section + ":\n"
res += " " + "order: " + str(cpt) + "\n"
cpt += -10
return res
def main():
    """Assemble the MultiQC configuration and write it to ``sys.argv[2]``.

    The output is the concatenation, in this order, of: the
    ``skip_generalstats`` switch, the module order, the report section order,
    the ignore list, and a ``remove_sections`` entry for ``flash-bargraph``.
    Each section builder is called exactly once.
    """
    sections = [
        "skip_generalstats: true\n\n",
        module_order() + "\n\n",
        report_section_order() + "\n\n",
        files_or_dirs_to_ignore(),
        "\nremove_sections:\n",
        " - 'flash-bargraph'\n",
    ]
    with open(sys.argv[2], "w") as out:
        out.write("".join(sections))
if __name__ == "__main__":
    # execute only if run as a script; a single call is enough, a second one
    # would only regenerate the same output file
    main()
\ No newline at end of file
pipeline: Capture_UCE
params:
results_dir: /Results
sample_dir: /Data
SeOrPe: PE
quality_control: fastqc
fastqc_SE_output_dir: fastqc_SE
fastqc_SE_command: fastqc
fastqc_PE_output_dir: fastqc_PE
fastqc_PE_command: fastqc
quality_control__fastqc_SE_output_dir: quality_control/fastqc_SE
quality_control__fastqc_SE_command: fastqc
quality_control__fastqc_PE_output_dir: quality_control/fastqc_PE
quality_control__fastqc_PE_command: fastqc
qcfilter_adaptertrim: trimmomatic
trimmomatic_PE_output_dir: trimmomatic_PE
trimmomatic_PE_command: java -jar /opt/biotools/Trimmomatic-0.38/trimmomatic-0.38.jar
qcfilter_adaptertrim__trimmomatic_PE_output_dir: qcfilter_adaptertrim/trimmomatic_PE
qcfilter_adaptertrim__trimmomatic_PE_command: java -jar /opt/biotools/Trimmomatic-0.38/trimmomatic-0.38.jar
PE
trimmomatic_threads: 4
trimmomatic_qc_score: -phred64
trimmomatic_fastaWithAdapters_select: server
trimmomatic_fastaWithAdapters: ''
trimmomatic_illuminaclip: ''
trimmomatic_otherparams: ''
trimmomatic_SE_output_dir: trimmomatic_SE
trimmomatic_SE_command: java -jar /opt/biotools/Trimmomatic-0.38/trimmomatic-0.38.jar
qcfilter_adaptertrim__trimmomatic_threads: 4
qcfilter_adaptertrim__trimmomatic_qc_score: -phred64
qcfilter_adaptertrim__trimmomatic_fastaWithAdapters_select: server
qcfilter_adaptertrim__trimmomatic_fastaWithAdapters: ''
qcfilter_adaptertrim__trimmomatic_illuminaclip: ''
qcfilter_adaptertrim__trimmomatic_otherparams: ''
qcfilter_adaptertrim__trimmomatic_SE_output_dir: qcfilter_adaptertrim/trimmomatic_SE
qcfilter_adaptertrim__trimmomatic_SE_command: java -jar /opt/biotools/Trimmomatic-0.38/trimmomatic-0.38.jar
SE
merge_overlapps: flash
flash_output_dir: flash
flash_command: flash
flash_threads: 4
flash_min_overlap: 10
flash_max_overlap: 65
flash_max_mismatch_density: 0.25
merge_overlapps__flash_output_dir: merge_overlapps/flash
merge_overlapps__flash_command: flash
merge_overlapps__flash_threads: 4
merge_overlapps__flash_min_overlap: 10
merge_overlapps__flash_max_overlap: 65
merge_overlapps__flash_max_mismatch_density: 0.25
demultiplexing: demultiplexing_astrid_cruaud
demultiplexing_astrid_cruaud_output_dir: demultiplexing_astrid_cruaud
demultiplexing_astrid_cruaud_command: ''
demultiplexing_astrid_cruaud_threads: 4
demultiplexing_astrid_cruaud_barcodes_select: server
demultiplexing_astrid_cruaud_barcodes: ''
demultiplexing__demultiplexing_astrid_cruaud_output_dir: demultiplexing/demultiplexing_astrid_cruaud
demultiplexing__demultiplexing_astrid_cruaud_command: ''
demultiplexing__demultiplexing_astrid_cruaud_threads: 4
demultiplexing__demultiplexing_astrid_cruaud_barcodes_select: server
demultiplexing__demultiplexing_astrid_cruaud_barcodes: ''
assembling: velvet
velvet_output_dir: velvet
velvet_command: velvet
velvet_hash_length: '31'
velvet_min_contig_lgth: NA
assembling__velvet_output_dir: assembling/velvet
assembling__velvet_command: velvet
assembling__velvet_hash_length: '31'
assembling__velvet_min_contig_lgth: NA
contigmaptouce: lastz
lastz_output_dir: lastz
lastz_command: lastz
lastz_reference_fasta_select: server
lastz_reference_fasta: ''
samples: []
groups: []
contigmaptouce__lastz_output_dir: contigmaptouce/lastz
contigmaptouce__lastz_command: lastz
contigmaptouce__lastz_reference_fasta_select: server
contigmaptouce__lastz_reference_fasta: ''
mapping_check_bf: samtools_stats
mapping_check_bf__samtools_stats_output_dir: mapping_check_bf/samtools_stats
mapping_check_bf__samtools_stats_command: samtools stats
mapping_check_bf__samtools_stats_threads: 4
mapping_check_af: samtools_stats
mapping_check_af__samtools_stats_output_dir: mapping_check_af/samtools_stats
mapping_check_af__samtools_stats_command: samtools stats
mapping_check_af__samtools_stats_threads: 4
convert_to_phylip: sam_to_phylip
convert_to_phylip__sam_to_phylip_output_dir: convert_to_phylip/sam_to_phylip
convert_to_phylip__sam_to_phylip_command: ''
steps:
- title: Quality control
name: quality_control
- name: quality_control
title: Quality control
tools:
- fastqc
default: fastqc
- title: Quality filtering and adapter trimming
name: qcfilter_adaptertrim
- name: qcfilter_adaptertrim
title: Quality filtering and adapter trimming
tools:
- trimmomatic
default: trimmomatic
- title: Merge of overlapping reads
name: merge_overlapps
- name: merge_overlapps
title: Merge of overlapping reads
tools:
- flash
default: flash
- title: Demultiplexing
name: demultiplexing
- name: demultiplexing
title: Demultiplexing
tools:
- demultiplexing_astrid_cruaud
default: demultiplexing_astrid_cruaud
- title: Assembling
name: assembling
- name: assembling
title: Assembling
tools:
- velvet
default: velvet
- title: Contigs map to UCEs references
name: contigmaptouce
- name: contigmaptouce
title: Contigs map to UCEs references
tools:
- lastz
default: lastz
- name: mapping_check_bf
title: Mapping check before filter
tools:
- samtools_stats
default: samtools_stats
- name: mapping_check_af
title: Mapping check after filter
tools:
- samtools_stats
default: samtools_stats
- name: convert_to_phylip
title: Convert to phylip
tools:
- sam_to_phylip
default: sam_to_phylip
params_info:
results_dir:
type: output_dir
......@@ -82,115 +108,122 @@ params_info:
type: input_dir
SeOrPe:
type: radio
trimmomatic_threads:
qcfilter_adaptertrim__trimmomatic_threads:
tool: trimmomatic
rule: trimmomatic_SE
rule: qcfilter_adaptertrim_trimmomatic_SE
type: numeric
label: Number of threads to use
trimmomatic_qc_score:
qcfilter_adaptertrim__trimmomatic_qc_score:
tool: trimmomatic
rule: trimmomatic_SE
rule: qcfilter_adaptertrim_trimmomatic_SE
type: radio
label: Quality score encoding
trimmomatic_fastaWithAdapters_select:
qcfilter_adaptertrim__trimmomatic_fastaWithAdapters_select:
tool: trimmomatic
rule: trimmomatic_SE
rule: qcfilter_adaptertrim_trimmomatic_SE
type: select
trimmomatic_fastaWithAdapters:
qcfilter_adaptertrim__trimmomatic_fastaWithAdapters:
tool: trimmomatic
rule: trimmomatic_SE
rule: qcfilter_adaptertrim_trimmomatic_SE
type: input_file
label: (optional) fastaWithAdapters file for ILLUMINACLIP parameter
trimmomatic_illuminaclip:
qcfilter_adaptertrim__trimmomatic_illuminaclip:
tool: trimmomatic
rule: trimmomatic_SE
rule: qcfilter_adaptertrim_trimmomatic_SE
type: text
label: (optional) See ILLUMINACLIP in documentation <seed mismatches>:<palindrome
clip threshold>:<simple clip threshold>:<minAdapterLength>:<keepBothReads>
trimmomatic_otherparams:
qcfilter_adaptertrim__trimmomatic_otherparams:
tool: trimmomatic
rule: trimmomatic_SE
rule: qcfilter_adaptertrim_trimmomatic_SE
type: text
label: (optional) See documentation for other trimmomatic parameters (LEADING,
TRAILING, MINLEN, ...)
flash_threads:
merge_overlapps__flash_threads:
tool: flash
rule: flash
rule: merge_overlapps_flash
type: numeric
label: Number of threads to use
flash_min_overlap:
merge_overlapps__flash_min_overlap:
tool: flash
rule: flash
rule: merge_overlapps_flash
type: numeric
label: '-m : The minimum required overlap length between two reads to provide
a confident overlap'
flash_max_overlap:
merge_overlapps__flash_max_overlap:
tool: flash
rule: flash
rule: merge_overlapps_flash
type: numeric
label: '-M : Maximum overlap length expected in approximately 90% of read pairs'
flash_max_mismatch_density:
merge_overlapps__flash_max_mismatch_density:
tool: flash
rule: flash
rule: merge_overlapps_flash
type: numeric
label: '-x : Maximum allowed ratio between the number of mismatched base pairs
and the overlap length'
demultiplexing_astrid_cruaud_threads:
demultiplexing__demultiplexing_astrid_cruaud_threads:
tool: demultiplexing_astrid_cruaud
rule: demultiplexing_astrid_cruaud
rule: demultiplexing_demultiplexing_astrid_cruaud
type: numeric
label: Threads to use
demultiplexing_astrid_cruaud_barcodes_select:
demultiplexing__demultiplexing_astrid_cruaud_barcodes_select:
tool: demultiplexing_astrid_cruaud
rule: demultiplexing_astrid_cruaud
rule: demultiplexing_demultiplexing_astrid_cruaud
type: select
demultiplexing_astrid_cruaud_barcodes:
demultiplexing__demultiplexing_astrid_cruaud_barcodes:
tool: demultiplexing_astrid_cruaud
rule: demultiplexing_astrid_cruaud
rule: demultiplexing_demultiplexing_astrid_cruaud
type: input_file
label: Barcode file
velvet_hash_length:
assembling__velvet_hash_length:
tool: velvet
rule: velvet
rule: assembling_velvet
type: text
label: 'hash_length : EITHER an odd integer (if even, it will be decremented)
<= 31 (if above, will be reduced) OR: m,M,s where m and M are odd integers (if
not, they will be decremented) with m < M <= 31 (if above, will be reduced)
and s is a step (even number). Velvet will then hash from k=m to k=M with a
step of s'
velvet_min_contig_lgth:
assembling__velvet_min_contig_lgth:
tool: velvet
rule: velvet
rule: assembling_velvet
type: numeric
label: '-min_contig_lgth : minimum contig length exported to contigs.fa file (default:
hash length * 2)'
lastz_reference_fasta_select:
contigmaptouce__lastz_reference_fasta_select:
tool: lastz
rule: lastz
rule: contigmaptouce_lastz
type: select
lastz_reference_fasta:
contigmaptouce__lastz_reference_fasta:
tool: lastz
rule: lastz
rule: contigmaptouce_lastz
type: input_file
label: Path to reference fasta file
mapping_check_bf__samtools_stats_threads:
tool: samtools_stats
rule: mapping_check_bf_samtools_stats
type: numeric
label: Number of threads to use
mapping_check_af__samtools_stats_threads:
tool: samtools_stats
rule: mapping_check_af_samtools_stats
type: numeric
label: Number of threads to use
prepare_report_scripts:
- flash.prepare.report.py
- lastz.prepare.report.py
- contigmaptouce__lastz.prepare.report.py
prepare_report_outputs:
flash:
- flash_combinations_mqc.csv
lastz:
contigmaptouce__lastz:
- lastz_stats_mqc.csv
outputs:
fastqc:
fastqc_SE:
quality_control__fastqc:
quality_control__fastqc_SE:
- name: html
file: '{sample}_fastqc.html'
description: Rapport html fastqc
- name: zip
file: '{sample}_fastqc.zip'
description: Dossier zip fastqc
fastqc_PE:
quality_control__fastqc_PE:
- name: html1
file: '{sample}_R1_fastqc.html'
description: Rapport html fastqc
......@@ -203,8 +236,8 @@ outputs:
- name: zip2
file: '{sample}_R2_fastqc.zip'
description: Dossier zip fastqc
trimmomatic:
trimmomatic_PE:
qcfilter_adaptertrim__trimmomatic:
qcfilter_adaptertrim__trimmomatic_PE:
- name: readFP
file: '{sample}_forward_paired.fq.gz'
description: Reads forward paired
......@@ -217,12 +250,12 @@ outputs:
- name: readRU
file: '{sample}_reverse_unpaired.fq.gz'
description: Reads reverse unpaired
trimmomatic_SE:
qcfilter_adaptertrim__trimmomatic_SE:
- name: read
file: '{sample}_trimmed.fq.gz'
description: Reads trimmed
flash:
flash:
merge_overlapps__flash:
merge_overlapps__flash:
- name: extendedFrags
file: '{sample}.extendedFrags.fastq'
description: The merged reads.
......@@ -238,13 +271,13 @@ outputs:
- name: histogram
file: '{sample}.histogram'
description: Visual histogram of merged read lengths.
demultiplexing_astrid_cruaud:
demultiplexing_astrid_cruaud:
demultiplexing__demultiplexing_astrid_cruaud:
demultiplexing__demultiplexing_astrid_cruaud:
- name: demultiplexed
file: '{sample}_fq'
description: The demultiplexed reads.
velvet:
velvet:
assembling__velvet:
assembling__velvet:
- name: contigs
file: '{sample}_contigs.fa'
description: fasta file of contigs longer than twice hash length
......@@ -255,15 +288,33 @@ outputs:
- name: LastGraph
file: '{sample}_LastGraph'
description: special formatted file with all the information on the final graph
lastz:
lastz:
contigmaptouce__lastz:
contigmaptouce__lastz:
- name: align_on_ref
file: '{sample}.vs_probes_default.sam'
description: Contigs mapping on reference
mapping_check_bf__samtools_stats:
mapping_check_bf__samtools_stats:
- name: stats
file: '{sample}_samtools_stats.txt'
description: samtools stats command output
mapping_check_af__samtools_stats:
mapping_check_af__samtools_stats:
- name: stats
file: '{sample}_samtools_stats.txt'
description: samtools stats command output
convert_to_phylip__sam_to_phylip:
convert_to_phylip__sam_to_phylip:
- name: phylips
file: '{}.phylip'
description: phylips files
multiqc:
fastqc: fastqc
trimmomatic: trimmomatic
flash: custom
flash: flash
demultiplexing_astrid_cruaud: custom
velvet: custom
lastz: custom
samtools_stats: samtools
sam_to_phylip: custom
stop_cases: {}
......@@ -23,10 +23,10 @@ out ="""# description: 'LASTZ mapping statistics'
#out += '\t'.join(['sample','totalpairs','discardpairs','%discard','combopairs','inniepairs','outiepairs','uncombopairs','%combo']) + '\n'
header = ""
stats = ""
for file in os.listdir(os.path.join(config["results_dir"],config["lastz_output_dir"],"stats")):
for file in os.listdir(os.path.join(config["results_dir"],config["contigmaptouce__lastz_output_dir"],"stats")):
stat = ""
sample = get_sample(file)
with open(os.path.join(config["results_dir"],config["lastz_output_dir"],"stats",file), 'r') as logfile:
with open(os.path.join(config["results_dir"],config["contigmaptouce__lastz_output_dir"],"stats",file), 'r') as logfile:
header = logfile.readline()
stat = logfile.readline()
stats += sample + '\t' + stat
......@@ -34,5 +34,5 @@ for file in os.listdir(os.path.join(config["results_dir"],config["lastz_output_d
header = "SAMPLE\t" + header
out += header + stats
with open(os.path.join(config["results_dir"],config["lastz_output_dir"],"lastz_stats_mqc.csv"), "w") as outfile:
with open(os.path.join(config["results_dir"],config["contigmaptouce__lastz_output_dir"],"lastz_stats_mqc.csv"), "w") as outfile:
outfile.write(out)
import oyaml as yaml
import sys
import re
import os
from collections import OrderedDict
# Load the workflow parameter file named by the first command-line argument
# and keep only its "params" mapping; the rest of the script indexes `config`
# directly (e.g. config["results_dir"]).
# NOTE(review): yaml.load with FullLoader is used on this file; if it only
# ever holds plain data, yaml.safe_load would be the more defensive choice.
with open(sys.argv[1], 'r') as paramfile:
    config = yaml.load(paramfile, Loader=yaml.FullLoader)["params"]
def get_field(field, slog, fl=False):
    """Return the number that follows ``<field>:`` in a log text.

    Args:
        field: Label to look for. It is inserted into the regular expression
            as-is, so regex metacharacters in it keep their special meaning.
        slog: Complete log text to search.
        fl: When true the value is returned as ``float``, otherwise as ``int``.

    Returns:
        The first value found after the label, or ``0`` when the label
        followed by a well-formed number does not occur in ``slog``.

    Raises:
        ValueError: If ``fl`` is false and the matched number has a
            fractional part.
    """
    # Capture one well-formed decimal numeral only ("12", "12.5" or ".5").
    # A looser character class would also accept a lone "." or a trailing
    # sentence period ("1000."), which int()/float() then reject.
    pattern = field + r'\:\s+(\d+(?:\.\d+)?|\.\d+)'
    match = re.search(pattern, slog)
    if match is None:
        return 0
    number = match.group(1)
    return float(number) if fl else int(number)
def get_sample(logfile):
    """Derive the sample name from a FLASh log file name.

    Args:
        logfile: File name or path of a log file.

    Returns:
        The base name of ``logfile`` with ``_flash_log.txt`` removed, or
        ``"unknown"`` when the name does not contain that marker.
    """
    marker = "_flash_log.txt"
    if marker not in logfile:
        return "unknown"
    return os.path.basename(logfile).replace(marker, "")
# Comment header of the generated table file (description, section name and
# plot type), followed by the tab-separated column names.
out ="""# description: 'FLASh read combination statistics'
# section_name: 'FLASh combinations'
# plot_type: 'table'
"""
out += '\t'.join(['sample','totalpairs','discardpairs','%discard','combopairs','inniepairs','outiepairs','uncombopairs','%combo']) + '\n'

flash_log_dir = os.path.join(config["results_dir"], "logs/flash")

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the table stable from one run to the next.
for file in sorted(os.listdir(flash_log_dir)):
    sample = get_sample(file)
    with open(os.path.join(flash_log_dir, file), 'r') as logfile:
        slog = logfile.read()
    # Insertion order of `res` must match the column order of the header line.
    res = OrderedDict()
    res['totalpairs'] = get_field('Total pairs', slog)
    res['discardpairs'] = get_field('Discarded pairs', slog)
    res['percdiscard'] = get_field('Percent Discarded', slog, fl=True)
    res['combopairs'] = get_field('Combined pairs', slog)
    res['inniepairs'] = get_field('Innie pairs', slog)
    res['outiepairs'] = get_field('Outie pairs', slog)
    res['uncombopairs'] = get_field('Uncombined pairs', slog)
    res['perccombo'] = get_field('Percent combined', slog, fl=True)
    out += sample + '\t' + '\t'.join(str(x) for x in res.values()) + '\n'

with open(os.path.join(config["results_dir"],"flash/flash_combinations_mqc.csv"), "w") as outfile:
    outfile.write(out)
\ No newline at end of file
base_tools:
snakemake: 5.9.1
multiqc: 1.8
multiqc: 1.9
fastqc: 0.11.5
trimmomatic: '0.38'
flash: 1.2.11
demultiplexing_astrid_cruaud: ''
velvet: 1.2.10
lastz: 1.04.03
samtools_stats: '1.9'
sam_to_phylip: ''