Commit 3be67b57 authored by peguerin's avatar peguerin
Browse files

Merge branch 'master' of gitlab.mbb.univ-montp2.fr:edna/snakemake_only_obitools

parents 0d08d7e2 c6529eb7
### dereplicate reads into uniq sequences
rule dereplicate_samples:
input:
'01-runs/{run}/{sample}.fasta'
'01-raw/{run}/{sample}.fasta'
output:
'02-filtered/{run}/{sample}.uniq.fasta'
singularity:
......
configfile: "config.yaml"

# Discover run names and barcode-file names from the filesystem.
RUNS, = glob_wildcards('raw/{run}_R1.fastq.gz')
BARCODES, = glob_wildcards('barcodes/{barcode}.dat')

# Map each run to its barcode file. The previous index-based loop raised
# IndexError whenever there were more barcode files than runs; zip() is
# safe against a length mismatch (extra entries are ignored).
# NOTE(review): the pairing still assumes glob_wildcards() lists runs and
# barcodes in matching order -- confirm the two directories correspond 1:1.
DICBARCODES = {run: "barcodes/" + bc + ".dat" for run, bc in zip(RUNS, BARCODES)}
#print(DICBARCODES)
rule all:
    # Target rule: request every per-run product of the assembly and
    # demultiplexing stage so Snakemake schedules the rules below for
    # each run discovered in RUNS.
    input:
        expand('assembled/{run}/{run}.fastq', run=RUNS),
        expand('assembled/{run}/{run}.ali.fastq', run=RUNS),
        expand('assembled/{run}/{run}.ali.assigned.fastq', run=RUNS),
        expand('assembled/{run}/{run}.unidentified.fastq', run=RUNS),
        expand('log/remove_unaligned/{run}.log',run=RUNS),
        expand('log/illuminapairedend/{run}.log',run=RUNS),
        expand('log/assign_sequences/{run}.log',run=RUNS),
        expand('log/split_sequences/{run}.log',run=RUNS)
### Paired end alignment then keep reads with quality > 40
rule illuminapairedend:
    input:
        # Raw forward/reverse reads for one sequencing run.
        R1='raw/{run}_R1.fastq.gz',
        R2='raw/{run}_R2.fastq.gz'
    output:
        # Merged (assembled) reads for the run.
        fq='assembled/{run}/{run}.fastq'
    log:
        'log/illuminapairedend/{run}.log'
    params:
        # Minimum alignment score, read from config.yaml.
        s_min=config["illuminapairedend"]["s_min"]
    shell:
        '''illuminapairedend -r {input.R2} {input.R1} --score-min={params.s_min} > {output.fq} 2> {log}'''
### Remove unaligned sequence records
rule remove_unaligned:
    input:
        fq='assembled/{run}/{run}.fastq'
    output:
        # Only records whose `mode` attribute is not "joined" are kept.
        ali='assembled/{run}/{run}.ali.fastq'
    log:
        'log/remove_unaligned/{run}.log'
    shell:
        '''obigrep -p 'mode!=\"joined\"' {input.fq} > {output.ali} 2> {log}'''
### Assign each sequence record to the corresponding sample/marker combination
rule assign_sequences:
    input:
        # input[0]: aligned reads; input[1]: barcode file for this run,
        # looked up from the DICBARCODES mapping built above.
        'assembled/{run}/{run}.ali.fastq',
        lambda wildcards: DICBARCODES[wildcards.run]
    output:
        # Records matched to a sample go to `assign`; the rest to `unid`.
        assign='assembled/{run}/{run}.ali.assigned.fastq',
        unid='assembled/{run}/{run}.unidentified.fastq'
    log:
        'log/assign_sequences/{run}.log'
    shell:
        '''ngsfilter -t {input[1]} -u {output.unid} {input[0]} --fasta-output > {output.assign} 2> {log}'''
### Split the input sequence file in a set of subfiles according to the values of attribute `sample`
rule split_sequences:
    input:
        'assembled/{run}/{run}.ali.assigned.fastq'
    params:
        # Filename prefix for the per-sample files written by obisplit.
        'samples/{run}_sample_'
    log:
        'log/split_sequences/{run}.log'
    # NOTE(review): no `output:` directive -- Snakemake cannot track the
    # files obisplit writes under samples/; downstream Snakefiles re-glob
    # them from disk instead. Verify this is intentional.
    shell:
        '''obisplit -p "{params}" -t sample --fasta {input} 2> {log}'''
configfile: "config.yaml"

# Sample names discovered from the demultiplexed fasta files on disk
# (produced by the previous stage's split_sequences rule).
SAMPLES, = glob_wildcards('samples/{sample}.fasta')

rule all:
    # Target rule: request every per-sample product of the filtering stage.
    input:
        expand('samples/{sample}.uniq.fasta',sample=SAMPLES),
        expand('samples/{sample}.l.u.fasta',sample=SAMPLES),
        expand('samples/{sample}.r.l.u.fasta',sample=SAMPLES),
        expand('samples/{sample}.c.r.l.u.fasta',sample=SAMPLES),
        expand('log/dereplicate_samples/{sample}.log',sample=SAMPLES),
        expand('log/goodlength_samples/{sample}.log',sample=SAMPLES),
        expand('log/clean_pcrerr/{sample}.log',sample=SAMPLES),
        expand('log/rm_internal_samples/{sample}.log',sample=SAMPLES)
### dereplicate reads into uniq sequences
rule dereplicate_samples:
    input:
        'samples/{sample}.fasta'
    output:
        'samples/{sample}.uniq.fasta'
    log:
        'log/dereplicate_samples/{sample}.log'
    shell:
        '''obiuniq -m sample {input} > {output} 2> {log}'''
### only sequence more than 20bp with no ambiguity IUAPC with total coverage greater than 10 reads
rule goodlength_samples:
    input:
        'samples/{sample}.uniq.fasta'
    output:
        'samples/{sample}.l.u.fasta'
    log:
        'log/goodlength_samples/{sample}.log'
    params:
        # Thresholds come from config.yaml; the "20bp" and "10 reads" in the
        # header comment hold only if the config matches those values.
        count=config["good_length_samples"]["count"],
        seq_length=config["good_length_samples"]["seq_length"]
    shell:
        '''obigrep -p 'count>{params.count}' -s '^[ACGT]+$' -p 'seq_length>{params.seq_length}' {input} > {output} 2> {log}'''
### Clean the sequences for PCR/sequencing errors (sequence variants)
rule clean_pcrerr_samples:
    input:
        'samples/{sample}.l.u.fasta'
    output:
        'samples/{sample}.r.l.u.fasta'
    log:
        'log/clean_pcrerr/{sample}.log'
    params:
        # obiclean -r threshold, read from config.yaml.
        r=config["clean_pcrerr_samples"]["r"]
    shell:
        '''obiclean -r {params.r} {input} > {output} 2> {log}'''
### Remove sequence which are classified as 'internal' by obiclean
rule rm_internal_samples:
    input:
        'samples/{sample}.r.l.u.fasta'
    output:
        # Keeps only records with obiclean_internalcount == 0.
        'samples/{sample}.c.r.l.u.fasta'
    log:
        'log/rm_internal_samples/{sample}.log'
    shell:
        '''obigrep -p 'obiclean_internalcount == 0' {input} > {output} 2> {log}'''
# Run names discovered from the raw paired-end read files.
RUNS, = glob_wildcards('raw/{run}_R1.fastq.gz')

rule all:
    # Target rule: one concatenated fasta per run.
    input:
        expand('runs/{run}_run.fasta',run=RUNS)
### Concatenate sequences from each sample of the same run
rule cat_samples:
    # NOTE(review): no `input:` directive -- the files matched by the params
    # glob are invisible to Snakemake's dependency graph, so this rule will
    # not wait for, or rerun after, the rules that produce them. Verify the
    # orchestration runs the filtering stage first.
    params:
        # Shell glob, expanded by the shell in the `cat` command below.
        'samples/{run}*.c.r.l.u.fasta'
    output:
        'runs/{run}_run.fasta'
    shell:
        '''cat {params} > {output}'''
\ No newline at end of file
configfile: "config.yaml"

# Run names discovered from the raw paired-end read files.
RUNS, = glob_wildcards('raw/{run}_R1.fastq.gz')

rule all:
    # Target rule: request every per-run product of the taxonomic
    # assignment stage, ending with one CSV table per run.
    input:
        expand('runs/{run}_run.fasta',run=RUNS),
        expand('runs/{run}_run.uniq.fasta', run=RUNS),
        expand('runs/{run}_run.tag.u.fasta', run=RUNS),
        expand('runs/{run}_run.a.t.u.fasta', run=RUNS),
        expand('runs/{run}_run.s.a.t.u.fasta', run=RUNS),
        expand('tables/{run}.csv', run=RUNS),
        expand('log/dereplicate_runs/{run}.log', run=RUNS),
        expand('log/assign_taxon/{run}.log', run=RUNS),
        expand('log/rm_attributes/{run}.log', run=RUNS),
        expand('log/sort_runs/{run}.log', run=RUNS),
        expand('log/table_runs/{run}.log',run=RUNS)
### Dereplicate and merge samples together
rule dereplicate_runs:
    input:
        'runs/{run}_run.fasta'
    output:
        'runs/{run}_run.uniq.fasta'
    log:
        'log/dereplicate_runs/{run}.log'
    shell:
        '''obiuniq -m sample {input} > {output} 2> {log}'''
### Assign each sequence to a taxon
rule assign_taxon:
    input:
        'runs/{run}_run.uniq.fasta'
    output:
        'runs/{run}_run.tag.u.fasta'
    params:
        # Reference taxonomy database (-d) and reference fasta (-R),
        # both paths read from config.yaml.
        bdr=config["assign_taxon"]["bdr"],
        fasta=config["assign_taxon"]["fasta"]
    log:
        'log/assign_taxon/{run}.log'
    shell:
        '''ecotag -d {params.bdr} -R {params.fasta} {input} > {output} 2> {log}'''
### Some unuseful attributes can be removed at this stage
rule rm_attributes:
    input:
        'runs/{run}_run.tag.u.fasta'
    output:
        'runs/{run}_run.a.t.u.fasta'
    log:
        'log/rm_attributes/{run}.log'
    # Strip intermediate obiclean/ngsfilter bookkeeping attributes so the
    # final records keep only the fields needed for the result table.
    shell:
        '''obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
--delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
--delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
--delete-tag=obiclean_head --delete-tag=obiclean_headcount \
--delete-tag=id_status --delete-tag=rank_by_db --delete-tag=obiclean_status \
--delete-tag=seq_length_ori --delete-tag=sminL --delete-tag=sminR \
--delete-tag=reverse_score --delete-tag=reverse_primer --delete-tag=reverse_match --delete-tag=reverse_tag \
--delete-tag=forward_tag --delete-tag=forward_score --delete-tag=forward_primer --delete-tag=forward_match \
--delete-tag=tail_quality {input} > {output} 2> {log}'''
### The sequences can be sorted by decreasing order of count
rule sort_runs:
    input:
        'runs/{run}_run.a.t.u.fasta'
    output:
        'runs/{run}_run.s.a.t.u.fasta'
    log:
        'log/sort_runs/{run}.log'
    shell:
        '''obisort -k count -r {input} > {output} 2> {log}'''
### Generate a table final results
rule table_runs:
    input:
        'runs/{run}_run.s.a.t.u.fasta'
    output:
        # Final per-run result table produced by obitab.
        'tables/{run}.csv'
    log:
        'log/table_runs/{run}.log'
    shell:
        '''obitab -o {input} > {output} 2> {log}'''
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
configfile: "../config.yaml"
#configfile: "../config.yaml"
rule all:
......
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
configfile: "../config.yaml"
#configfile: "../config.yaml"
(RUNS,SAMPLES) = glob_wildcards('01-raw/{run}/{sample}.fasta')
......
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
configfile: "../config.yaml"
#configfile: "../config.yaml"
RUNS, = glob_wildcards('01-runs/{run}_run.fasta')
rule all:
......
......@@ -4,7 +4,7 @@ Metabarcoding Only_obitools workflow using SNAKEMAKE
[![https://www.singularity-hub.org/static/simg/hosted-singularity--hub-%23e32929.svg](https://www.singularity-hub.org/static/img/hosted-singularity--hub-%23e32929.svg)](https://singularity-hub.org/collections/2878)
**Pierre-Edouard Guerin, 2019**
**Virginie Marques, Pierre-Edouard Guerin, 2019**
_________________________________
......
__author__ = "Pierre-Edouard Guerin"
__license__ = "MIT"
configfile: "config.yaml"

rule all:
    # Target rule for the whole pipeline: raw fastq inputs, assembly and
    # demultiplexing products, cleaned samples via the filter_sample
    # subworkflow, concatenated per-run fasta, and the stage logs.
    input:
        expand("{folder}{run}_R1.fastq.gz", run=config["fastqFiles"],folder=config["fastqFolderPath"]),
        expand('01-assembly/{run}/{run}.fastq', run=config["fastqFiles"]),
        expand('01-assembly/{run}/{run}.ali.fastq', run=config["fastqFiles"]),
        expand('01-assembly/{run}/{run}.ali.assigned.fastq', run=config["fastqFiles"]),
        expand('01-assembly/{run}/{run}.unidentified.fastq', run=config["fastqFiles"]),
        filter_sample(expand('02-demultiplex/02-03-cleaned/{run}*.c.r.l.u.fasta',run=config["fastqFiles"])),
        expand('03-filtered/{run}_run.fasta',run=config["fastqFiles"]),
        expand('99-log/remove_unaligned/{run}.log',run=config["fastqFiles"]),
        expand('99-log/illuminapairedend/{run}.log',run=config["fastqFiles"]),
        expand('99-log/assign_sequences/{run}.log',run=config["fastqFiles"]),
        expand('99-log/split_sequences/{run}.log',run=config["fastqFiles"])
include: "00-rules/assembly.smk"
include: "00-rules/demultiplex.smk"
include: "02-demultiplex/Snakefile"
# Run the demultiplex/filter pipeline as a subworkflow: files wrapped in
# filter_sample(...) are built by 02-demultiplex/Snakefile before rules in
# this Snakefile that depend on them are executed.
subworkflow filter_sample:
    workdir:
        "."
    snakefile:
        "02-demultiplex/Snakefile"
    configfile:
        "config.yaml"

# Names discovered from files already present on disk.
SAMPLES, = glob_wildcards('02-demultiplex/02-03-cleaned/{sample}.c.r.l.u.fasta')
RUNS, = glob_wildcards('01-assembly/{run}_R1.fastq.gz')
### Concatenate sequences from each sample of the same run
rule cat_samples:
    input:
        # Cleaned per-sample fasta files requested from the filter_sample
        # subworkflow. NOTE(review): the pattern keeps a literal '*' after
        # expand() substitutes {run}, and Snakemake does not glob input
        # paths -- confirm this resolves to real files at runtime.
        filter_sample(expand('02-demultiplex/02-03-cleaned/{run}*.c.r.l.u.fasta',run=config["fastqFiles"]))
    output:
        # NOTE(review): rule all requests '03-filtered/{run}_run.fasta' but
        # this rule writes 'runs/{run}_run.fasta' -- confirm the intended
        # output directory.
        'runs/{run}_run.fasta'
    shell:
        '''cat {input} > {output}'''
\ No newline at end of file
......@@ -14,21 +14,26 @@
###############################################################################
## Usage:
## CORE=16
## bash main.sh $CORES
## CONFIG_FILE="config.yaml"
## bash main.sh $CORES $CONFIG_FILE
##
##
###############################################################################
CORES=$1
CONFIGFILE=$2
#CORES=16
#CONFIGFILE="config.yaml"
###############################################################################
## assemble & demultiplex
cd 01-assembly
snakemake -s Snakefile -j $CORES --use-singularity --singularity-args "--bind /media/superdisk:/media/superdisk" --latency-wait 120
snakemake -s Snakefile -j $CORES --use-singularity --singularity-args "--bind /media/superdisk:/media/superdisk --home $HOME" --latency-wait 120 --configfile "../"$CONFIGFILE
cd ..
###############################################################################
## filter sequences
cd 02-demultiplex
snakemake -s Snakefile -j $CORES --use-singularity --singularity-args "--bind /media/superdisk:/media/superdisk" --latency-wait 120
snakemake -s Snakefile -j $CORES --use-singularity --singularity-args "--bind /media/superdisk:/media/superdisk --home $HOME" --latency-wait 120 --configfile "../"$CONFIGFILE
cd ..
###############################################################################
## concatenate samples into run
......@@ -39,7 +44,7 @@ done
## taxonomic assignation & format
cd 03-filtered
#snakemake -s Snakefile -j 8 --dry-run --use-singularity --singularity-args "--bind /media/superdisk:/media/superdisk" --latency-wait 120
snakemake -s Snakefile -j $CORES --use-singularity --singularity-args "--bind /media/superdisk:/media/superdisk" --latency-wait 120
snakemake -s Snakefile -j $CORES --use-singularity --singularity-args "--bind /media/superdisk:/media/superdisk --home $HOME" --latency-wait 120 --configfile "../"$CONFIGFILE
cd ..
###############################################################################
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment