step4.sf 2.8 KB
Newer Older
peguerin's avatar
peguerin committed
1
# Load workflow settings; the rules below read config["assign_taxon"] from it.
configfile: "config.yaml"
peguerin's avatar
peguerin committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# One entry per file matching raw/<run>_R1.fastq.gz; the value of the {run}
# wildcard is collected for every match.
RUNS = glob_wildcards('raw/{run}_R1.fastq.gz').run

rule all:
    # Default target: for every run in RUNS, request each per-run FASTA,
    # the CSV table, and the log file of every step defined in this file.
    input:
        expand(
            [
                'runs/{run}_run.fasta',
                'runs/{run}_run.uniq.fasta',
                'runs/{run}_run.tag.u.fasta',
                'runs/{run}_run.a.t.u.fasta',
                'runs/{run}_run.s.a.t.u.fasta',
                'tables/{run}.csv',
                'log/dereplicate_runs/{run}.log',
                'log/assign_taxon/{run}.log',
                'log/rm_attributes/{run}.log',
                'log/sort_runs/{run}.log',
                'log/table_runs/{run}.log',
            ],
            run=RUNS,
        )

### Dereplicate and merge samples together
rule dereplicate_runs:
    # Runs `obiuniq -m sample` on the per-run FASTA. Standard output becomes
    # the .uniq.fasta file; standard error is captured in the rule's log.
    input: "runs/{run}_run.fasta"
    output: "runs/{run}_run.uniq.fasta"
    log: "log/dereplicate_runs/{run}.log"
    shell:
        "obiuniq -m sample {input} > {output} 2> {log}"

### Assign each sequence to a taxon
rule assign_taxon:
    # Runs ecotag on the dereplicated sequences of one run. Standard output
    # becomes the .tag.u.fasta file; standard error is captured in the log.
    input:
        'runs/{run}_run.uniq.fasta'
    output:
        'runs/{run}_run.tag.u.fasta'
    # Both parameters are read from the "assign_taxon" section of the config
    # file: `bdr` is passed to ecotag's -d option and `fasta` to its -R option
    # (presumably the taxonomy database and the reference sequences -- confirm
    # against config.yaml). Entries must be comma-separated.
    params:
        bdr=config["assign_taxon"]["bdr"],
        fasta=config["assign_taxon"]["fasta"]
    log:
        'log/assign_taxon/{run}.log'
    shell:
        '''ecotag -d {params.bdr} -R {params.fasta} {input} > {output} 2> {log}'''

### Some unneeded attributes can be removed at this stage
rule rm_attributes:
    # Runs obiannotate with a list of --delete-tag options on the
    # taxon-assigned sequences. Standard output becomes the .a.t.u.fasta file;
    # standard error is captured in the rule's log.
    input: "runs/{run}_run.tag.u.fasta"
    output: "runs/{run}_run.a.t.u.fasta"
    log: "log/rm_attributes/{run}.log"
    shell:
        # The trailing backslashes join the lines below into a single command.
        """obiannotate --delete-tag=scientific_name_by_db --delete-tag=obiclean_samplecount \
 --delete-tag=obiclean_count --delete-tag=obiclean_singletoncount \
 --delete-tag=obiclean_cluster --delete-tag=obiclean_internalcount \
 --delete-tag=obiclean_head  --delete-tag=obiclean_headcount \
 --delete-tag=id_status --delete-tag=rank_by_db --delete-tag=obiclean_status \
 --delete-tag=seq_length_ori --delete-tag=sminL --delete-tag=sminR \
 --delete-tag=reverse_score --delete-tag=reverse_primer --delete-tag=reverse_match --delete-tag=reverse_tag \
 --delete-tag=forward_tag --delete-tag=forward_score --delete-tag=forward_primer --delete-tag=forward_match \
 --delete-tag=tail_quality {input} > {output} 2> {log}"""

### The sequences can be sorted by decreasing order of count
rule sort_runs:
    # Runs `obisort -k count -r` on the cleaned sequences. Standard output
    # becomes the .s.a.t.u.fasta file; standard error goes to the rule's log.
    input: "runs/{run}_run.a.t.u.fasta"
    output: "runs/{run}_run.s.a.t.u.fasta"
    log: "log/sort_runs/{run}.log"
    shell:
        "obisort -k count -r {input} > {output} 2> {log}"

### Generate a table of the final results
rule table_runs:
    # Runs `obitab -o` on the sorted sequences. Standard output becomes the
    # per-run CSV table; standard error goes to the rule's log.
    input: "runs/{run}_run.s.a.t.u.fasta"
    output: "tables/{run}.csv"
    log: "log/table_runs/{run}.log"
    shell:
        "obitab -o {input} > {output} 2> {log}"