### Dereplicate reads into unique sequences
rule dereplicate_samples:
    """
    Dereplicate the reads of one demultiplexed sample with obiuniq.

    obiuniq is called with `-m sample`; its stdout becomes the output fasta
    and its stderr goes to the log file. Before that, the shell command
    creates the directory 01_dereplicated/<projmarkrun>, where <projmarkrun>
    is read from the dfMultiChecked row matching the `demultiplexs` wildcard.
    """
    input:
        "../03_demultiplex/02_raw/{demultiplexs}.fasta"
    output:
        "01_dereplicated/{demultiplexs}.uniq.fasta"
    log:
        "../99_log/04_filter_samples/01_dereplicated/{demultiplexs}.log"
    # dmulti: first dfMultiChecked record (as a dict) whose `demultiplex`
    # column equals the wildcard; raises IndexError when no row matches.
    params:
        dmulti=lambda wildcards: dfMultiChecked.loc[
            dfMultiChecked.demultiplex == wildcards.demultiplexs
        ].to_dict(orient="records")[0]
    singularity:
        config["singularity"]["obitools"]
    shell:
        "mkdir -p 01_dereplicated/{params.dmulti[projmarkrun]}; "
        "obiuniq -m sample {input} > {output} 2> {log}"

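### Keep only sequences made solely of A/C/G/T (no IUPAC ambiguity codes), longer than `seq_length` (e.g. 20 bp) and with a total count greater than `seq_count` (e.g. 10 reads); both thresholds are read from config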
rule goodlength_samples:
    """
    Filter the dereplicated sequences of one sample with obigrep.

    obigrep is given three conditions:
      * `count` greater than config["good_length_samples"]["seq_count"];
      * sequence matching the pattern ^[ACGT]+$ (no ambiguity codes);
      * `seq_length` greater than config["good_length_samples"]["seq_length"].
    obigrep's stdout becomes the output fasta; its stderr goes to the log.
    NOTE(review): records are presumably kept only when all three conditions
    hold -- confirm against the OBITools obigrep documentation.
    """
    input:
        '01_dereplicated/{demultiplexs}.uniq.fasta'
    output:
        '02_goodlength/{demultiplexs}.l.u.fasta'
    singularity:
        config["singularity"]["obitools"]
    log:
        '../99_log/04_filter_samples/02_goodlength/{demultiplexs}.log'
    # Thresholds are taken from the workflow config rather than hard-coded.
    params:
        seq_count=config["good_length_samples"]["seq_count"],
        seq_length=config["good_length_samples"]["seq_length"]
    shell:
        '''obigrep  -p 'count>{params.seq_count}' -s '^[ACGT]+$' -p 'seq_length>{params.seq_length}' {input} > {output} 2> {log}'''

### Clean the sequences of PCR/sequencing errors (sequence variants)
rule clean_pcrerr_samples:
    """
    Run obiclean on the length-filtered sequences of one sample.

    obiclean is called with `-r` set to config["clean_pcrerr_samples"]["r"].
    When the input file has zero size, obiclean is skipped and an empty
    output file is created with `touch` instead.
    """
    input:
        "02_goodlength/{demultiplexs}.l.u.fasta"
    output:
        "03_clean_pcrerr/{demultiplexs}.r.l.u.fasta"
    log:
        "../99_log/04_filter_samples/03_clean_pcrerr/{demultiplexs}.log"
    params:
        r=config["clean_pcrerr_samples"]["r"]
    singularity:
        config["singularity"]["obitools"]
    shell:
        "if [[ -s {input} ]]; then "
        "obiclean -r {params.r} {input} > {output} 2> {log} ; "
        "else touch {output} 2> {log} ; fi"

### Remove sequences that are classified as 'internal' by obiclean
rule rm_internal_samples:
    """
    Keep only the obiclean output records whose `obiclean_internalcount` is 0.

    When the input file is non-empty, the shell command creates the directory
    04_filtered/<projmarkrun> (read from the dfMultiChecked row matching the
    `demultiplexs` wildcard) and runs obigrep. When the input has zero size,
    an empty output file is created with `touch` instead.
    """
    input:
        "03_clean_pcrerr/{demultiplexs}.r.l.u.fasta"
    output:
        "04_filtered/{demultiplexs}.c.r.l.u.fasta"
    log:
        "../99_log/04_filter_samples/04_filtered/{demultiplexs}.log"
    # dmulti: first dfMultiChecked record (as a dict) whose `demultiplex`
    # column equals the wildcard; raises IndexError when no row matches.
    params:
        dmulti=lambda wildcards: dfMultiChecked.loc[
            dfMultiChecked.demultiplex == wildcards.demultiplexs
        ].to_dict(orient="records")[0]
    singularity:
        config["singularity"]["obitools"]
    shell:
        'if [[ -s {input} ]]; then '
        'mkdir -p 04_filtered/{params.dmulti[projmarkrun]}; '
        'obigrep -p "obiclean_internalcount == 0" {input} > {output} 2> {log} ; '
        'else touch {output} 2> {log} ; fi'