### Per-sample filtering and cleaning rules.
### Every rule runs inside the container named by config["container"] and
### redirects the tool's stdout to {output} and its stderr to {log}.
### Log paths start with '../', i.e. they resolve one level above the
### workflow's working directory.


### Dereplicate reads into unique sequences (obiuniq, called with `-m sample`).
rule dereplicate_samples:
    input:
        '01-raw/{run}/{sample}.fasta'
    output:
        '02-filtered/{run}/{sample}.uniq.fasta'
    singularity:
        config["container"]
    log:
        '../99-log/05-dereplicate_samples/{run}/{sample}.log'
    params:
        # Directory that receives the output; created by the shell command below.
        dir='02-filtered/{run}/'
    shell:
        '''mkdir -p {params.dir}; obiuniq -m sample {input} > {output} 2> {log}'''


### Keep only sequences that:
###   - consist solely of A/C/G/T (no IUPAC ambiguity codes),
###   - have `count` strictly greater than config["good_length_samples"]["count"],
###   - have `seq_length` strictly greater than
###     config["good_length_samples"]["seq_length"].
### NOTE(review): the previous comment quoted thresholds of 20 bp and 10 reads;
### the effective values come from the config file -- confirm they match.
rule goodlength_samples:
    input:
        '02-filtered/{run}/{sample}.uniq.fasta'
    output:
        '02-filtered/{run}/{sample}.l.u.fasta'
    singularity:
        config["container"]
    log:
        '../99-log/06-goodlength_samples/{run}/{sample}.log'
    params:
        count=config["good_length_samples"]["count"],
        seq_length=config["good_length_samples"]["seq_length"]
    shell:
        '''obigrep -p 'count>{params.count}' -s '^[ACGT]+$' -p 'seq_length>{params.seq_length}' {input} > {output} 2> {log}'''


### Clean the sequences for PCR/sequencing errors (sequence variants) with
### obiclean; the value passed to `-r` is config["clean_pcrerr_samples"]["r"].
rule clean_pcrerr_samples:
    input:
        '02-filtered/{run}/{sample}.l.u.fasta'
    output:
        '02-filtered/{run}/{sample}.r.l.u.fasta'
    singularity:
        config["container"]
    log:
        '../99-log/07-clean_pcrerr/{run}/{sample}.log'
    params:
        r=config["clean_pcrerr_samples"]["r"]
    shell:
        '''obiclean -r {params.r} {input} > {output} 2> {log}'''


### Remove sequences classified as 'internal' by obiclean, i.e. keep only
### records for which `obiclean_internalcount == 0`.
rule rm_internal_samples:
    input:
        '02-filtered/{run}/{sample}.r.l.u.fasta'
    output:
        '03-cleaned/{run}/{sample}.c.r.l.u.fasta'
    params:
        # Directory that receives the output; created by the shell command below.
        dir='03-cleaned/{run}/'
    singularity:
        config["container"]
    log:
        '../99-log/08-rm_internal_samples/{run}/{sample}.log'
    shell:
        # The string must open with exactly three quotes: a fourth one becomes
        # a literal, unterminated `'` at the start of the shell command.
        '''mkdir -p {params.dir}; obigrep -p "obiclean_internalcount == 0" {input} > {output} 2> {log}'''