Commit 12d034b9 authored by khalid's avatar khalid
Browse files

Add lulu for post clustering curation

parent 277d26f4
# Post-clustering curation of an OTU table with the LULU R package.
#
# Inputs (supplied by <step_name>__lulu_inputs(); the shell section uses):
#   sequence_file  - OTU sequences, used as both query and database
#   abundance_file - tab-separated OTU table, first column used as row names
# Outputs:
#   curated_table  - curated_result$curated_table written as TSV
#   otu_map        - curated_result$otu_map written as TSV
# Params:
#   output_dir     - step output directory (already ends with "/")
#   methode        - "blast" selects makeblastdb + blastn; any other value
#                    selects vsearch
#   minimum_ratio, minimum_match - forwarded to lulu() as numerics
#
# Steps:
#   1. Build match_list.txt (query, target, % identity) by matching the OTU
#      sequences against themselves.
#   2. Feed an R script to Rscript on stdin; its output is piped through tee
#      into the log file.
#
# Notes on the shell string:
#   * The heredoc is unquoted, so the shell expands $otutab and $matchlist
#     inside the R code. Backslashes meant for the shell or R are doubled
#     (\\$ and \\t) so that Python does not see invalid escape sequences.
#   * "<<-" strips leading TABs only. The heredoc body and its E0F terminator
#     are therefore kept flush-left; do not indent them with spaces.
rule <step_name>__lulu:
    input:
        **<step_name>__lulu_inputs(),
    output:
        curated_table = config["results_dir"] + "/" + config["<step_name>__lulu_output_dir"] + "/curated_table.tsv",
        otu_map = config["results_dir"] + "/" + config["<step_name>__lulu_output_dir"] + "/otu_map.tsv",
    params:
        output_dir = config["results_dir"] + "/" + config["<step_name>__lulu_output_dir"] + "/",
        methode = config["<step_name>__lulu_matcher_method"],
        minimum_ratio = config["<step_name>__lulu_minimum_ratio"],
        minimum_match = config["<step_name>__lulu_minimum_match"],
    log:
        config["results_dir"] + "/logs/" + config["<step_name>__lulu_output_dir"] + "/lulu_log.txt"
    shell:
        """
        otutab="{input.abundance_file}"
        matchlist="{params.output_dir}match_list.txt"
        if [ "{params.methode}" = "blast" ]
        then
            makeblastdb -in "{input.sequence_file}" -parse_seqids -dbtype nucl
            blastn -db "{input.sequence_file}" -outfmt '6 qseqid sseqid pident' -out "$matchlist" -qcov_hsp_perc 80 -perc_identity 84 -query "{input.sequence_file}"
        else
            vsearch --usearch_global "{input.sequence_file}" --db "{input.sequence_file}" --self --id .84 --iddef 1 --userout "$matchlist" --userfields query+target+id --maxaccepts 0 --query_cov .9 --maxhits 10
        fi
        Rscript --no-save - "{params.minimum_ratio}" "{params.minimum_match}" "{output.curated_table}" "{output.otu_map}" <<-E0F |& tee "{log}"
library(lulu)
cat("R loaded")
args <- commandArgs(TRUE)
cat("Script args are:")
args
otutab <- read.csv("$otutab", sep = "\\t", header = TRUE, as.is = TRUE, row.names = 1)
matchlist <- read.table("$matchlist", header = FALSE, as.is = TRUE, stringsAsFactors = FALSE)
curated_result <- lulu(otutab, matchlist, minimum_ratio_type = "min", minimum_ratio = as.numeric(args[1]), minimum_match = as.numeric(args[2]), minimum_relative_cooccurence = 0.95)
write.table(curated_result\\$curated_table, file = args[3], append = FALSE, quote = FALSE, sep = "\\t", dec = ".", row.names = TRUE, col.names = NA)
write.table(curated_result\\$otu_map, file = args[4], append = FALSE, quote = FALSE, sep = "\\t", dec = ".", row.names = TRUE, col.names = NA)
E0F
        """
# Tool descriptor for the LULU post-clustering curation step.
#
# Sections:
#   commands  - the single "lulu" command: its input files, output files and
#               user-tunable options
#   install   - shell lines used to install lulu, VSEARCH and BLAST+
#   citations - reference to cite when the tool is used
#
# The option names (lulu_matcher_method, lulu_minimum_ratio,
# lulu_minimum_match) match the suffixes of the config keys read by the
# <step_name>__lulu rule.
{
  id: lulu,
  name: lulu post clustering,
  article: "https://www.nature.com/articles/s41467-017-01312-x",
  website: "https://github.com/tobiasgf/lulu",
  git: "https://github.com/tobiasgf/lulu",
  description: "An R package for distribution based post clustering curation of amplicon data.",
  version: "latest",
  documentation: "https://github.com/tobiasgf/lulu",
  multiqc: "custom",
  commands:
    [
      {
        name: lulu,
        cname: "lulu post clustering",
        command: "",
        category: "metabarcoding",
        output_dir: lulu,
        inputs: [
          { name: "abundance_file", type: "tsv", file: "abundance.tsv", description: "OTU table with samples as columns and OTUs as rows, with unique OTU id's as row names" },
          { name: sequence_file, type: "fasta_file", file: "OTU_seqs.fasta", description: "OTU sequences" },
        ],
        outputs:
          [
            { name: curated_table, type: "tsv", file: "curated_table.tsv", description: "Table of retained OTUs" },
            { name: otu_map, type: "tsv", file: "otu_map.tsv", description: "Table of information of which daughters were mapped to which parents OTUs" },
          ],
        options:
          [
            # Tool used to build the pairwise OTU match list.
            {
              name: lulu_matcher_method,
              type: radio,
              value: "blast",
              choices: [blast: blast, vsearch: vsearch],
              label: "tool for pair wise matching of OTU sequences",
            },
            # Passed to lulu() as minimum_ratio.
            {
              name: lulu_minimum_ratio,
              type: numeric,
              value: 1,
              min: 0,
              max: 1,
              step: 0.01,
              label: "minimum abundance ratio between a potential error and a potential parent",
            },
            # Passed to lulu() as minimum_match.
            {
              name: lulu_minimum_match,
              type: numeric,
              value: 84,
              min: 0,
              max: 100,
              step: 1,
              label: "minimum threshold (%) of sequence similarity for considering any OTU as an error of another",
            }
          ],
      },
    ],
  install: {
    lulu: [
      "Rscript -e 'devtools::install_github(\"tobiasgf/lulu\")' "
    ],
    VSEARCH: [
      "cd /opt/biotools/",
      " wget https://github.com/torognes/vsearch/releases/download/v2.21.1/vsearch-2.21.1-linux-x86_64.tar.gz",
      " tar -zxvf vsearch-2.21.1-linux-x86_64.tar.gz",
      " mv vsearch-2.21.1-linux-x86_64/bin/vsearch /opt/biotools/bin",
      " rm -rf vsearch-2.21.1-linux-x86_64*"
    ],
    blast: [
      "cd /opt/biotools/",
      "wget -O ncbi-blast-2.12.0+.tar.gz https://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.12.0/ncbi-blast-2.12.0+-x64-linux.tar.gz",
      "tar -xvzf ncbi-blast-2.12.0+.tar.gz",
      "ENV PATH $PATH:/opt/biotools/ncbi-blast-2.12.0+/bin"
    ],
  },
  citations: {
    lulu: [
      "Frøslev, T. G., Kjøller, R., Bruun, H. H., Ejrnæs, R., Brunbjerg, A. K., Pietroni, C., & Hansen, A. J. (2017). Algorithm for post-clustering curation of DNA amplicon data yields reliable biodiversity estimates. Nature Communications, 8(1), 1188."
    ],
  }
}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment