Commit ad18ccc9 authored by khalid's avatar khalid
Browse files

Prise en charge d'un fichier de config. listant les fichiers à analyser

parent 6c932ed0
......@@ -153,7 +153,7 @@ As CroCo is a bash script, and therefore do not require to be installed. However
- BLAST-2.5.0+
**Mandatory dependencies - at least one mapping tool in the following list** :
- Bowtie-1.1.2
- Bowtie-1.2.1
- Kallisto-0.43.0
- Rapmap-0.1.0
......@@ -361,18 +361,29 @@ docker run -t -i -P -v /home/user/where/data/are:/CroCoData:rw crocodock /bin/b
# Inputs
The transcriptomes and raw data to be analyzed must be present in a given directory indicated by the user with the option `--in` (by default, CroCo will look for them in the current directory).
Also, **CroCo requires file names unity between transcriptomes and raw reads** as follows:
The transcriptomes and raw data to be analyzed must be present in a given directory indicated by the user with the option `--in`.
CroCo will scan a configuration file (given with the option `--cnf`) for the list of transcriptomes and raw reads to analyse.
The format is as follows:
NAME.fasta NAME.R1.fastq NAME.R2.fastq
NOM.fasta NOM.R1.fastq NOM.R2.fastq
**for paired-end reads** :
NAME.fasta (assembled transcriptome seqs)
NAME.L.fastq (raw illumina data LEFT)
NAME.R.fastq (raw illumina data RIGHT)
NAME.R1.fastq (raw illumina data LEFT)
NAME.R2.fastq (raw illumina data RIGHT)
**for unpaired reads** :
NAME.fasta (assembled transcriptome seqs)
NAME.fastq (raw illumina data)
The only naming conventions are:
* The transcriptome files must end with ".fasta"
* If the raw Illumina reads are gzipped, the extension must be ".gz"
* The columns of the config file must be tab-separated
Transcriptomes should be in fasta format. It is good practice to avoid as much as possible any special characters (e.g. ``\/[]()|:;``) in sequence names, as the tools used within CroCo might complain about them. Also, CroCo will temporarily cut names after the first encountered spacing character, so please be sure that the first part (i.e. the first word) of every sequence name is sufficient as a unique ID. This is to handle long sequence names resulting from some assembling softwares, or richly annotated sequences. Of course, CroCo outputs assemblies with the original full sequence names provided.
Reads fastq files should use Phred33 as quality score scheme, which is usually the case by default
......@@ -545,7 +556,7 @@ It is possible this way to change these tools default parameter values according
*Example*:
Assuming you want to increase Bowtie precision by only allowing a maximum of 1 mismatch per alignment :
```bash
bash CroCo_v0.1.sh --mode u --tool B --add-option '-v 1'
bash CroCo_v0.1.sh --cnf configfile.txt --mode u --tool B --add-option '-v 1'
```
---
......
#!/bin/bash
# This tool will analyse all the .fasta files in the current directory
# The reads files must be in the same dir and must follow this naming convention
# FOR paired end: NAME.fasta, NAME.L.fastq, NAME.R.fastq
# FOR unpaired : NAME.fasta, NAME.fastq
# This tool will analyse all the .fasta files listed in a configuration file
# This config file must contain one line for each contig file and its reads in the form:
# contigs1.fasta reads1.R1.fastq reads1.R2.fastq
# contigs2.fasta reads2.R1.fastq reads2.R2.fastq
# FOR unpaired :
# contigsN.fasta readsN.fastq
# the columns must be separated by tabs
# fasta files and read files must be in the same dir and must follow this naming convention :
# contig files must be named with ".fasta" extension
# if read files are gzipped their extension must be ".gz"
my_dir="$(dirname "$0")"
source "$my_dir/path_management.sh"
......@@ -37,11 +45,32 @@ START_TIME=$SECONDS
if [ $RECAT == "no" ]; then
### setting sample names
i=0
for fasta in $INDIR/*.fasta ; do
fasta_array[$i]=`basename $fasta .fasta`
i=$(( i + 1 ))
done
# i=0
#for fasta in $INDIR/*.fasta ; do
# fasta_array[$i]=`basename $fasta .fasta`
# i=$(( i + 1 ))
# done
# Load the sample list from the tab-separated configuration file ($CONFFILE).
# Each line is: <contigs>.fasta <TAB> <reads R1> [<TAB> <reads R2>]
# fasta_array maps the sample name (fasta basename without ".fasta") to the
# record "fastaFile;R1;R2" (R2 is empty for unpaired reads).
declare -A fasta_array
# "|| [ -n ... ]" keeps a last line that has no trailing newline
while IFS=$'\t' read -r fastaFile R1 R2 || [ -n "$fastaFile" ]
do
	# a blank line would otherwise produce an empty associative-array key
	if [ -z "$fastaFile" ]; then
		continue
	fi
	fastaName=$(basename "$fastaFile" .fasta)
	fasta_array[$fastaName]="$fastaFile;$R1;$R2"
	# split the stored record back into an array (no word-splitting on spaces, no globbing)
	IFS=';' read -r -a InfosCtg <<< "${fasta_array[$fastaName]}"
	len=${#InfosCtg[@]}
	if [ "$len" -gt 2 ]; then
		parity="Paired"
	else
		parity="unpaired"
	fi
	echo "$fastaName" ": with " "$parity" "reads"
	# list exactly the files of this sample: valid indices are 0..len-1
	for (( r=0; r<len; r++ ))
	do
		echo "${InfosCtg[$r]}"
	done
done < "$CONFFILE"
### detecting suspect transcripts with BLAST
source "$my_dir/detect_suspect_trans_with_blast.sh"
......@@ -58,17 +87,15 @@ if [ $RECAT == "no" ]; then
### index contigs for the selected tool
case "$TOOL" in
B) toolidx="ALL_transcripts_bowtie_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;;
#B2) toolidx="ALL_transcripts_bowtie2_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie2-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;;
K) toolidx="ALL_transcripts_kallisto_index"; if [ ! -f $out/$toolidx ]; then kallisto index -i $out/$toolidx $out/ALL_transcripts.fasta; fi ;;
#S) toolidx="ALL_transcripts_salmon_index" ; if [ ! -d $out/$toolidx ]; then salmon --no-version-check index -t $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;;
R) toolidx="ALL_transcripts_rapmap_index" ; if [ ! -d $out/$toolidx ]; then rapmap quasiindex -t $out/ALL_transcripts.fasta -p -x $PROCESSORS -i $out/$toolidx; fi ;;
#H) toolidx="ALL_transcripts_hpg_index" ; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; hpg-aligner build-sa-index -g $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;;
esac
echo -e "\nIndex built : $out/$toolidx\n"
for (( j=0; j <i; j++ ))
#for (( j=0; j <i; j++ ))
for ref in "${!fasta_array[@]}"
do
ref=${fasta_array[$j]};
#ref=${fasta_array[$j]};
refseqs=$out/$ref".ctgs"
echo -e "Getting length of $ref transcripts"
awk -v ref=$ref -v refseqs=$refseqs 'BEGIN{RS=">"; FS="\t"}
......@@ -85,20 +112,27 @@ if [ $RECAT == "no" ]; then
cat $out/*.ctgs > $refseqALL
# mapping successively every read sets on all transcripts.
for (( k=0; k <i; k++ ))
#for (( k=0; k <i; k++ ))
for ref in "${!fasta_array[@]}"
do
reads=${fasta_array[$k]}"_reads"
fileout=$out/${fasta_array[$k]}"_vs_ALL.out"
# fasta_array[$ref] holds the record "fastaFile;R1;R2"; split it into a real
# array so that InfosCtg[1] / InfosCtg[2] are the read files of THIS sample.
# (A plain scalar assignment would only overwrite element 0 of InfosCtg and
# leave the read files of a previously parsed sample in elements 1 and 2.)
IFS=';' read -r -a InfosCtg <<< "${fasta_array[$ref]}"
reads=${ref}"_reads"
fileout=$out/${ref}"_vs_ALL.out"
finalout=$out/"ALL_transcripts.all"
echo -e "\nMapping ${fasta_array[$k]} reads"
#echo -e "\nMapping ${fasta_array[$k]} reads"
echo -e "\nMapping ${ref} reads"
# calculate expression level with selected tool
case "$TOOL" in
B) if [ $MODE == "u" ]
then
fastq=$INDIR"/"${fasta_array[$k]}".fastq"
#fastq=$INDIR"/"${fasta_array[$k]}".fastq"
fastq=$INDIR"/"${InfosCtg[1]}
else
fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
#fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
# paired mode: -1 takes the R1 file (index 1) and -2 the R2 file (index 2)
fastq=" -1 "$INDIR"/"${InfosCtg[1]}" -2 "$INDIR"/"${InfosCtg[2]}
fi
bowtie -p $PROCESSORS $ADDOPT -a --trim5 $TRIM5 --trim3 $TRIM3 --chunkmbs 2000 --suppress 1,2,4,5,6,7,8 $out/$toolidx/$toolidx $fastq | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
......@@ -107,54 +141,31 @@ if [ $RECAT == "no" ]; then
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
# B2) if [ $MODE == "u" ]
# then
# fastq=" -U "$INDIR"/"${fasta_array[$k]}".fastq"
# else
# fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
# fi
# bowtie2 -p $PROCESSORS --no-unal --no-head $ADDOPT -a --quiet --omit-sec-seq --trim5 $TRIM5 --trim3 $TRIM3 -x $out/$toolidx/$toolidx $fastq | grep -v "@" | cut -f3 | \
# awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
# END{
# print "Contig", reads;
# for (i in ctg) totRPK += ctg[i]/ctgsize[i];
# for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
# ;;
K) if [ $MODE == "u" ]
then
if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
echo -e "\nWarning : When using unpaired data with Kallisto, you need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
#fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${InfosCtg[1]}
else
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
#fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${InfosCtg[1]}
fi
else
fastq=$INDIR"/"${fasta_array[$k]}".L.fastq "$INDIR"/"${fasta_array[$k]}".R.fastq"
#fastq=$INDIR"/"${fasta_array[$k]}".L.fastq "$INDIR"/"${fasta_array[$k]}".R.fastq"
fastq=$INDIR"/"${InfosCtg[1]}" "$INDIR"/"${InfosCtg[2]}
fi
kallisto quant $ADDOPT --threads=$PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) }
{ if(NR>1) ctg[$1] = $5 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/abundance.tsv > $fileout
;;
# S) if [ $MODE == "u" ]
# then
# if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
# echo -e "\nWarning : When using unpaired data with Salmon, you might need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
# fastq=" --fldMean $FRAGLENGTH --fldSD $FRAGSD -l U -r "$INDIR"/"${fasta_array[$k]}".fastq"
# else
# fastq=" --fldMean $FRAGLENGTH --fldSD $FRAGSD -l U -r "$INDIR"/"${fasta_array[$k]}".fastq"
# fi
# else
# fastq=" -l IU -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq" #see http://salmon.readthedocs.org/en/latest/library_type.html#fraglibtype
# fi
# salmon --no-version-check quant $ADDOPT --threads $PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
# awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) }
# { if(NR > 11) ctg[$1] = $3 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/quant.sf > $fileout
# ;;
R) if [ $MODE == "u" ]
then
fastq=" -r "$INDIR"/"${fasta_array[$k]}".fastq"
#fastq=" -r "$INDIR"/"${fasta_array[$k]}".fastq"
fastq=" -r "$INDIR"/"${InfosCtg[1]}
else
fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
#fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fastq=" -1 "$INDIR"/"${InfosCtg[1]}" -2 "$INDIR"/"${InfosCtg[2]}
fi
rapmap quasimap -t $PROCESSORS $ADDOPT -i $out/$toolidx $fastq | grep -v "@" | cut -f3 | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
......@@ -163,20 +174,6 @@ if [ $RECAT == "no" ]; then
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
# H) if [ $MODE == "u" ]
# then
# fastq=" -fq="$INDIR"/"${fasta_array[$k]}".fastq"
# else
# fastq=" --fq="$INDIR"/"${fasta_array[$k]}".L.fastq --fq2="$INDIR"/"${fasta_array[$k]}".R.fastq"
# fi
# mkdir $fileout.hpg;
# hpg-aligner dna --report-n-best=1 $ADDOPT --cpu-threads=$PROCESSORS -i=$out/$toolidx $fastq -o $fileout.hpg
# awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
# END{
# print "Contig", reads;
# for (i in ctg) totRPK += ctg[i]/ctgsize[i];
# for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' $fileout.hpg > $fileout
# ;;
esac
if [ -f $finalout ]
......@@ -195,9 +192,10 @@ if [ $RECAT == "no" ]; then
# care for character ";" in sequence names ?
# splitting "All_transcript.all" file into files corresponding to samples
for (( j=0; j <i; j++ ))
#for (( j=0; j <i; j++ ))
for ref in "${!fasta_array[@]}"
do
ref=${fasta_array[$j]}
#ref=${fasta_array[$j]}
echo -e "\nCategorization of $ref transcripts"
echo -e `head -n1 $finalout` > $out/$ref".all"
grep "$ref|" $out/All_transcripts.quants >> $out/$ref".all"
......@@ -289,18 +287,18 @@ elif [ $RECAT != "no" ]; then
# preparing re-categorization
i=0
for allfile in $RECAT/*.all ; do
fasta_array[$i]=`basename $allfile .all`
cat $allfile | sed -r 's/(\t[^\t]*){3}$//' > $out/${fasta_array[$i]}".all"
cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".ctgs" $out
cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".fasta_suspect" $out
cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".fasta_mod" $out
cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".suspects" $out
recatfasta_array[$i]=`basename $allfile .all`
cat $allfile | sed -r 's/(\t[^\t]*){3}$//' > $out/${recatfasta_array[$i]}".all"
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".ctgs" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".fasta_suspect" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".fasta_mod" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".suspects" $out
i=$(( i + 1 ))
done
# re-categorizing transcipts (clean, contam, dubious, lowcov, overexp)
for (( j=0; j <i; j++ )); do
ref=${fasta_array[$j]};
ref=${recatfasta_array[$j]};
refseqs=$out/$ref".ctgs"
finalout=$out/$ref".all"
echo -e "Re-categorizing $ref transcripts"
......
# add sample name to all sequence names and build BLASTdb
for (( j=0; j <i; j++ )); do
ref=${fasta_array[$j]}
awk '/^>/{ print $1 } ; /^[^>]/{ print $0 }' < $INDIR/${fasta_array[$j]}.fasta > $out/${fasta_array[$j]}".fasta_mod"
sed -i "s/>/>$ref|/g" $out/${fasta_array[$j]}".fasta_mod"
makeblastdb -in $out/${fasta_array[$j]}".fasta_mod" -parse_seqids -dbtype nucl -out $out/${fasta_array[$j]}".blastdb"
# For every sample (key of fasta_array): prefix each sequence name with the
# sample name and build one nucleotide BLAST database per sample.
for ref in "${!fasta_array[@]}"
do
	# keep only the first word of each header line; sequence lines are copied as-is
	awk '/^>/{ print $1 } ; /^[^>]/{ print $0 }' < "$INDIR/${ref}.fasta" > "$out/${ref}.fasta_mod"
	# turn ">seqname" into ">sample|seqname"
	sed -i "s/>/>$ref|/g" "$out/${ref}.fasta_mod"
	makeblastdb -in "$out/${ref}.fasta_mod" -parse_seqids -dbtype nucl -out "$out/${ref}.blastdb"
done
### initial version of the section :
# "all pairwise BLAST and listing suspects (see $SUSPID and $SUSPLEN)"
echo -e "\n"
for (( j=0; j <i; j++ )); do
ref=${fasta_array[$j]};
# outer loop: each sample (key of fasta_array) is used in turn as the BLAST query
for ref in "${!fasta_array[@]}"
do
suspects=$out/$ref".suspects"
for (( k=0; k <i; k++ )); do
target=${fasta_array[$k]};
# inner loop: each sample is a candidate target; the test below skips the query sample itself
for target in "${!fasta_array[@]}"
do
if [ "$ref" != "$target" ]; then
outblast=$out/$ref"v"$target".outblast"
query=$out/$ref".fasta_mod"
......@@ -38,65 +46,3 @@ done
cat $out/*.suspects > $out/ALL_transcripts.suspects
cat $out/*.fasta_suspect > $out/ALL_transcripts.fasta_suspects
### TENTATIVE DE PARALLELISATION, MAIS IMPOSSIBLE DE WAIT LES PROCESSUS !!!
## all pairwise BLAST and listing suspects (see $SUSPID and $SUSPLEN)
#echo -e "\n"
#for (( j=0; j <i; j++ )); do
# ref=${fasta_array[$j]};
# suspects=$out/$ref".suspects"
# for (( k=0; k <i; k++ )); do
# target=${fasta_array[$k]};
# if [ "$ref" != "$target" ]; then
# echo -e "$ref\t$target" >> blast_job.list
# fi
# done
#done
#
#echo -e "all-vs-all BLAST"
#split --number=l/$PROCESSORS --additional-suffix=blastjob blast_job.list
#for run in x*blastjob; do
# (while IFS=$'\t' read ref target; do
# outblast=$out/$ref"v"$target".outblast"
# query=$out/$ref".fasta_mod"
# db=$out/$target".blastdb"
# echo -e "blastn -num_threads 1 -query $query -db $db -perc_identity $SUSPID -soft_masking true -max_target_seqs 5000 -outfmt \"6 qseqid sseqid evalue pident bitscore qstart qend qlen sstart send slen\" -out $outblast"
#
# blastn -num_threads 1 -query $query -db $db -perc_identity $SUSPID -soft_masking true -max_target_seqs 5000 -outfmt "6 qseqid sseqid evalue pident bitscore qstart qend qlen sstart send slen" -out $outblast
# done) & wait $!
#pids[$run]=$!
#echo -e "computing (processus ${pids[$run]})"
#echo "${pids[$run]}" >> blast_job.pid
#done
#cat blast_job.pid | while read pid; do
# echo -e "waiting for processus $pid..."
# wait $pid
#done
#rm -f x*blastjob blast_job.list blast_job.pid
# PB : "le processus n°12862 n'est pas un fils de ce shell." !!!!!!!!!!!!
#echo -e "\nCounting suspect transcripts"
#for (( j=0; j <i; j++ )); do
# ref=${fasta_array[$j]};
# suspects=$out/$ref".suspects"
# cat $out/$ref"v"*".outblast" > $out/$ref".outblast" ; rm -f $out/$ref"v"*".outblast"
# cat $out/$ref".outblast" | awk -v susplen=$SUSPLEN '{ if($5>=susplen){print $1} }' > $out/$ref".suspects_tmp"
# cat $out/$ref".suspects_tmp" | sort | uniq > $out/$ref".suspects" ; rm -f $suspects"_tmp"
# blastdbcmd -db $out/$ref".blastdb" -entry_batch $out/$ref".suspects" -outfmt %f -line_length 20000 -out $out/$ref".fasta_suspect_tmp"
# cat $out/$ref".fasta_suspect_tmp" | sed 's/lcl|//g' | awk '{if($0 ~ /^>/){print $1} else{print}}' > $out/$ref".fasta_suspect"
# rm -f $out/$ref".fasta_suspect_tmp"
# echo -e "\t$ref\t"`cat $out/$ref".suspects" | wc -l`
#done
......@@ -2,8 +2,9 @@ function printUsage(){
echo -e "\n`basename $0` is a program that can detect potential cross-contaminations in assembled transcriptomes using sequencing reads to find true origin of transcripts.
Usage :
$0 [--mode p|u] [--tool B|B2|K|R|S] [--fold-threshold INT] [--minimum-coverage FLOAT] [--threads INT] [--output-prefix STR] [--output-level 1|2|3] [--graph yes|no] [--trim5 INT] [--trim3 INT] [--frag-length FLOAT] [--frag-sd FLOAT] [--suspect-id INT] [--suspect-len INT] [--add-option STR] [--recat STR]
$0 [--cnf configFile] [--mode p|u] [--tool B|B2|K|R|S] [--fold-threshold INT] [--minimum-coverage FLOAT] [--threads INT] [--output-prefix STR] [--output-level 1|2|3] [--graph yes|no] [--trim5 INT] [--trim3 INT] [--frag-length FLOAT] [--frag-sd FLOAT] [--suspect-id INT] [--suspect-len INT] [--add-option STR] [--recat STR]
--cnf configFile : a text filename containg a liste of contigs assemblies to test and their associated fastq reads files [short: -k]
--mode p|u :\t\t\t'p' for paired and 'u' for unpaired (default : 'p') [short: -m]
--in STR :\t\t\tName of the directory containing the input files to be analyzed (DEFAULT : working directory) [short: -i]
--tool B|K|R :\t\t'B' for bowtie, 'K' for kallisto, 'R' for rapmap (DEFAULT : 'R') [short: -t]
......@@ -27,16 +28,16 @@ It is good practice to redirect information about each CroCo run into an output
'2>&1 | tee log_file'
Minimal working example :
CroCo_v0.1.sh --mode p 2>&1 | tee log_file
CroCo_v0.1.sh --cnf sampleconfig.txt --mode p 2>&1 | tee log_file
Exhaustive example :
CroCo_v0.1.sh --mode p --in data_folder_name --tool R --fold-threshold 2 --minimum-coverage 0.2 --overexp 300 --threads 8 --output-prefix test1_ --output-level 2 --graph yes --add-option '-v 0' --trim5 0 --trim3 0 --suspect-id 95 --suspect-len 40 --recat no 2>&1 | tee log_file
CroCo_v0.1.sh --cnf configFile --mode p --in data_folder_name --tool R --fold-threshold 2 --minimum-coverage 0.2 --overexp 300 --threads 8 --output-prefix test1_ --output-level 2 --graph yes --add-option '-v 0' --trim5 0 --trim3 0 --suspect-id 95 --suspect-len 40 --recat no 2>&1 | tee log_file
Exhaustive example using shortcuts :
CroCo_v0.1.sh -m p -i data_folder_name -t R -f 2 -c 0.2 -d 300 -n 8 -p test1_ -l 2 -g yes -a '-v 0' -x 0 -y 0 -s 95 -w 40 -r no 2>&1 | tee log_file
CroCo_v0.1.sh -k configFile -m p -i data_folder_name -t R -f 2 -c 0.2 -d 300 -n 8 -p test1_ -l 2 -g yes -a '-v 0' -x 0 -y 0 -s 95 -w 40 -r no 2>&1 | tee log_file
Example for re-categorizing previous CroCo results
CroCo_v0.1.sh -i data_folder_name -r previous_CroCo_results_folder_name -f 10 -c 0.5 -g yes 2>&1 | tee log_file
CroCo_v0.1.sh --cnf configFile -i data_folder_name -r previous_CroCo_results_folder_name -f 10 -c 0.5 -g yes 2>&1 | tee log_file
"
}
......@@ -54,7 +55,7 @@ function printAndUsageAndExit(){
number_re='^[0-9]+$'
float_re='^[0-9]+([.][0-9]+)?$'
ARGS=$(getopt -o m:i:f:x:y:c:t:n:p:l:g:a:u:v:s:r:w:d: --long mode:,in:,fold-threshold:,trim5:,trim3:,minimum-coverage:,tool:,threads:,output-prefix:,output-level:,graph:,add-option:,frag-length:,frag-sd:,suspect-id:,recat:,suspect-len:,overexp: -n "$0" -- "$@");
ARGS=$(getopt -o k:m:i:f:x:y:c:t:n:p:l:g:a:u:v:s:r:w:d: --long cnf:,mode:,in:,fold-threshold:,trim5:,trim3:,minimum-coverage:,tool:,threads:,output-prefix:,output-level:,graph:,add-option:,frag-length:,frag-sd:,suspect-id:,recat:,suspect-len:,overexp: -n "$0" -- "$@");
#Bad arguments
if [ $? -ne 0 ] || [ $# -eq 0 ];
......@@ -67,6 +68,15 @@ eval set -- "$ARGS";
while true; do
case "$1" in
-k|--cnf)
	# configuration file listing the assemblies to test and their read files
	shift;
	if [ -n "$1" ]; then
		# fail at parse time instead of at the later "done < $CONFFILE" redirect
		if [ ! -f "$1" ] || [ ! -r "$1" ]; then
			printAndUsageAndExit "Cannot read the configuration file '$1' given with option --cnf"
		fi
		CONFFILE=$1
	else
		printAndUsageAndExit "You have to set a non-empty value for option --cnf to liste the files you want to analyze"
	fi
	shift;
	;;
-m|--mode)
shift;
if [ -n "$1" ]; then
......@@ -186,11 +196,8 @@ while true; do
TOOL=$1
case "$TOOL" in
B) which bowtie > /dev/null; ret=$?; if ((ret!=0)); then printAndUsageAndExit "Could not find bowtie in utils/bin/bowtie-1.1.2/ or in PATH" ;fi ;;
B2) which bowtie2 > /dev/null; ret=$?; if ((ret!=0)); then printAndUsageAndExit "Could not find bowtie2 in utils/bin/bowtie2-2.2.9/ or in PATH" ;fi ;;
K) which kallisto > /dev/null; ret=$?; if ((ret!=0)); then printAndUsageAndExit "Could not find kallisto in utils/bin/kallisto/ or in PATH" ;fi ;;
S) which salmon > /dev/null; ret=$?; if ((ret!=0)); then printAndUsageAndExit "Could not find salmon in utils/bin/salmon/install/bin/ or in PATH" ;fi ;;
R) which rapmap > /dev/null; ret=$?; if ((ret!=0)); then printAndUsageAndExit "Could not find rapmap in utils/bin/rapmap/bin/ or in PATH" ;fi ;;
H) which hpg-aligner > /dev/null; ret=$?; if ((ret!=0)); then printAndUsageAndExit "Could not find hpg-aligner in utils/bin/hpg-aligner/bin/ or in PATH" ;fi ;;
esac
else
printAndUsageAndExit "'$1' is an incorrect value for --tool option (B, K and R accepted)"
......
......@@ -4,10 +4,7 @@ crossScriptDir=$(readlink -f "$0")
crosscontamdir=`dirname $crossScriptDir`
crosscontamtopdir=`dirname $crosscontamdir`
bowtie_path="${crosscontamtopdir}/utils/bin/bowtie-1.1.2"
bowtie2_path="${crosscontamtopdir}/utils/bin/bowtie2-2.2.9"
kallisto_path="${crosscontamtopdir}/utils/bin/kallisto/"
Salmon_path="${crosscontamtopdir}/utils/bin/salmon/install/bin"
RapMap_path="${crosscontamtopdir}/utils/bin/rapmap/bin/"
HPG_path="${crosscontamtopdir}/utils/bin/hpg-aligner/bin/"
BLAST_path="${crosscontamtopdir}/utils/bin/ncbi-blast-2.5.0+/bin"
export PATH=$bowtie_path:$bowtie2_path:$kallisto_path:$Salmon_path:$RapMap_path:$HPG_path:$BLAST_path:$PATH
export PATH=$bowtie_path:$kallisto_path:$RapMap_path:$BLAST_path:$PATH
......@@ -6,6 +6,7 @@ if [ $RECAT != "no" ]; then
echo
echo "SETTINGS"
echo
echo "cnf: $CONFFILE"
echo "in : $INDIR"
echo "fold-threshold : $FOLD"
echo "minimum-coverage : $MINCOV"
......@@ -19,6 +20,7 @@ elif [ $RECAT == "no" ]; then
echo
echo "SETTINGS"
echo
echo "cnf: $CONFFILE"
echo "mode : $MODE"
echo "in : $INDIR"
echo "tool : $TOOL"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment