Commit 8908ebf6 authored by psimion's avatar psimion
Browse files

many changes: correction if 0 contam; trying to use conf_file for input;...

many changes: correction if 0 contam; trying to use conf_file for input; introduction to using vsearch
parent 97541c4d
......@@ -9,18 +9,18 @@
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License (http://www.gnu.org/licenses/) for more details.
# This tool will analyse all the .fasta files listed in a configuration file
# This config file must contain one line for each contig file and its reads in the form:
# contigs1.fasta reads1.R1.fatsq reads1.R2.fastq
# contigs2.fasta reads2.R1.fatsq reads2.R2.fastq
# FOR unpaired :
# This tool will analyse all the *.fasta files listed in a configuration file
# This config file must contain one line for each contig file and its corresponding read files in the following format:
# FOR paired reads
# contigs1.fasta reads1.R1.fastq reads1.R2.fastq
# contigs2.fasta reads2.R1.fastq reads2.R2.fastq
# FOR unpaired reads :
# contigsN.fasta readsN.fatsq
# blank spaces must be tabs
# fasta files and read files must be in the same dir and must follow this naming convention :
# contig files must be named with ".fasta" extension
# if read files are gziped their extension must be ".gz"
my_dir="$(dirname "$0")"
source "$my_dir/path_management.sh"
......@@ -55,34 +55,47 @@ START_TIME=$SECONDS
if [ $RECAT == "no" ]; then
### setting sample names
# i=0
#for fasta in $INDIR/*.fasta ; do
# fasta_array[$i]=`basename $fasta .fasta`
# i=$(( i + 1 ))
# done
declare -A fasta_array
while IFS=$'\t' read fastaFile R1 R2
do
fastaName=`basename $fastaFile .fasta`
fasta_array[$fastaName]=$fastaFile";"$R1";"$R2
InfosCtg=(${fasta_array[$fastaName]//;/ }) #c'est un tableau
len=${#InfosCtg[@]}
if [ $len -gt 2 ]; then
parity="Paired"
else
parity="unpaired"
fi
echo $fastaName ": with " $parity "reads"
for r in $(seq 0 $len)
do
echo ${InfosCtg[$r]}
done
#for i in "${InfosCtg[@]}"; do echo $i; done;
done < $CONFFILE
if [ -f $CONFFILE ]; then
declare -A fasta_array
declare -a orders;
while IFS=$'\t' read fastaFile R1 R2
do
fastaName=`basename $fastaFile .fasta`
fasta_array[$fastaName]=$fastaFile";"$R1";"$R2
orders+=( $fastaName ) #this will keep fastaFileNames in the same order as in config file
InfosCtg=(${fasta_array[$fastaName]//;/ }) #c'est un tableau
len=${#InfosCtg[@]}
if [ $len -gt 2 ]; then
parity="Paired"
else
parity="unpaired"
fi
echo $fastaName ": with "$parity" reads"
for r in $(seq 0 $len)
do
echo ${InfosCtg[$r]}
done
done < $CONFFILE
else
print "Error - cannot find the following input file : '$CONFFILE'"
exit 1
fi
### TESTS
echo "fasta_array:"
echo "${!fasta_array[@]}"
echo
echo "orders:"
echo "${orders[@]}"
echo
###
### detecting suspect transcripts with BLAST
#source "$my_dir/detect_suspect_trans_with_blast.sh"
source "$my_dir/detect_suspect_trans_with_blast.sh"
### intermediate timing
......@@ -103,9 +116,8 @@ done < $CONFFILE
echo -e "\nIndex built : $out/$toolidx\n"
#for (( j=0; j <i; j++ ))
for ref in "${!fasta_array[@]}"
for ref in "${orders[@]}"
do
#ref=${fasta_array[$j]};
refseqs=$out/$ref".ctgs"
echo -e "Getting length of $ref transcripts"
awk -v ref=$ref -v refseqs=$refseqs 'BEGIN{RS=">"; FS="\t"}
......@@ -123,7 +135,7 @@ done < $CONFFILE
# mapping successively every read sets on all transcripts.
#for (( k=0; k <i; k++ ))
for ref in "${!fasta_array[@]}"
for ref in "${orders[@]}"
do
InfosCtg=${fasta_array[$ref]}
#reads=${fasta_array[$k]}"_reads"
......@@ -133,6 +145,7 @@ done < $CONFFILE
finalout=$out/"ALL_transcripts.all"
#echo -e "\nMapping ${fasta_array[$k]} reads"
echo -e "\nMapping ${ref} reads"
echo -e "\nInfo Contigs :\t$InfosCtg\nOutfile :\t$fileout\n"
# calculate expression level with selected tool
case "$TOOL" in
......@@ -142,7 +155,7 @@ done < $CONFFILE
fastq=$INDIR"/"${InfosCtg[1]}
else
#fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fastq=" -1 "$INDIR"/"${InfosCtg[1]}" -2 "$INDIR"/"${InfosCtg[1]}
fastq=" -1 "$INDIR"/"${InfosCtg[1]}" -2 "$INDIR"/"${InfosCtg[2]}
fi
bowtie -p $PROCESSORS $ADDOPT -a --trim5 $TRIM5 --trim3 $TRIM3 --chunkmbs 2000 --suppress 1,2,4,5,6,7,8 $out/$toolidx/$toolidx $fastq | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
......@@ -162,7 +175,7 @@ done < $CONFFILE
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${InfosCtg[1]}
fi
else
#fastq=$INDIR"/"${fasta_array[$k]}".L.fastq "$INDIR"/"${fasta_array[$k]}".R.fastq"
#fastq=$INDIR"/"${fasta_array[$k]}".L.fastq.gz "$INDIR"/"${fasta_array[$k]}".R.fastq.gz"
fastq=$INDIR"/"${InfosCtg[1]}" "$INDIR"/"${InfosCtg[2]}
fi
kallisto quant $ADDOPT --threads=$PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
......@@ -203,7 +216,7 @@ done < $CONFFILE
# splitting "All_transcript.all" file into files corresponding to samples
#for (( j=0; j <i; j++ ))
for ref in "${!fasta_array[@]}"
for ref in "${orders[@]}"
do
#ref=${fasta_array[$j]}
echo -e "\nCategorization of $ref transcripts"
......
#!/bin/bash
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License (http://www.gnu.org/licenses/) for more details.
# This tool will analyse all the *.fasta files listed in a configuration file
# This config file must contain one line for each contig file and its corresponding read files in the following format:
# FOR paired reads
# contigs1.fasta reads1.R1.fastq reads1.R2.fastq
# contigs2.fasta reads2.R1.fastq reads2.R2.fastq
# FOR unpaired reads :
# contigsN.fasta readsN.fastq
# blank spaces must be tabs
# fasta files and read files must be in the same dir and must follow this naming convention :
# contig files must be named with ".fasta" extension
# if read files are gzipped their extension must be ".gz"
# Resolve the directory this script lives in, so sibling helper scripts can be sourced.
my_dir="$(dirname "$0")"
source "$my_dir/path_management.sh"
source "$my_dir/MacLin_management.sh"
############################ DEFAULT VALUES ############################
# def_values.sh sets TOOL, INDIR, FOLD, MINCOV, RECAT, etc. (see def_values.sh)
source "$my_dir/def_values.sh"
############################ CROCO USAGE ############################
# parse_params.sh overrides the defaults from the command line (incl. $CONFFILE)
source "$my_dir/parse_params.sh"
# summary of settings
source "$my_dir/print_settings.sh"
# setting output directory
# RECAT == "no"  -> normal run: encode the main parameters + timestamp in the dir name
# RECAT != "yes" -> RECAT holds a previous CroCo output dir to re-categorize
# NOTE(review): date +%I is the 12-hour clock; two runs 12 h apart could collide — consider %H
if [ $RECAT == "no" ]; then
out="$INDIR/${OUTPUTPREFIX}CroCo-"$TOOL"-id"$SUSPID"-len"$SUSPLEN"-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
elif [ $RECAT != "yes" ]; then
out=$INDIR/${OUTPUTPREFIX}CroCo"_RECAT-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
fi
############################ PROGRAM BEGINS HERE ############################
# Record start time (bash SECONDS builtin) for the final timing report
START_TIME=$SECONDS
### run BLAST, mapping, and quantification if recat function is off (default)
if [ $RECAT == "no" ]; then
### setting sample names
if [ -f $CONFFILE ]; then
	# Parse the tab-separated config file: one line per sample,
	#   fastaFile <TAB> R1 [<TAB> R2]
	# fasta_array maps sample name -> "fasta;R1;R2", orders keeps the
	# config-file order (associative arrays are unordered in bash).
	declare -A fasta_array
	declare -a orders;
	while IFS=$'\t' read fastaFile R1 R2
	do
		fastaName=`basename $fastaFile .fasta`
		fasta_array[$fastaName]=$fastaFile";"$R1";"$R2
		orders+=( $fastaName ) #this will keep fastaFileNames in the same order as in config file
		InfosCtg=(${fasta_array[$fastaName]//;/ }) # split "fasta;R1;R2" back into an indexed array
		len=${#InfosCtg[@]}
		# 3 fields (fasta + R1 + R2) => paired reads, 2 fields => unpaired
		if [ $len -gt 2 ]; then
			parity="Paired"
		else
			parity="unpaired"
		fi
		echo $fastaName ": with "$parity" reads"
		# BUGFIX: was 'seq 0 $len', which overran the array by one element
		# (indices go 0..len-1) and printed a spurious blank line per sample.
		for r in "${InfosCtg[@]}"
		do
			echo $r
		done
	done < $CONFFILE
else
	# BUGFIX: 'print' is not a bash builtin (it silently failed here);
	# report the error on stderr and abort.
	echo "Error - cannot find the following input file : '$CONFFILE'" >&2
	exit 1
fi
### TESTS
# Debug output left in by the author: dump the parsed sample names.
# TODO(review): remove or guard behind a verbosity flag before release.
echo "fasta_array:"
echo "${!fasta_array[@]}"
echo
echo "orders:"
echo "${orders[@]}"
echo
###
### detecting suspect transcripts with BLAST
# The sourced script runs all-vs-all BLAST and writes per-sample *.suspects,
# *.fasta_mod and ALL_transcripts.fasta into $out (see detect_suspect_trans_with_blast.sh).
#source "$my_dir/detect_suspect_trans_with_blast.sh"
source "$my_dir/detect_suspect_trans_with_blast.sh"
### intermediate timing
ELAPSED_TIME=$(($SECONDS - $START_TIME))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nExecution time for detecting suspect transcripts : $h h $m m $s s"
echo
# Restart the clock for the mapping stage
START_TIME2=$SECONDS
### index contigs for the selected tool
# B = bowtie, K = kallisto, R = rapmap; each branch skips indexing if the
# index already exists (useful when resuming a run in the same $out).
case "$TOOL" in
B) toolidx="ALL_transcripts_bowtie_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;;
K) toolidx="ALL_transcripts_kallisto_index"; if [ ! -f $out/$toolidx ]; then kallisto index -i $out/$toolidx $out/ALL_transcripts.fasta; fi ;;
R) toolidx="ALL_transcripts_rapmap_index" ; if [ ! -d $out/$toolidx ]; then rapmap quasiindex -t $out/ALL_transcripts.fasta -p -x $PROCESSORS -i $out/$toolidx; fi ;;
esac
echo -e "\nIndex built : $out/$toolidx\n"
#for (( j=0; j <i; j++ ))
# For each sample, extract "seq_id <TAB> length" for every transcript into $ref.ctgs.
# The awk reads FASTA records (RS=">"), collapses the sequence onto one line,
# keeps only the id up to the first space, and writes id+length to $refseqs.
for ref in "${orders[@]}"
do
refseqs=$out/$ref".ctgs"
echo -e "Getting length of $ref transcripts"
awk -v ref=$ref -v refseqs=$refseqs 'BEGIN{RS=">"; FS="\t"}
NR>1 {
sub("\n","\t",$0);
gsub("\n","",$0);
ident =split($1,a," ") # N.B the seq idents (supr all text after the first space)
Seqlength = length($2) ;
print a[1]"\t"Seqlength > refseqs
}' $out/$ref".fasta_mod"
done
# regroup all samples
# Concatenate every per-sample .ctgs file into one id->length table for the mappers.
refseqALL=$out/ALL_transcripts.ctgs
cat $out/*.ctgs > $refseqALL
# mapping successively every read sets on all transcripts.
# For each sample, map its reads against the pooled ALL_transcripts index and
# produce a per-sample TPM column ($fileout), then paste it onto the growing
# quantification table ($finalout): one column per sample, config-file order.
#for (( k=0; k <i; k++ ))
for ref in "${orders[@]}"
do
# BUGFIX: split "fasta;R1;R2" into an indexed array. The previous scalar
# assignment (InfosCtg=${fasta_array[$ref]}) only overwrote InfosCtg[0] of the
# array left over from the config-parsing loop, so InfosCtg[1]/InfosCtg[2]
# still held the LAST config line's read files for every sample.
InfosCtg=(${fasta_array[$ref]//;/ })
#reads=${fasta_array[$k]}"_reads"
#fileout=$out/${fasta_array[$k]}"_vs_ALL.out"
reads=${ref}"_reads"
fileout=$out/${ref}"_vs_ALL.out"
finalout=$out/"ALL_transcripts.all"
#echo -e "\nMapping ${fasta_array[$k]} reads"
echo -e "\nMapping ${ref} reads"
echo -e "\nInfo Contigs :\t${fasta_array[$ref]}\nOutfile :\t$fileout\n"
# calculate expression level with selected tool
# InfosCtg[1] = R1 (or single-end) reads, InfosCtg[2] = R2 reads
case "$TOOL" in
B) if [ $MODE == "u" ]
then
#fastq=$INDIR"/"${fasta_array[$k]}".fastq"
fastq=$INDIR"/"${InfosCtg[1]}
else
#fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fastq=" -1 "$INDIR"/"${InfosCtg[1]}" -2 "$INDIR"/"${InfosCtg[2]}
fi
# bowtie emits only the reference name per alignment (--suppress); the awk
# counts hits per contig and converts counts to TPM using contig lengths.
bowtie -p $PROCESSORS $ADDOPT -a --trim5 $TRIM5 --trim3 $TRIM3 --chunkmbs 2000 --suppress 1,2,4,5,6,7,8 $out/$toolidx/$toolidx $fastq | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
K) if [ $MODE == "u" ]
then
if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
echo -e "\nWarning : When using unpaired data with Kallisto, you need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
#fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${InfosCtg[1]}
else
#fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${InfosCtg[1]}
fi
else
#fastq=$INDIR"/"${fasta_array[$k]}".L.fastq.gz "$INDIR"/"${fasta_array[$k]}".R.fastq.gz"
fastq=$INDIR"/"${InfosCtg[1]}" "$INDIR"/"${InfosCtg[2]}
fi
# kallisto writes abundance.tsv; column 5 is TPM, extracted by the awk below
kallisto quant $ADDOPT --threads=$PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) }
{ if(NR>1) ctg[$1] = $5 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/abundance.tsv > $fileout
;;
R) if [ $MODE == "u" ]
then
#fastq=" -r "$INDIR"/"${fasta_array[$k]}".fastq"
fastq=" -r "$INDIR"/"${InfosCtg[1]}
else
#fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fastq=" -1 "$INDIR"/"${InfosCtg[1]}" -2 "$INDIR"/"${InfosCtg[2]}
fi
# rapmap emits SAM; keep the reference-name column (3) and count as for bowtie
rapmap quasimap -t $PROCESSORS $ADDOPT -i $out/$toolidx $fastq | grep -v "@" | cut -f3 | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
esac
# Append this sample's column to the cumulative table, dropping the duplicated
# "Contig" id column contributed by $fileout (NF-1 is skipped by the awk).
if [ -f $finalout ]
then
#join -t $'\t' --header $finalout $fileout > $finalout'.tmp'
paste $finalout $fileout | awk -F 'FS' 'BEGIN{FS="\t"}{for (i=1; i<=NF-1; i++) if(i!=NF-1) {printf $i FS};{print $NF}}' > $finalout'.tmp'
mv $finalout'.tmp' $finalout
else
cat $fileout > $finalout
fi
cp $finalout $out/All_transcripts.quants
done
# analyse mapping/count results: categorizing transcripts (clean, contam, dubious, lowcov, overexp)
# care for character ";" in sequence names ?
# splitting "All_transcript.all" file into files corresponding to samples
#for (( j=0; j <i; j++ ))
# BUGFIX: the awk below receives -v col=$j, but $j was left unset when this
# loop was rewritten from an indexed loop to 'for ref in "${orders[@]}"', so
# every sample was categorized against column 2. Track the 0-based sample
# index explicitly; sample k's TPM column is k+2 (col=col+2 inside the awk),
# which matches the column order built by the mapping loop over "${orders[@]}".
j=0
for ref in "${orders[@]}"
do
echo -e "\nCategorization of $ref transcripts"
# header line + this sample's rows (sequence names are prefixed "$ref|")
echo -e `head -n1 $finalout` > $out/$ref".all"
grep "$ref|" $out/All_transcripts.quants >> $out/$ref".all"
#readarray -t suspects < $out/$ref".suspects"
#listeSuspects=$( IFS=';'; echo "${suspects[*]}" );
# For each transcript, compare its own-sample TPM (refCount) to every other
# sample's TPM and assign a status: contam / dubious / lowcov / overexp / clean.
awk -v outDir=$out -v ref=$ref -v col=$j -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;}
{ if (NR == 1) {
print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp"
}
else {
refCount = $col; ok="clean"; maxcov=0; nb_overexp=0;
for (i=2; i<= NF; i++) {
if (i != col) {
if ($i >= overexp) nb_overexp++
if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam"
else {
if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious"
}
if ( $i > maxcov ) maxcov= $i
}
}
if (maxcov < mincov && refCount < mincov ) ok="lowcov";
if (refCount == 0) refCount=0.0001
if (nb_overexp >= 3) ok="overexp";
print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp"
print $0 > outDir"/"ref"."ok
}
}' $out/$ref".all"
mv $out"/"$ref".tmp" $out/$ref".all"
# filtering out unsuspected transcripts
echo -e `head -n1 $finalout`"\tMaxOtherSpCov\tlog2FoldChange\tStatus" > $out/$ref".all_suspectonly"
nb=`cat $out/$ref".suspects" | wc -l`
LC_ALL=C grep -F -w -m$nb -f $out/$ref.suspects $out/$ref.all >> $out/$ref".all_suspectonly"
mv $out/$ref".all" $out/$ref".all_quants" ; mv $out/$ref".all_suspectonly" $out/$ref".all"
sed -i 's/ /\t/g' $out/$ref".all"
j=$(( j + 1 ))
done
# http://www.inmotionhosting.com/support/website/ssh/speed-up-grep-searches-with-lc-all
# time grep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_1 > grep.txt & # 533,85s user 0,14s system 99% cpu 8:54,47 total
# time LC_ALL=C grep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_2 > LCgrep.txt & # 81,85s user 1,12s system 99% cpu 1:23,04 total
# time LC_ALL=C fgrep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_3 > LCfgrep.txt & # 0,02s user 0,01s system 98% cpu 0,028 total
# time grep -F -w -m5000 -f Contaminated.suspects Contaminated.all_quants_4 > grepF.txt & # 0,05s user 0,01s system 99% cpu 0,058 total
# time LC_ALL=C grep -F -w -m5000 -f Contaminated.suspects Contaminated.all_quants_5 > LCgrepF.txt & # 0,02s user 0,01s system 98% cpu 0,025 total
# time fgrep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_5 > fgrep.txt & # 0,05s user 0,01s system 99% cpu 0,055 total
## previous (and better) categorization version with suspect filtering inside awk
# awk -v outDir=$out -v ref=$ref -v col=$j -v list=$listeSuspects -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;}
# { if (NR == 1) {
# print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp"
# split(list, suspect, ";")
# for (i=1;i<=length(suspect);i++) transcripts[suspect[i]]++
# }
# else{
# refCount = $col; ok="clean"; maxcov=0; nb_overexp=0;
# if ($1 in transcripts) {
# for (i=2; i<= NF; i++) {
# if (i != col) {
# if ($i >= overexp) nb_overexp++
# if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam"
# else {
# if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious"
# }
# if ( $i > maxcov ) maxcov= $i
# }
# }
# if (maxcov < mincov && refCount < mincov ) ok="lowCov";
# if (refCount == 0) refCount=0.0001
# if (nb_overexp >= 3) ok="overexp";
# print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp"
# print $0 > outDir"/"ref"."ok
# }
# }
# }' $out/$ref".all"
### intermediate timing: report wall-clock time spent on the mapping/categorization stage
ELAPSED_TIME=$(( SECONDS - START_TIME2 ))
h=$(( ELAPSED_TIME / 3600 ))
m=$(( (ELAPSED_TIME % 3600) / 60 ))
s=$(( ELAPSED_TIME % 60 ))
echo -e "\nExecution time for all mappings : $h h $m m $s s"
### RE-CATEGORIZATION ONLY
elif [ $RECAT != "no" ]; then
# preparing re-categorization
# RECAT holds a previous CroCo output dir: reuse its *.all quantification
# tables and utility files instead of re-running BLAST and the mappers.
i=0
for allfile in $RECAT/*.all ; do
recatfasta_array[$i]=`basename $allfile .all`
# strip the last 3 columns (MaxOtherSpCov, log2FoldChange, Status) added by the previous run
cat $allfile | sed -r 's/(\t[^\t]*){3}$//' > $out/${recatfasta_array[$i]}".all"
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".ctgs" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".fasta_suspect" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".fasta_mod" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".suspects" $out
i=$(( i + 1 ))
done
# re-categorizing transcipts (clean, contam, dubious, lowcov, overexp)
# Same categorization awk as the main branch, but with the (possibly new)
# FOLD/MINCOV/OVEREXP thresholds; col=$j+2 selects this sample's TPM column.
# NOTE(review): assumes *.all files glob in the same order as the original
# run's columns — appears to hold since both derive from the sample names.
for (( j=0; j <i; j++ )); do
ref=${recatfasta_array[$j]};
refseqs=$out/$ref".ctgs"
finalout=$out/$ref".all"
echo -e "Re-categorizing $ref transcripts"
awk -v outDir=$out -v ref=$ref -v col=$j -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;}
{ if (NR == 1) {
print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp"
}
else{
refCount = $col; ok="clean";
maxcov=0; #valeur max des couvertures des autres reads
nb_overexp=0
for (i=2; i<= NF; i++) {
if (i != col) {
if ($i >= overexp) nb_overexp++
if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam"
else {
if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious"
}
if ( $i > maxcov ) maxcov= $i
}
}
if (maxcov < mincov && refCount < mincov ) ok="lowcov";
if (refCount == 0) refCount=0.0001
if (nb_overexp >= 3) ok="overexp";
print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp"
print $0 > outDir"/"ref"."ok
}
}' $finalout
mv $out"/"$ref".tmp" $finalout
done
fi
### OUTPUT : basic statistics (cross_contamination_summary and cross_contamination_profiles files)
source "$my_dir/output_stats.sh"
### OUTPUT : fasta files sorted by categories (clean, contam, dubious, lowcov, overexp)
source "$my_dir/output_fasta.sh"
### OUTPUT : html file (detailed results for all transcripts)
#source "$my_dir/output_html.sh"
### OUTPUT : network files - step 1 ("LINKS.csv_exhaustive" and "nodes_detailed.csv")
source "$my_dir/output_network_basefiles_contam.sh"
### OUTPUT : network files - step 2 ("LINKS_exhaustive_dubious.csv" and "nodes_detailed_dubious.csv" -- using only dubious)
source "$my_dir/output_network_basefiles_dubious.sh"
### OUTPUT : network files - step 3 ("LINKS.csv")
#echo -e "\tLINKS_gephi.csv\n\tLINKS_diagrammer.tsv\n\tLINKS_gephi_dubious.csv\n\tLINKS_diagrammer_dubious.tsv"
source "$my_dir/output_network_LINKS_files.sh"
### OUTPUT : network files - step 4 ("LINKS_gephi_simplified.csv", "LINKS_diagrammer_simplified.tsv")
#echo -e "\tLINKS_gephi_simplified.csv\n\tLINKS_diagrammer_simplified.tsv"
#echo -e "\tLINKS_gephi_dubious_simplified.csv\n\tLINKS_diagrammer_dubious_simplified.tsv"
source "$my_dir/output_network_LINKS_simplified_files.sh"
### OUTPUT : network files - step 5 (various 'nodes' files)
#echo -e "\tNODES_gephi_absolute.csv\n\tNODES_gephi_norm.csv\n\tNODES_diagrammer.tsv\n\tNODES_gephi_dubious_absolute.csv\n\tNODES_gephi_dubious_norm.csv\n\tNODES_diagrammer_dubious.tsv"
source "$my_dir/output_network_NODES_files.sh"
### miscellaneous
# NOTE(review): the relative paths below (and the 'cd ../' later) assume the
# current directory is $out at this point — presumably one of the sourced
# output scripts cd's into it; confirm before reordering these sections.
mkdir network_info
mv LINKS_* NODES_* nodes_detailed*.csv network_info
### GRAPHICAL OUTPUT : network visualization ("network.html")
if [ $GRAPH == "no" ]; then
echo -e "\nNo graphical output will be written (graph parameter value set to 'no')"
elif [ $GRAPH == "yes" ] ; then
echo -e "\nRendering graphical visualizations of cross contamination network\n\tnetwork_complete.html\n\tnetwork_simplified.html\n\tnetwork_dubious_complete.html\n\tnetwork_dubious_simplified.html"
R --quiet --vanilla < $crosscontamdir/visNetwork.R 2>&1 >/dev/null
fi
### miscellaneous
# Tidy intermediate files into a subdirectory (normal runs only)
if [ $RECAT == "no" ]; then
mkdir utility_files_CroCo
mv *.outblast *.suspects *.blastdb.* *_index *.ctgs *.out *.all_quants *.fasta_mod *.fasta_suspect utility_files_CroCo/
fi
cd ../
### final timing
ELAPSED_TIME=$(($SECONDS - $START_TIME))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nAll results are in $out\n\nCroCo run finished (total run time : $h h $m m $s s)"
TOOL=R # B for Bowtie, B2 for bowtie2, K for kallisto or S for Salmon
TOOL=R # B for Bowtie, K for kallisto or S for Salmon
INDIR=$(pwd) # name of the input directory
FOLD=2 # how many fold enrichment of correct source over others is required to keep
MINCOV=0.2 # min coverage of contig by "own's" read to
......
TOOL=R # B for Bowtie, K for kallisto, R for RapMap (the options handled by the main script's case statements)
INDIR=$(pwd) # name of the input directory
FOLD=2 # how many fold enrichment of correct source over others is required to keep
MINCOV=0.2 # min coverage (TPM); below this in all samples a transcript is flagged "lowcov"
PROCESSORS=1 # number of threads for running the tool
OUTPUTPREFIX="" # prefix of output directory that will be created
OUTPUTLEVEL=2 # "verbosity" level for fasta outputs
GRAPH=no # whether to produce a graphical output of the crosscontamination network
ADDOPT="" # string that will be added to the mapper command line
TRIM5=0 # number of bases to trim from read's 5'
TRIM3=0 # number of bases to trim from read's 3'
FRAGLENGTH=none # average fragment length
FRAGSD=none # standard deviation of fragment length
SUSPID=95 # blast minimum id% for being a potential suspect of cross contamination
SUSPLEN=40 # blast minimum alignment length for being a potential suspect of cross contamination
OVEREXP=300 # expression level (TPM) above which a transcript might be considered "overexpressed"
RECAT=no # previous CroCo output directory to be used to re-categorize transcripts
# add sample name to all sequence names and build BLASTdb
#for (( j=0; j <i; j++ )); do
for ref in "${!fasta_array[@]}"
for ref in "${orders[@]}"; do
#ref=${fasta_array[$j]}
#awk '/^>/{ print $1 } ; /^[^>]/{ print $0 }' < $INDIR/${fasta_array[$j]}.fasta > $out/${fasta_array[$j]}".fasta_mod"
#sed -i "s/>/>$ref|/g" $out/${fasta_array[$j]}".fasta_mod"
......@@ -16,12 +16,12 @@ done
# "all pairwise BLAST and listing suspects (see $SUSPID and $SUSPLEN)"
echo -e "\n"
#for (( j=0; j <i; j++ )); do
for ref in "${!fasta_array[@]}"
for ref in "${orders[@]}"; do
#ref=${fasta_array[$j]};
suspects=$out/$ref".suspects"
#for (( k=0; k <i; k++ )); do
for target in "${!fasta_array[@]}"
for target in "${orders[@]}"; do
#target=${fasta_array[$k]};
if [ "$ref" != "$target" ]; then
outblast=$out/$ref"v"$target".outblast"
......
# add sample name to all sequence names and build BLASTdb
#for (( j=0; j <i; j++ )); do
for ref in "${orders[@]}"; do
#ref=${fasta_array[$j]}
#awk '/^>/{ print $1 } ; /^[^>]/{ print $0 }' < $INDIR/${fasta_array[$j]}.fasta > $out/${fasta_array[$j]}".fasta_mod"