Commit 6a48544b authored by khalid's avatar khalid
Browse files

Initial commit

parents
This diff is collapsed.
File added
#!/bin/bash
# This tool will analyse all the .fasta files in the current directory
# The reads files must be in the same dir and must follow this naming convention
# FOR paired end: NAME.fasta, NAME.L.fastq, NAME.R.fastq
# FOR unpaired : NAME.fasta, NAME.fastq
my_dir="$(dirname "$0")"
source "$my_dir/path_management.sh"
source "$my_dir/MacLin_management.sh"
############################ DEFAULT VALUES ############################
source "$my_dir/def_values.sh"
############################ CROCO USAGE ############################
source "$my_dir/parse_params.sh"
# summary of settings
source "$my_dir/print_settings.sh"
# setting output directory
if [ $RECAT == "no" ]; then
out="$INDIR/${OUTPUTPREFIX}CroCo-"$TOOL"-id"$SUSPID"-len"$SUSPLEN"-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
elif [ $RECAT != "yes" ]; then
out=$INDIR/${OUTPUTPREFIX}CroCo"_RECAT-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
fi
############################ PROGRAM BEGINS HERE ############################
START_TIME=$SECONDS
### run BLAST, mapping, and quantification if recat function is off (default)
if [ $RECAT == "no" ]; then
### setting sample names
i=0
for fasta in $INDIR/*.fasta ; do
fasta_array[$i]=`basename $fasta .fasta`
i=$(( i + 1 ))
done
### detecting suspect transcripts with BLAST
source "$my_dir/detect_suspect_trans_with_blast.sh"
### intermediate timing
ELAPSED_TIME=$(($SECONDS - $START_TIME))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nExecution time for detecting suspect transcripts : $h h $m m $s s"
echo
START_TIME2=$SECONDS
### index contigs for the selected tool
echo -e "\nBuilding index : $out/$toolidx\n"
case "$TOOL" in
B) toolidx="ALL_transcripts_bowtie_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;;
B2) toolidx="ALL_transcripts_bowtie2_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie2-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;;
K) toolidx="ALL_transcripts_kallisto_index"; if [ ! -f $out/$toolidx ]; then kallisto index -i $out/$toolidx $out/ALL_transcripts.fasta; fi ;;
S) toolidx="ALL_transcripts_salmon_index" ; if [ ! -d $out/$toolidx ]; then salmon --no-version-check index -t $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;;
R) toolidx="ALL_transcripts_rapmap_index" ; if [ ! -d $out/$toolidx ]; then rapmap quasiindex -t $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;;
H) toolidx="ALL_transcripts_hpg_index" ; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; hpg-aligner build-sa-index -g $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;;
esac
for (( j=0; j <i; j++ ))
do
ref=${fasta_array[$j]};
refseqs=$out/$ref".ctgs"
echo -e "Getting length of $ref transcripts"
awk -v ref=$ref -v refseqs=$refseqs 'BEGIN{RS=">"; FS="\t"}
NR>1 {
sub("\n","\t",$0);
gsub("\n","",$0);
ident =split($1,a," ") # N.B the seq idents (supr all text after the first space)
Seqlength = length($2) ;
print a[1]"\t"Seqlength > refseqs
}' $out/$ref".fasta_mod"
done
# regroup all samples
refseqALL=$out/ALL_transcripts.ctgs
cat $out/*.ctgs > $refseqALL
# mapping successively every read sets on all transcripts.
for (( k=0; k <i; k++ ))
do
reads=${fasta_array[$k]}"_reads"
fileout=$out/${fasta_array[$k]}"_vs_ALL.out"
finalout=$out/"ALL_transcripts.all"
echo -e "\nMapping ${fasta_array[$k]} reads"
# calculate expression level with selected tool
case "$TOOL" in
B) if [ $MODE == "u" ]
then
fastq=$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
bowtie -p $PROCESSORS $ADDOPT -a --trim5 $TRIM5 --trim3 $TRIM3 --suppress 1,2,4,5,6,7,8 $out/$toolidx/$toolidx $fastq > $out/${fasta_array[$k]}".mapping"
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' < $out/${fasta_array[$k]}".mapping" > $fileout
;;
B2) if [ $MODE == "u" ]
then
fastq=" -U "$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
bowtie2 -p $PROCESSORS --no-unal --no-head $ADDOPT -a --quiet --omit-sec-seq --trim5 $TRIM5 --trim3 $TRIM3 -x $out/$toolidx/$toolidx $fastq | grep -v "@" | cut -f3 | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
K) if [ $MODE == "u" ]
then
if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
echo -e "\nWarning : When using unpaired data with Kallisto, you need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
fi
else
fastq=$INDIR"/"${fasta_array[$k]}".L.fastq "$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
kallisto quant $ADDOPT --threads=$PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) }
{ if(NR>1) ctg[$1] = $5 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/abundance.tsv > $fileout
;;
S) if [ $MODE == "u" ]
then
if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
echo -e "\nWarning : When using unpaired data with Salmon, you might need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
fastq=" --fldMean $FRAGLENGTH --fldSD $FRAGSD -l U -r "$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" --fldMean $FRAGLENGTH --fldSD $FRAGSD -l U -r "$INDIR"/"${fasta_array[$k]}".fastq"
fi
else
fastq=" -l IU -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq" #see http://salmon.readthedocs.org/en/latest/library_type.html#fraglibtype
fi
salmon --no-version-check quant $ADDOPT --threads $PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) }
{ if(NR > 11) ctg[$1] = $3 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/quant.sf > $fileout
;;
R) if [ $MODE == "u" ]
then
fastq=" -r "$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
rapmap quasimap -t $PROCESSORS $ADDOPT -i $out/$toolidx $fastq | grep -v "@" | cut -f3 | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
H) if [ $MODE == "u" ]
then
fastq=" -fq="$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" --fq="$INDIR"/"${fasta_array[$k]}".L.fastq --fq2="$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
mkdir $fileout.hpg;
hpg-aligner dna --report-n-best=1 $ADDOPT --cpu-threads=$PROCESSORS -i=$out/$toolidx $fastq -o $fileout.hpg
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' $fileout.hpg > $fileout
;;
esac
if [ -f $finalout ]
then
#join -t $'\t' --header $finalout $fileout > $finalout'.tmp'
paste $finalout $fileout | awk -F 'FS' 'BEGIN{FS="\t"}{for (i=1; i<=NF-1; i++) if(i!=NF-1) {printf $i FS};{print $NF}}' > $finalout'.tmp'
# => K = bon ordre pour tout le monde
# => B = les *.out sont dans le bon ordre
mv $finalout'.tmp' $finalout
else
cat $fileout > $finalout
fi
cp $finalout $out/All_transcripts.quants
done
# analyse mapping/count results: categorizing transcipts (clean, contam, dubious, lowcov, overexp)
# care for character ";" in sequence names ?
# splitting "All_transcript.all" file into files corresponding to samples
for (( j=0; j <i; j++ ))
do
ref=${fasta_array[$j]}
echo -e "\nCategorization of $ref transcripts"
echo -e `head -n1 $finalout` > $out/$ref".all"
grep "$ref|" $out/All_transcripts.quants >> $out/$ref".all"
#readarray -t suspects < $out/$ref".suspects"
#listeSuspects=$( IFS=';'; echo "${suspects[*]}" );
awk -v outDir=$out -v ref=$ref -v col=$j -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;}
{ if (NR == 1) {
print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp"
}
else {
refCount = $col; ok="clean"; maxcov=0; nb_overexp=0;
for (i=2; i<= NF; i++) {
if (i != col) {
if ($i >= overexp) nb_overexp++
if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam"
else {
if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious"
}
if ( $i > maxcov ) maxcov= $i
}
}
if (maxcov < mincov && refCount < mincov ) ok="lowcov";
if (refCount == 0) refCount=0.0001
if (nb_overexp >= 3) ok="overexp";
print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp"
print $0 > outDir"/"ref"."ok
}
}' $out/$ref".all"
mv $out"/"$ref".tmp" $out/$ref".all"
# filtering out unsuspected transcripts
echo -e `head -n1 $finalout`"\tMaxOtherSpCov\tlog2FoldChange\tStatus" > $out/$ref".all_suspectonly"
nb=`cat $out/$ref".suspects" | wc -l`
LC_ALL=C grep -F -w -m$nb -f $out/$ref.suspects $out/$ref.all >> $out/$ref".all_suspectonly"
mv $out/$ref".all" $out/$ref".all_quants" ; mv $out/$ref".all_suspectonly" $out/$ref".all"
sed -i 's/ /\t/g' $out/$ref".all"
done
# http://www.inmotionhosting.com/support/website/ssh/speed-up-grep-searches-with-lc-all
# time grep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_1 > grep.txt & # 533,85s user 0,14s system 99% cpu 8:54,47 total
# time LC_ALL=C grep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_2 > LCgrep.txt & # 81,85s user 1,12s system 99% cpu 1:23,04 total
# time LC_ALL=C fgrep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_3 > LCfgrep.txt & # 0,02s user 0,01s system 98% cpu 0,028 total
# time grep -F -w -m5000 -f Contaminated.suspects Contaminated.all_quants_4 > grepF.txt & # 0,05s user 0,01s system 99% cpu 0,058 total
# time LC_ALL=C grep -F -w -m5000 -f Contaminated.suspects Contaminated.all_quants_5 > LCgrepF.txt & # 0,02s user 0,01s system 98% cpu 0,025 total
# time fgrep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_5 > fgrep.txt & # 0,05s user 0,01s system 99% cpu 0,055 total
## previous (and better) categorizaion version with suspect filtering inside awk
# awk -v outDir=$out -v ref=$ref -v col=$j -v list=$listeSuspects -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;}
# { if (NR == 1) {
# print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp"
# split(list, suspect, ";")
# for (i=1;i<=length(suspect);i++) transcripts[suspect[i]]++
# }
# else{
# refCount = $col; ok="clean"; maxcov=0; nb_overexp=0;
# if ($1 in transcripts) {
# for (i=2; i<= NF; i++) {
# if (i != col) {
# if ($i >= overexp) nb_overexp++
# if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam"
# else {
# if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious"
# }
# if ( $i > maxcov ) maxcov= $i
# }
# }
# if (maxcov < mincov && refCount < mincov ) ok="lowCov";
# if (refCount == 0) refCount=0.0001
# if (nb_overexp >= 3) ok="overexp";
# print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp"
# print $0 > outDir"/"ref"."ok
# }
# }
# }' $out/$ref".all"
### intermediate timing
ELAPSED_TIME=$(($SECONDS - $START_TIME2))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nExecution time for all mappings : $h h $m m $s s"
### RE-CATEGORIZATION ONLY
elif [ $RECAT != "no" ]; then
i=0
for allfile in $RECAT/*.all ; do
fasta_array[$i]=`basename $allfile .all`
cat $allfile | sed -r 's/(\t[^\t]*){3}$//' > $out/${fasta_array[$i]}".all"
cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".ctgs" $out
cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".fasta_suspect" $out
cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".fasta_mod" $out
cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".suspects" $out
i=$(( i + 1 ))
done
for (( j=0; j <i; j++ )); do
ref=${fasta_array[$j]};
refseqs=$out/$ref".ctgs"
finalout=$out/$ref".all"
echo -e "Re-categorizing $ref transcripts"
# re-categorizing transcipts (clean, contam, dubious, lowcov, overexp)
awk -v outDir=$out -v ref=$ref -v col=$j -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;}
{ if (NR == 1) {
print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp"
}
else{
refCount = $col; ok="clean";
maxcov=0; #valeur max des couvertures des autres reads
nb_overexp=0
for (i=2; i<= NF; i++) {
if (i != col) {
if ($i >= overexp) nb_overexp++
if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam"
else {
if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious"
}
if ( $i > maxcov ) maxcov= $i
}
}
if (maxcov < mincov && refCount < mincov ) ok="lowcov";
if (refCount == 0) refCount=0.0001
if (nb_overexp >= 3) ok="overexp";
print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp"
print $0 > outDir"/"ref"."ok
}
}' $finalout
mv $out"/"$ref".tmp" $finalout
done
fi
### OUTPUT : basic statistics (cross_contamination_summary and cross_contamination_profiles files)
source "$my_dir/output_stats.sh"
### OUTPUT : fasta files sorted by categories (clean, contam, dubious, lowcov, overexp)
source "$my_dir/output_fasta.sh"
### OUTPUT : html file (detailed results for all transcripts)
#source "$my_dir/output_html.sh"
### OUTPUT : network files - step 1 ("LINKS.csv_exhaustive" and "nodes_detailed.csv")
source "$my_dir/output_network_basefiles_contam.sh"
### OUTPUT : network files - step 2 ("LINKS_exhaustive_dubious.csv" and "nodes_detailed_dubious.csv" -- using only dubious)
source "$my_dir/output_network_basefiles_dubious.sh"
### OUTPUT : network files - step 3 ("LINKS.csv")
#echo -e "\tLINKS_gephi.csv\n\tLINKS_diagrammer.tsv\n\tLINKS_gephi_dubious.csv\n\tLINKS_diagrammer_dubious.tsv"
source "$my_dir/output_network_LINKS_files.sh"
### OUTPUT : network files - step 4 ("LINKS_gephi_simplified.csv", "LINKS_diagrammer_simplified.tsv")
#echo -e "\tLINKS_gephi_simplified.csv\n\tLINKS_diagrammer_simplified.tsv"
#echo -e "\tLINKS_gephi_dubious_simplified.csv\n\tLINKS_diagrammer_dubious_simplified.tsv"
source "$my_dir/output_network_LINKS_simplified_files.sh"
### OUTPUT : network files - step 5 (various 'nodes' files)
#echo -e "\tNODES_gephi_absolute.csv\n\tNODES_gephi_norm.csv\n\tNODES_diagrammer.tsv\n\tNODES_gephi_dubious_absolute.csv\n\tNODES_gephi_dubious_norm.csv\n\tNODES_diagrammer_dubious.tsv"
source "$my_dir/output_network_NODES_files.sh"
### miscellaneous
mkdir network_info
mv LINKS_* NODES_* nodes_detailed*.csv network_info
### GRAPHICAL OUTPUT : network visualization ("network.html")
if [ $GRAPH == "no" ]; then
echo -e "\nNo graphical output will be written (graph parameter value set to 'no')"
elif [ $GRAPH == "yes" ] ; then
echo -e "\nRendering graphical visualizations of cross contamination network\n\tnetwork_complete.html\n\tnetwork_simplified.html\n\tnetwork_dubious_complete.html\n\tnetwork_dubious_simplified.html"
R --quiet --vanilla < $crosscontamdir/visNetwork.R 2>&1 >/dev/null
fi
### miscellaneous
if [ $RECAT == "no" ]; then
mkdir utility_files_CroCo
mv *.outblast *.suspects *.blastdb.* *_index *.ctgs *.out *.all_quants *.fasta_mod *.fasta_suspect ALL_transcripts.* utility_files_CroCo/
fi
cd ../
### final timing
ELAPSED_TIME=$(($SECONDS - $START_TIME))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nAll results are in $out\n\nCroCo run finished (total run time : $h h $m m $s s)"
#!/bin/bash
# This tool will analyse all the .fasta files in the current directory
# The reads files must be in the same dir and must follow this naming convention
# FOR paired end: NAME.fasta, NAME.L.fastq, NAME.R.fastq
# FOR unpaired : NAME.fasta, NAME.fastq
my_dir="$(dirname "$0")"
source "$my_dir/path_management.sh"
source "$my_dir/MacLin_management.sh"
############################ DEFAULT VALUES ############################
source "$my_dir/def_values.sh"
############################ CROCO USAGE ############################
source "$my_dir/parse_params.sh"
# summary of settings
source "$my_dir/print_settings.sh"
# setting output directory
if [ $RECAT == "no" ]; then
out="$INDIR/${OUTPUTPREFIX}CroCo-"$TOOL"-id"$SUSPID"-len"$SUSPLEN"-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
elif [ $RECAT != "yes" ]; then
out=$INDIR/${OUTPUTPREFIX}CroCo"_RECAT-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
fi
############################ PROGRAM BEGINS HERE ############################
START_TIME=$SECONDS
### run BLAST, mapping, and quantification if recat function is off (default)
if [ $RECAT == "no" ]; then
### setting sample names
i=0
for fasta in $INDIR/*.fasta ; do
fasta_array[$i]=`basename $fasta .fasta`
i=$(( i + 1 ))
done
### detecting suspect transcripts with BLAST
source "$my_dir/detect_suspect_trans_with_blast.sh"
### intermediate timing
ELAPSED_TIME=$(($SECONDS - $START_TIME))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nExecution time for detecting suspect transcripts : $h h $m m $s s"
echo
START_TIME2=$SECONDS
### index contigs for the selected tool
echo -e "\nBuilding index : $out/$toolidx\n"
case "$TOOL" in
B) toolidx="ALL_transcripts_bowtie_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;;
B2) toolidx="ALL_transcripts_bowtie2_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie2-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;;
K) toolidx="ALL_transcripts_kallisto_index"; if [ ! -f $out/$toolidx ]; then kallisto index -i $out/$toolidx $out/ALL_transcripts.fasta; fi ;;
S) toolidx="ALL_transcripts_salmon_index" ; if [ ! -d $out/$toolidx ]; then salmon --no-version-check index -t $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;;
R) toolidx="ALL_transcripts_rapmap_index" ; if [ ! -d $out/$toolidx ]; then rapmap quasiindex -t $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;;
H) toolidx="ALL_transcripts_hpg_index" ; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; hpg-aligner build-sa-index -g $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;;
esac
for (( j=0; j <i; j++ ))
do
ref=${fasta_array[$j]};
refseqs=$out/$ref".ctgs"
echo -e "Getting length of $ref transcripts"
awk -v ref=$ref -v refseqs=$refseqs 'BEGIN{RS=">"; FS="\t"}
NR>1 {
sub("\n","\t",$0);
gsub("\n","",$0);
ident =split($1,a," ") # N.B the seq idents (supr all text after the first space)
Seqlength = length($2) ;
print a[1]"\t"Seqlength > refseqs
}' $out/$ref".fasta_mod"
done
# regroup all samples
refseqALL=$out/ALL_transcripts.ctgs
cat $out/*.ctgs > $refseqALL
# mapping successively every read sets on all transcripts.
for (( k=0; k <i; k++ ))
do
reads=${fasta_array[$k]}"_reads"
fileout=$out/${fasta_array[$k]}"_vs_ALL.out"
finalout=$out/"ALL_transcripts.all"
echo -e "\nMapping ${fasta_array[$k]} reads"
# calculate expression level with selected tool
case "$TOOL" in
B) if [ $MODE == "u" ]
then
fastq=$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
bowtie -p $PROCESSORS $ADDOPT -a --trim5 $TRIM5 --trim3 $TRIM3 --suppress 1,2,4,5,6,7,8 $out/$toolidx/$toolidx $fastq > $out/${fasta_array[$k]}".mapping"
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' < $out/${fasta_array[$k]}".mapping" > $fileout
;;
B2) if [ $MODE == "u" ]
then
fastq=" -U "$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
bowtie2 -p $PROCESSORS --no-unal --no-head $ADDOPT -a --quiet --omit-sec-seq --trim5 $TRIM5 --trim3 $TRIM3 -x $out/$toolidx/$toolidx $fastq | grep -v "@" | cut -f3 | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
K) if [ $MODE == "u" ]
then
if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
echo -e "\nWarning : When using unpaired data with Kallisto, you need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
fi
else
fastq=$INDIR"/"${fasta_array[$k]}".L.fastq "$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
kallisto quant $ADDOPT --threads=$PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) }
{ if(NR>1) ctg[$1] = $5 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/abundance.tsv > $fileout
;;
S) if [ $MODE == "u" ]
then
if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
echo -e "\nWarning : When using unpaired data with Salmon, you might need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
fastq=" --fldMean $FRAGLENGTH --fldSD $FRAGSD -l U -r "$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" --fldMean $FRAGLENGTH --fldSD $FRAGSD -l U -r "$INDIR"/"${fasta_array[$k]}".fastq"
fi
else
fastq=" -l IU -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq" #see http://salmon.readthedocs.org/en/latest/library_type.html#fraglibtype
fi
salmon --no-version-check quant $ADDOPT --threads $PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) }
{ if(NR > 11) ctg[$1] = $3 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/quant.sf > $fileout
;;
R) if [ $MODE == "u" ]
then
fastq=" -r "$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
rapmap quasimap -t $PROCESSORS $ADDOPT -i $out/$toolidx $fastq | grep -v "@" | cut -f3 | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
H) if [ $MODE == "u" ]
then
fastq=" -fq="$INDIR"/"${fasta_array[$k]}".fastq"
else
fastq=" --fq="$INDIR"/"${fasta_array[$k]}".L.fastq --fq2="$INDIR"/"${fasta_array[$k]}".R.fastq"
fi
mkdir $fileout.hpg;
hpg-aligner dna --report-n-best=1 $ADDOPT --cpu-threads=$PROCESSORS -i=$out/$toolidx $fastq -o $fileout.hpg
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' $fileout.hpg > $fileout
;;
esac
if [ -f $finalout ]
then
#join -t $'\t' --header $finalout $fileout > $finalout'.tmp'
paste $finalout $fileout | awk -F 'FS' 'BEGIN{FS="\t"}{for (i=1; i<=NF-1; i++) if(i!=NF-1) {printf $i FS};{print $NF}}' > $finalout'.tmp'
# => K = bon ordre pour tout le monde
# => B = les *.out sont dans le bon ordre
mv $finalout'.tmp' $finalout
else
cat $fileout > $finalout
fi
cp $finalout $out/All_transcripts.quants
done
# analyse mapping/count results: categorizing transcipts (clean, contam, dubious, lowcov, overexp)
# care for character ";" in sequence names ?
# splitting "All_transcript.all" file into files corresponding to samples
for (( j=0; j <i; j++ ))
do
ref=${fasta_array[$j]}