#!/bin/bash # This tool will analyse all the .fasta files in the current directory # The reads files must be in the same dir and must follow this naming convention # FOR paired end: NAME.fasta, NAME.L.fastq, NAME.R.fastq # FOR unpaired : NAME.fasta, NAME.fastq my_dir="$(dirname "$0")" source "$my_dir/path_management.sh" source "$my_dir/MacLin_management.sh" ############################ DEFAULT VALUES ############################ source "$my_dir/def_values.sh" ############################ CROCO USAGE ############################ source "$my_dir/parse_params.sh" # summary of settings source "$my_dir/print_settings.sh" # setting output directory if [ $RECAT == "no" ]; then out="$INDIR/${OUTPUTPREFIX}CroCo-"$TOOL"-id"$SUSPID"-len"$SUSPLEN"-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"` echo "Output directory : $out" echo mkdir $out elif [ $RECAT != "yes" ]; then out=$INDIR/${OUTPUTPREFIX}CroCo"_RECAT-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"` echo "Output directory : $out" echo mkdir $out fi ############################ PROGRAM BEGINS HERE ############################ START_TIME=$SECONDS ### run BLAST, mapping, and quantification if recat function is off (default) if [ $RECAT == "no" ]; then ### setting sample names i=0 for fasta in $INDIR/*.fasta ; do fasta_array[$i]=`basename $fasta .fasta` i=$(( i + 1 )) done ### detecting suspect transcripts with BLAST source "$my_dir/detect_suspect_trans_with_blast.sh" ### intermediate timing ELAPSED_TIME=$(($SECONDS - $START_TIME)) h=$(($ELAPSED_TIME/3600)) m=$((($ELAPSED_TIME%3600)/60)) s=$(($ELAPSED_TIME%60)) echo -e "\nExecution time for detecting suspect transcripts : $h h $m m $s s" echo START_TIME2=$SECONDS ### index contigs for the selected tool echo -e "\nBuilding index : $out/$toolidx\n" case "$TOOL" in B) toolidx="ALL_transcripts_bowtie_index"; if [ ! 
-d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;; B2) toolidx="ALL_transcripts_bowtie2_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie2-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;; K) toolidx="ALL_transcripts_kallisto_index"; if [ ! -f $out/$toolidx ]; then kallisto index -i $out/$toolidx $out/ALL_transcripts.fasta; fi ;; S) toolidx="ALL_transcripts_salmon_index" ; if [ ! -d $out/$toolidx ]; then salmon --no-version-check index -t $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;; R) toolidx="ALL_transcripts_rapmap_index" ; if [ ! -d $out/$toolidx ]; then rapmap quasiindex -t $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;; H) toolidx="ALL_transcripts_hpg_index" ; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; hpg-aligner build-sa-index -g $out/ALL_transcripts.fasta -i $out/$toolidx; fi ;; esac for (( j=0; j 1 { sub("\n","\t",$0); gsub("\n","",$0); ident =split($1,a," ") # N.B the seq idents (supr all text after the first space) Seqlength = length($2) ; print a[1]"\t"Seqlength > refseqs }' $out/$ref".fasta_mod" done # regroup all samples refseqALL=$out/ALL_transcripts.ctgs cat $out/*.ctgs > $refseqALL # mapping successively every read sets on all transcripts. 
# Map every read set on ALL transcripts with the selected tool, derive one
# per-contig coverage table per sample ($fileout), and accumulate the columns
# sample by sample into $finalout / All_transcripts.quants.
# NOTE(review): the loop header, the reads/fileout/finalout assignments and the
# whole bowtie (B) command were lost in the mangled copy of this file; they are
# reconstructed below from the surviving tokens and the parallel branches —
# confirm against version control.
finalout=$out/"ALL_transcripts.all"
for (( k=0; k<$i; k++ )) ; do
	reads=${fasta_array[$k]}
	fileout=$out/${fasta_array[$k]}".out"

	case "$TOOL" in
	B)
		if [ $MODE == "u" ]
		then fastq=$INDIR"/"${fasta_array[$k]}".fastq"
		else fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
		fi
		# NOTE(review): reconstructed bowtie call (original line lost) — options
		# mirrored from the surviving B2 branch; confirm against VCS
		bowtie -p $PROCESSORS $ADDOPT -a --trim5 $TRIM5 --trim3 $TRIM3 $out/$toolidx/$toolidx $fastq | cut -f3 > $out/${fasta_array[$k]}".mapping"
		# count hits per contig, rescale to reads-per-kb, normalized per million (TPM-like)
		awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++} END{ print "Contig", reads; for (i in ctg) totRPK += ctg[i]/ctgsize[i]; for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' < $out/${fasta_array[$k]}".mapping" > $fileout ;;
	B2)
		if [ $MODE == "u" ]
		then fastq=" -U "$INDIR"/"${fasta_array[$k]}".fastq"
		else fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
		fi
		bowtie2 -p $PROCESSORS --no-unal --no-head $ADDOPT -a --quiet --omit-sec-seq --trim5 $TRIM5 --trim3 $TRIM3 -x $out/$toolidx/$toolidx $fastq | grep -v "@" | cut -f3 | \
		awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++} END{ print "Contig", reads; for (i in ctg) totRPK += ctg[i]/ctgsize[i]; for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout ;;
	K)
		if [ $MODE == "u" ]
		then
			# NOTE(review): both arms assign the same string — the inner test only
			# decides whether the warning is printed; confirm this is intentional
			if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
				echo -e "\nWarning : When using unpaired data with Kallisto, you need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
				fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
			else
				fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
			fi
		else
			fastq=$INDIR"/"${fasta_array[$k]}".L.fastq "$INDIR"/"${fasta_array[$k]}".R.fastq"
		fi
		kallisto quant $ADDOPT --threads=$PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
		# abundance.tsv : column 5 = TPM, data start at line 2
		awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) } { if(NR>1) ctg[$1] = $5 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/abundance.tsv > $fileout ;;
	S)
		if [ $MODE == "u" ]
		then
			# NOTE(review): both arms assign the same string — the inner test only
			# decides whether the warning is printed; confirm this is intentional
			if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
				echo -e "\nWarning : When using unpaired data with Salmon, you might need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
				fastq=" --fldMean $FRAGLENGTH --fldSD $FRAGSD -l U -r "$INDIR"/"${fasta_array[$k]}".fastq"
			else
				fastq=" --fldMean $FRAGLENGTH --fldSD $FRAGSD -l U -r "$INDIR"/"${fasta_array[$k]}".fastq"
			fi
		else
			fastq=" -l IU -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq" #see http://salmon.readthedocs.org/en/latest/library_type.html#fraglibtype
		fi
		salmon --no-version-check quant $ADDOPT --threads $PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
		# quant.sf : column 3 = TPM; NR > 11 assumes the old multi-line salmon
		# header — TODO confirm for the salmon version actually deployed
		awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) } { if(NR > 11) ctg[$1] = $3 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/quant.sf > $fileout ;;
	R)
		if [ $MODE == "u" ]
		then fastq=" -r "$INDIR"/"${fasta_array[$k]}".fastq"
		else fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
		fi
		rapmap quasimap -t $PROCESSORS $ADDOPT -i $out/$toolidx $fastq | grep -v "@" | cut -f3 | \
		awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++} END{ print "Contig", reads; for (i in ctg) totRPK += ctg[i]/ctgsize[i]; for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout ;;
	H)
		if [ $MODE == "u" ]
		then fastq=" -fq="$INDIR"/"${fasta_array[$k]}".fastq"
		else fastq=" --fq="$INDIR"/"${fasta_array[$k]}".L.fastq --fq2="$INDIR"/"${fasta_array[$k]}".R.fastq"
		fi
		mkdir $fileout.hpg
		hpg-aligner dna --report-n-best=1 $ADDOPT --cpu-threads=$PROCESSORS -i=$out/$toolidx $fastq -o $fileout.hpg
		# NOTE(review): this awk is given "$fileout.hpg", the output DIRECTORY
		# created just above — presumably the intended input is the alignment
		# file inside it; confirm against the original source
		awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++} END{ print "Contig", reads; for (i in ctg) totRPK += ctg[i]/ctgsize[i]; for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' $fileout.hpg > $fileout ;;
	esac

	# accumulate this sample's column into the global quantification table
	if [ -f $finalout ]
	then
		#join -t $'\t' --header $finalout $fileout > $finalout'.tmp'
		# paste the new column, dropping the duplicated "Contig" key column
		paste $finalout $fileout | awk -F 'FS' 'BEGIN{FS="\t"}{for (i=1; i<=NF-1; i++) if(i!=NF-1) {printf $i FS};{print $NF}}' > $finalout'.tmp'
		# => K = correct order for everyone
		# => B = the *.out files are in the right order
		mv $finalout'.tmp' $finalout
	else
		cat $fileout > $finalout
	fi
	cp $finalout $out/All_transcripts.quants
done

# analyse mapping/count results: categorizing transcripts (clean, contam, dubious, lowcov, overexp)
# care for character ";" in sequence names ?
# splitting "All_transcript.all" file into files corresponding to samples for (( j=0; j $out/$ref".all" grep "$ref|" $out/All_transcripts.quants >> $out/$ref".all" #readarray -t suspects < $out/$ref".suspects" #listeSuspects=$( IFS=';'; echo "${suspects[*]}" ); awk -v outDir=$out -v ref=$ref -v col=$j -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;} { if (NR == 1) { print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp" } else { refCount = $col; ok="clean"; maxcov=0; nb_overexp=0; for (i=2; i<= NF; i++) { if (i != col) { if ($i >= overexp) nb_overexp++ if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam" else { if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious" } if ( $i > maxcov ) maxcov= $i } } if (maxcov < mincov && refCount < mincov ) ok="lowcov"; if (refCount == 0) refCount=0.0001 if (nb_overexp >= 3) ok="overexp"; print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp" print $0 > outDir"/"ref"."ok } }' $out/$ref".all" mv $out"/"$ref".tmp" $out/$ref".all" # filtering out unsuspected transcripts echo -e `head -n1 $finalout`"\tMaxOtherSpCov\tlog2FoldChange\tStatus" > $out/$ref".all_suspectonly" nb=`cat $out/$ref".suspects" | wc -l` LC_ALL=C grep -F -w -m$nb -f $out/$ref.suspects $out/$ref.all >> $out/$ref".all_suspectonly" mv $out/$ref".all" $out/$ref".all_quants" ; mv $out/$ref".all_suspectonly" $out/$ref".all" sed -i 's/ /\t/g' $out/$ref".all" done # http://www.inmotionhosting.com/support/website/ssh/speed-up-grep-searches-with-lc-all # time grep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_1 > grep.txt & # 533,85s user 0,14s system 99% cpu 8:54,47 total # time LC_ALL=C grep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_2 > LCgrep.txt & # 81,85s user 1,12s system 99% cpu 1:23,04 total # time LC_ALL=C fgrep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_3 > LCfgrep.txt & # 0,02s user 0,01s system 98% 
cpu 0,028 total # time grep -F -w -m5000 -f Contaminated.suspects Contaminated.all_quants_4 > grepF.txt & # 0,05s user 0,01s system 99% cpu 0,058 total # time LC_ALL=C grep -F -w -m5000 -f Contaminated.suspects Contaminated.all_quants_5 > LCgrepF.txt & # 0,02s user 0,01s system 98% cpu 0,025 total # time fgrep -w -m5000 -f Contaminated.suspects Contaminated.all_quants_5 > fgrep.txt & # 0,05s user 0,01s system 99% cpu 0,055 total ## previous (and better) categorizaion version with suspect filtering inside awk # awk -v outDir=$out -v ref=$ref -v col=$j -v list=$listeSuspects -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;} # { if (NR == 1) { # print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp" # split(list, suspect, ";") # for (i=1;i<=length(suspect);i++) transcripts[suspect[i]]++ # } # else{ # refCount = $col; ok="clean"; maxcov=0; nb_overexp=0; # if ($1 in transcripts) { # for (i=2; i<= NF; i++) { # if (i != col) { # if ($i >= overexp) nb_overexp++ # if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam" # else { # if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious" # } # if ( $i > maxcov ) maxcov= $i # } # } # if (maxcov < mincov && refCount < mincov ) ok="lowCov"; # if (refCount == 0) refCount=0.0001 # if (nb_overexp >= 3) ok="overexp"; # print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp" # print $0 > outDir"/"ref"."ok # } # } # }' $out/$ref".all" ### intermediate timing ELAPSED_TIME=$(($SECONDS - $START_TIME2)) h=$(($ELAPSED_TIME/3600)) m=$((($ELAPSED_TIME%3600)/60)) s=$(($ELAPSED_TIME%60)) echo -e "\nExecution time for all mappings : $h h $m m $s s" ### RE-CATEGORIZATION ONLY elif [ $RECAT != "no" ]; then i=0 for allfile in $RECAT/*.all ; do fasta_array[$i]=`basename $allfile .all` cat $allfile | sed -r 's/(\t[^\t]*){3}$//' > $out/${fasta_array[$i]}".all" cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".ctgs" $out cp 
$RECAT/utility_files_CroCo/${fasta_array[$i]}".fasta_suspect" $out cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".fasta_mod" $out cp $RECAT/utility_files_CroCo/${fasta_array[$i]}".suspects" $out i=$(( i + 1 )) done for (( j=0; j outDir"/"ref".tmp" } else{ refCount = $col; ok="clean"; maxcov=0; #valeur max des couvertures des autres reads nb_overexp=0 for (i=2; i<= NF; i++) { if (i != col) { if ($i >= overexp) nb_overexp++ if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam" else { if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious" } if ( $i > maxcov ) maxcov= $i } } if (maxcov < mincov && refCount < mincov ) ok="lowcov"; if (refCount == 0) refCount=0.0001 if (nb_overexp >= 3) ok="overexp"; print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp" print $0 > outDir"/"ref"."ok } }' $finalout mv $out"/"$ref".tmp" $finalout done fi ### OUTPUT : basic statistics (cross_contamination_summary and cross_contamination_profiles files) source "$my_dir/output_stats.sh" ### OUTPUT : fasta files sorted by categories (clean, contam, dubious, lowcov, overexp) source "$my_dir/output_fasta.sh" ### OUTPUT : html file (detailed results for all transcripts) #source "$my_dir/output_html.sh" ### OUTPUT : network files - step 1 ("LINKS.csv_exhaustive" and "nodes_detailed.csv") source "$my_dir/output_network_basefiles_contam.sh" ### OUTPUT : network files - step 2 ("LINKS_exhaustive_dubious.csv" and "nodes_detailed_dubious.csv" -- using only dubious) source "$my_dir/output_network_basefiles_dubious.sh" ### OUTPUT : network files - step 3 ("LINKS.csv") #echo -e "\tLINKS_gephi.csv\n\tLINKS_diagrammer.tsv\n\tLINKS_gephi_dubious.csv\n\tLINKS_diagrammer_dubious.tsv" source "$my_dir/output_network_LINKS_files.sh" ### OUTPUT : network files - step 4 ("LINKS_gephi_simplified.csv", "LINKS_diagrammer_simplified.tsv") #echo -e "\tLINKS_gephi_simplified.csv\n\tLINKS_diagrammer_simplified.tsv" #echo -e 
"\tLINKS_gephi_dubious_simplified.csv\n\tLINKS_diagrammer_dubious_simplified.tsv" source "$my_dir/output_network_LINKS_simplified_files.sh" ### OUTPUT : network files - step 5 (various 'nodes' files) #echo -e "\tNODES_gephi_absolute.csv\n\tNODES_gephi_norm.csv\n\tNODES_diagrammer.tsv\n\tNODES_gephi_dubious_absolute.csv\n\tNODES_gephi_dubious_norm.csv\n\tNODES_diagrammer_dubious.tsv" source "$my_dir/output_network_NODES_files.sh" ### miscellaneous mkdir network_info mv LINKS_* NODES_* nodes_detailed*.csv network_info ### GRAPHICAL OUTPUT : network visualization ("network.html") if [ $GRAPH == "no" ]; then echo -e "\nNo graphical output will be written (graph parameter value set to 'no')" elif [ $GRAPH == "yes" ] ; then echo -e "\nRendering graphical visualizations of cross contamination network\n\tnetwork_complete.html\n\tnetwork_simplified.html\n\tnetwork_dubious_complete.html\n\tnetwork_dubious_simplified.html" R --quiet --vanilla < $crosscontamdir/visNetwork.R 2>&1 >/dev/null fi ### miscellaneous if [ $RECAT == "no" ]; then mkdir utility_files_CroCo mv *.outblast *.suspects *.blastdb.* *_index *.ctgs *.out *.all_quants *.fasta_mod *.fasta_suspect ALL_transcripts.* utility_files_CroCo/ fi cd ../ ### final timing ELAPSED_TIME=$(($SECONDS - $START_TIME)) h=$(($ELAPSED_TIME/3600)) m=$((($ELAPSED_TIME%3600)/60)) s=$(($ELAPSED_TIME%60)) echo -e "\nAll results are in $out\n\nCroCo run finished (total run time : $h h $m m $s s)"