Commit 8795a3aa authored by psimion's avatar psimion
Browse files

completely re-written output_fasta.sh script

parent 7b065be1
......@@ -41,19 +41,18 @@ if [ $RECAT == "no" ]; then
echo "Output directory : $out"
echo
mkdir $out
mkdir $out/utility_files_CroCo
elif [ $RECAT != "yes" ]; then
out=$INDIR/${OUTPUTPREFIX}CroCo"_RECAT-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
mkdir $out/utility_files_CroCo
fi
############################ PROGRAM BEGINS HERE ############################
START_TIME=$SECONDS
### run BLAST, mapping, and quantification if recat function is off (default)
if [ $RECAT == "no" ]; then
### setting sample names
if [ -f $CONFFILE ]; then
......@@ -84,6 +83,8 @@ else
exit 1
fi
### run BLAST, mapping, and quantification if recat function is off (default)
if [ $RECAT == "no" ]; then
### detecting suspect transcripts with BLAST
#source "$my_dir/detect_suspect_trans_with_blast.sh"
......@@ -175,14 +176,19 @@ fi
{ if(NR>1) ctg[$1] = $5 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/abundance.tsv > $fileout
;;
R) if [ $MODE == "u" ]
then
#fastq=" -r "$INDIR"/"${fasta_array[$k]}".fastq"
then
fastq=" -r "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
else
#fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fastq=" -1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
fi
rapmap quasimap -t $PROCESSORS $ADDOPT -i $out/$toolidx $fastq | grep -v "@" | cut -f3 | \
else
#### gunzip input read files does not work with Rapmap ?
#if [[ "$InfosCtg" == *".gz" ]]; then
#fastq="-1 <(gunzip -c "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`") -2 <(gunzip -c "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`")"
#else
fastq=" -1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
#fi
fi
echo ""; echo "rapmap quasimap -t $PROCESSORS -i $out/$toolidx $fastq $ADDOPT " ; echo ""
rapmap quasimap -t $PROCESSORS -i $out/$toolidx $fastq $ADDOPT | grep -v "@" | cut -f3 | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
......@@ -263,19 +269,24 @@ fi
### RE-CATEGORIZATION ONLY
elif [ $RECAT != "no" ]; then
i=0
for allfile in $RECAT/*.all ; do
recatfasta_array[$i]=`basename $allfile .all`
cat $allfile | sed -r 's/(\t[^\t]*){3}$//' > $out/${recatfasta_array[$i]}".all"
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".ctgs" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".fasta_suspect" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".fasta_mod" $out
cp $RECAT/utility_files_CroCo/${recatfasta_array[$i]}".suspects" $out
for ref in "${orders[@]}"; do
#for allfile in $RECAT/*.all ; do
echo -e "\n## moving files from :\n$RECAT/utility_files_CroCo/$ref\ninto:\n$out\n"
cat $RECAT/$ref.all | sed -r 's/(\t[^\t]*){3}$//' > $out/$ref".all"
cp $RECAT/utility_files_CroCo/$ref".ctgs" $out
cp $RECAT/utility_files_CroCo/$ref".fasta_suspect" $out
cp $RECAT/utility_files_CroCo/$ref".fasta_mod" $out
cp $RECAT/utility_files_CroCo/$ref".suspects" $out
i=$(( i + 1 ))
done
# re-categorizing transcipts (clean, contam, dubious, lowcov, overexp)
for (( j=0; j <i; j++ )); do
ref=${recatfasta_array[$j]};
#for (( j=0; j <i; j++ )); do
j=0
for ref in "${orders[@]}"; do
#ref=${recatfasta_array[$j]};
refseqs=$out/$ref".ctgs"
finalout=$out/$ref".all"
echo -e "Re-categorizing $ref transcripts"
......@@ -305,6 +316,7 @@ elif [ $RECAT != "no" ]; then
}
}' $finalout
mv $out"/"$ref".tmp" $finalout
j=$((j+1))
done
fi
......@@ -358,7 +370,6 @@ fi
### miscellaneous
if [ $RECAT == "no" ]; then
mkdir utility_files_CroCo
mv *.outblast *.suspects *.blastdb.* *_index *.ctgs *.out *.all_quants *.fasta_mod *.fasta_suspect utility_files_CroCo/
fi
cd ../
......
#!/bin/bash
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License (http://www.gnu.org/licenses/) for more details.
# This tool will analyse all the *.fasta files listed in a configuration file
# This config file must contain one line for each contig file and its corresponding read files in the following format:
# FOR paired reads
# contigs1.fasta reads1.R1.fastq reads1.R2.fastq
# contigs2.fasta reads2.R1.fastq reads2.R2.fastq
# FOR unpaired reads :
# contigsN.fasta readsN.fatsq
# blank spaces must be tabs
# fasta files and read files must be in the same dir and must follow this naming convention :
# contig files must be named with ".fasta" extension
# if read files are gziped their extension must be ".gz"
my_dir="$(dirname "$0")"
source "$my_dir/path_management.sh"
source "$my_dir/MacLin_management.sh"
############################ DEFAULT VALUES ############################
source "$my_dir/def_values.sh"
############################ CROCO USAGE ############################
source "$my_dir/parse_params.sh"
# summary of settings
source "$my_dir/print_settings.sh"
# setting output directory
if [ $RECAT == "no" ]; then
out="$INDIR/${OUTPUTPREFIX}CroCo-"$TOOL"-id"$SUSPID"-len"$SUSPLEN"-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
mkdir $out/utility_files_CroCo
elif [ $RECAT != "yes" ]; then
out=$INDIR/${OUTPUTPREFIX}CroCo"_RECAT-fold"$FOLD"-mincov"$MINCOV"-"`date +"%Y_%m_%d-%I_%M"`
echo "Output directory : $out"
echo
mkdir $out
mkdir $out/utility_files_CroCo
fi
############################ PROGRAM BEGINS HERE ############################
START_TIME=$SECONDS
### setting sample names
if [ -f $CONFFILE ]; then
declare -A fasta_array
declare -a orders;
while IFS=$'\t' read fastaFile R1 R2
do
fastaName=`basename $fastaFile .fasta`
fasta_array[$fastaName]=$fastaFile";"$R1";"$R2
orders+=( $fastaName ) #this will keep fastaFileNames in the same order as in config file
InfosCtg=(${fasta_array[$fastaName]//;/ }) #c'est un tableau
len=${#InfosCtg[@]}
if [ $len -gt 2 ]; then
parity="Paired"
else
parity="unpaired"
fi
echo $fastaName ": with "$parity" reads"
for r in $(seq 0 $len)
do
echo ${InfosCtg[$r]}
done
done < $CONFFILE
else
print "Error - cannot find the following input file : '$CONFFILE'"
exit 1
fi
### run BLAST, mapping, and quantification if recat function is off (default)
if [ $RECAT == "no" ]; then
### detecting suspect transcripts with BLAST
#source "$my_dir/detect_suspect_trans_with_blast.sh"
source "$my_dir/detect_suspect_trans_with_blast.sh"
### intermediate timing
ELAPSED_TIME=$(($SECONDS - $START_TIME))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nExecution time for detecting suspect transcripts : $h h $m m $s s"
echo
START_TIME2=$SECONDS
### index contigs for the selected tool
case "$TOOL" in
B) toolidx="ALL_transcripts_bowtie_index"; if [ ! -d $out/$toolidx ]; then mkdir $out/$toolidx; bowtie-build --offrate 3 $out/ALL_transcripts.fasta $out/$toolidx/$toolidx ;fi ;;
K) toolidx="ALL_transcripts_kallisto_index"; if [ ! -f $out/$toolidx ]; then kallisto index -i $out/$toolidx $out/ALL_transcripts.fasta; fi ;;
R) toolidx="ALL_transcripts_rapmap_index" ; if [ ! -d $out/$toolidx ]; then rapmap quasiindex -t $out/ALL_transcripts.fasta -p -x $PROCESSORS -i $out/$toolidx; fi ;;
esac
echo -e "\nIndex built : $out/$toolidx\n"
### preparing contigs files with transcripts length
refseqALL=$out/ALL_transcripts.ctgs
touch $refseqALL
for ref in "${orders[@]}"
do
refseqs=$out/$ref".ctgs"
echo -e "Getting length of $ref transcripts"
awk -v ref=$ref -v refseqs=$refseqs 'BEGIN{RS=">"; FS="\t"}
NR>1 {
sub("\n","\t",$0);
gsub("\n","",$0);
ident =split($1,a," ") # N.B the seq idents (supr all text after the first space)
Seqlength = length($2) ;
print a[1]"\t"Seqlength > refseqs
}' $out/$ref".fasta_mod"
# regroup all samples
cat $refseqs >> $refseqALL
done
# mapping successively every read sets on all transcripts.
#for (( k=0; k <i; k++ ))
for ref in "${orders[@]}"
do
InfosCtg=${fasta_array[$ref]}
reads=${ref}"_reads"
fileout=$out/${ref}"_vs_ALL.out"
finalout=$out/"ALL_transcripts.all"
echo -e "\nMapping ${ref} reads\n"
#echo -e "\nInfo Contigs :\t$InfosCtg\nInfo Contigs detailed :\t${InfosCtg[0]};${InfosCtg[1]};${InfosCtg[2]} \
# \nfastq files :\t`echo $InfosCtg | cut -d';' -f2`\t`echo $InfosCtg | cut -d';' -f3` \
# \nOutfile :\t$fileout\nrefseqALL :\t$refseqALL\n"
# calculate expression level with selected tool
case "$TOOL" in
B) if [ $MODE == "u" ]
then
#fastq=$INDIR"/"${fasta_array[$k]}".fastq"
fastq=$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
else
#fastq=" -1 "$INDIR"/"${fasta_array[$k]}".L.fastq -2 "$INDIR"/"${fasta_array[$k]}".R.fastq"
fastq=" -1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
fi
bowtie -p $PROCESSORS $ADDOPT -a --trim5 $TRIM5 --trim3 $TRIM3 --chunkmbs 2000 --suppress 1,2,4,5,6,7,8 $out/$toolidx/$toolidx $fastq | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
K) if [ $MODE == "u" ]
then
if [ $FRAGLENGTH == 'none' ] || [ $FRAGSD == 'none' ]; then
echo -e "\nWarning : When using unpaired data with Kallisto, you need to specify mean fragment length and fragment length standard deviation (--frag-length and --frag-sd options)"
#fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
else
#fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"${fasta_array[$k]}".fastq"
fastq=" --single -l $FRAGLENGTH -s $FRAGSD "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
fi
else
#fastq=$INDIR"/"${fasta_array[$k]}".L.fastq.gz "$INDIR"/"${fasta_array[$k]}".R.fastq.gz"
fastq=$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
fi
kallisto quant $ADDOPT --threads=$PROCESSORS -i $out/$toolidx -o $fileout.quant $fastq ;
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0;}; close(refseqs) }
{ if(NR>1) ctg[$1] = $5 } END{print "Contig", reads; for (i in ctg) print i, ctg[i]}' $fileout.quant/abundance.tsv > $fileout
;;
R) if [ $MODE == "u" ]
then
fastq=" -r "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
else
#### gunzip input read files does not work with Rapmap ?
#if [[ "$InfosCtg" == *".gz" ]]; then
#fastq="-1 <(gunzip -c "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`") -2 <(gunzip -c "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`")"
#else
fastq=" -1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
#fi
fi
echo ""; echo "rapmap quasimap -t $PROCESSORS -i $out/$toolidx $fastq $ADDOPT " ; echo ""
rapmap quasimap -t $PROCESSORS -i $out/$toolidx $fastq $ADDOPT | grep -v "@" | cut -f3 | \
awk -v reads=$reads -v refseqs=$refseqALL 'BEGIN{OFS="\t"; while ((getline sequ < refseqs) > 0) {split(sequ,a,"\t");ctg[a[1]] = 0; ctgsize[a[1]]= a[2];}; close(refseqs) } {ctg[$1]++}
END{
print "Contig", reads;
for (i in ctg) totRPK += ctg[i]/ctgsize[i];
for (i in ctg) {if (totRPK > 0) {print i, (ctg[i]/ctgsize[i])*(1/totRPK)*1000000 } else {print i,"0"}} }' > $fileout
;;
esac
if [ -f $finalout ]
then
#join -t $'\t' --header $finalout $fileout > $finalout'.tmp'
paste $finalout $fileout | awk -F 'FS' 'BEGIN{FS="\t"}{for (i=1; i<=NF-1; i++) if(i!=NF-1) {printf $i FS};{print $NF}}' > $finalout'.tmp'
mv $finalout'.tmp' $finalout
else
cat $fileout > $finalout
fi
cp $finalout $out/All_transcripts.quants
done
# analyse mapping/count results: categorizing transcipts (clean, contam, dubious, lowcov, overexp)
# care for character ";" in sequence names ?
# splitting "All_transcript.all" file into files corresponding to samples
#for (( j=0; j <i; j++ ))
j=0
for ref in "${orders[@]}"
do
#ref=${fasta_array[$j]}
echo -e "\nCategorization of $ref transcripts"
echo -e `head -n1 $finalout` > $out/$ref".all"
grep "$ref|" $out/All_transcripts.quants >> $out/$ref".all"
#readarray -t suspects < $out/$ref".suspects"
#listeSuspects=$( IFS=';'; echo "${suspects[*]}" );
awk -v outDir=$out -v ref=$ref -v col=$j -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;}
{ if (NR == 1) {
print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp"
}
else {
refCount = $col; ok="clean"; maxcov=0; nb_overexp=0;
for (i=2; i<= NF; i++) {
if (i != col) {
if ($i >= overexp) nb_overexp++
if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam"
else {
if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious"
}
if ( $i > maxcov ) maxcov= $i
}
}
if (maxcov < mincov && refCount < mincov ) ok="lowcov";
if (refCount == 0) refCount=0.0001
if (nb_overexp >= 3) ok="overexp";
print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp"
print $0 > outDir"/"ref"."ok
}
}' $out/$ref".all"
mv $out"/"$ref".tmp" $out/$ref".all"
# filtering out unsuspected transcripts
echo -e `head -n1 $finalout`"\tMaxOtherSpCov\tlog2FoldChange\tStatus" > $out/$ref".all_suspectonly"
nb=`cat $out/$ref".suspects" | wc -l`
LC_ALL=C grep -F -w -m$nb -f $out/$ref.suspects $out/$ref.all >> $out/$ref".all_suspectonly"
mv $out/$ref".all" $out/$ref".all_quants" ; mv $out/$ref".all_suspectonly" $out/$ref".all"
sed -i 's/ /\t/g' $out/$ref".all"
j=$((j+1))
done
### intermediate timing
ELAPSED_TIME=$(($SECONDS - $START_TIME2))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nExecution time for all mappings : $h h $m m $s s"
### RE-CATEGORIZATION ONLY
elif [ $RECAT != "no" ]; then
i=0
for ref in "${orders[@]}"; do
#for allfile in $RECAT/*.all ; do
echo -e "\n## moving files from :\n$RECAT/utility_files_CroCo/$ref\ninto:\n$out\n"
cat $RECAT/$ref.all | sed -r 's/(\t[^\t]*){3}$//' > $out/$ref".all"
cp $RECAT/utility_files_CroCo/$ref".ctgs" $out
cp $RECAT/utility_files_CroCo/$ref".fasta_suspect" $out
cp $RECAT/utility_files_CroCo/$ref".fasta_mod" $out
cp $RECAT/utility_files_CroCo/$ref".suspects" $out
i=$(( i + 1 ))
done
# re-categorizing transcipts (clean, contam, dubious, lowcov, overexp)
#for (( j=0; j <i; j++ )); do
j=0
for ref in "${orders[@]}"; do
#ref=${recatfasta_array[$j]};
refseqs=$out/$ref".ctgs"
finalout=$out/$ref".all"
echo -e "Re-categorizing $ref transcripts"
awk -v outDir=$out -v ref=$ref -v col=$j -v fold=$FOLD -v mincov=$MINCOV -v overexp=$OVEREXP 'BEGIN{OFS="\t";col=col+2;}
{ if (NR == 1) {
print $0, "MaxOtherSpCov", "log2FoldChange", "Status" > outDir"/"ref".tmp"
}
else{
refCount = $col; ok="clean";
maxcov=0; #valeur max des couvertures des autres reads
nb_overexp=0
for (i=2; i<= NF; i++) {
if (i != col) {
if ($i >= overexp) nb_overexp++
if ($i >= ( refCount * fold) && ($i >= mincov ) ) ok="contam"
else {
if ($i >= ( refCount / fold) && ($i >= mincov ) && ( ok != "contam" ) ) ok="dubious"
}
if ( $i > maxcov ) maxcov= $i
}
}
if (maxcov < mincov && refCount < mincov ) ok="lowcov";
if (refCount == 0) refCount=0.0001
if (nb_overexp >= 3) ok="overexp";
print $0, maxcov, log(refCount)/log(2) - log(maxcov)/log(2), ok > outDir"/"ref".tmp"
print $0 > outDir"/"ref"."ok
}
}' $finalout
mv $out"/"$ref".tmp" $finalout
j=$((j+1))
done
fi
### OUTPUT : basic statistics (cross_contamination_summary and cross_contamination_profiles files)
source "$my_dir/output_stats.sh"
### OUTPUT : fasta files sorted by categories (clean, contam, dubious, lowcov, overexp)
source "$my_dir/output_fasta.sh"
### OUTPUT : html file (detailed results for all transcripts)
#source "$my_dir/output_html.sh"
### OUTPUT : network files - step 1 ("LINKS.csv_exhaustive" and "nodes_detailed.csv")
source "$my_dir/output_network_basefiles_contam.sh"
### OUTPUT : network files - step 2 ("LINKS_exhaustive_dubious.csv" and "nodes_detailed_dubious.csv" -- using only dubious)
source "$my_dir/output_network_basefiles_dubious.sh"
### OUTPUT : network files - step 3 ("LINKS.csv")
#echo -e "\tLINKS_gephi.csv\n\tLINKS_diagrammer.tsv\n\tLINKS_gephi_dubious.csv\n\tLINKS_diagrammer_dubious.tsv"
source "$my_dir/output_network_LINKS_files.sh"
### OUTPUT : network files - step 4 ("LINKS_gephi_simplified.csv", "LINKS_diagrammer_simplified.tsv")
#echo -e "\tLINKS_gephi_simplified.csv\n\tLINKS_diagrammer_simplified.tsv"
#echo -e "\tLINKS_gephi_dubious_simplified.csv\n\tLINKS_diagrammer_dubious_simplified.tsv"
source "$my_dir/output_network_LINKS_simplified_files.sh"
### OUTPUT : network files - step 5 (various 'nodes' files)
#echo -e "\tNODES_gephi_absolute.csv\n\tNODES_gephi_norm.csv\n\tNODES_diagrammer.tsv\n\tNODES_gephi_dubious_absolute.csv\n\tNODES_gephi_dubious_norm.csv\n\tNODES_diagrammer_dubious.tsv"
source "$my_dir/output_network_NODES_files.sh"
### miscellaneous
mkdir network_info
mv LINKS_* NODES_* nodes_detailed*.csv network_info
### GRAPHICAL OUTPUT : network visualization ("network.html")
if [ $GRAPH == "no" ]; then
echo -e "\nNo graphical output will be written (graph parameter value set to 'no')"
elif [ $GRAPH == "yes" ] ; then
echo -e "\nRendering graphical visualizations of cross contamination network\n\tnetwork_complete.html\n\tnetwork_simplified.html\n\tnetwork_dubious_complete.html\n\tnetwork_dubious_simplified.html"
R --quiet --vanilla < $crosscontamdir/visNetwork.R 2>&1 >/dev/null
fi
### CLEANING READS
if [ $CLEANING == "yes" ]; then
echo -e "\nCleaning fastq files (reads mapping onto contaminant transcripts will be removed)"
source "$my_dir/output_cleaned_read_files.sh"
elif [ $CLEANING == "no" ] ; then
echo -e "\nInitial read sets will not be cleaned (parameter value set to 'no')"
fi
### miscellaneous
if [ $RECAT == "no" ]; then
mv *.outblast *.suspects *.blastdb.* *_index *.ctgs *.out *.all_quants *.fasta_mod *.fasta_suspect *.reads_to_discard utility_files_CroCo/
fi
cd ../
### final timing
ELAPSED_TIME=$(($SECONDS - $START_TIME))
h=$(($ELAPSED_TIME/3600))
m=$((($ELAPSED_TIME%3600)/60))
s=$(($ELAPSED_TIME%60))
echo -e "\nAll results are in $out\n\nCroCo run finished (total run time : $h h $m m $s s)"
# add sample name to all sequence names and build BLASTdb
#for (( j=0; j <i; j++ )); do
for ref in "${orders[@]}"; do
#ref=${fasta_array[$j]}
#awk '/^>/{ print $1 } ; /^[^>]/{ print $0 }' < $INDIR/${fasta_array[$j]}.fasta > $out/${fasta_array[$j]}".fasta_mod"
#sed -i "s/>/>$ref|/g" $out/${fasta_array[$j]}".fasta_mod"
#makeblastdb -in $out/${fasta_array[$j]}".fasta_mod" -parse_seqids -dbtype nucl -out $out/${fasta_array[$j]}".blastdb"
awk '/^>/{ print $1 } ; /^[^>]/{ print $0 }' < $INDIR/${ref}.fasta > $out/${ref}".fasta_mod"
sed -i "s/>/>$ref|/g" $out/${ref}".fasta_mod"
makeblastdb -in $out/${ref}".fasta_mod" -parse_seqids -dbtype nucl -out $out/${ref}".blastdb"
done
### initial version of the section :
# "all pairwise BLAST and listing suspects (see $SUSPID and $SUSPLEN)"
echo -e "\n"
#for (( j=0; j <i; j++ )); do
for ref in "${orders[@]}"; do
#ref=${fasta_array[$j]};
suspects=$out/$ref".suspects"
#for (( k=0; k <i; k++ )); do
for target in "${orders[@]}"; do
#target=${fasta_array[$k]};
if [ "$ref" != "$target" ]; then
outblast=$out/$ref"v"$target".outblast"
query=$out/$ref".fasta_mod"
db=$out/$target".blastdb"
echo -e "Blasting $ref vs. $target"
blastn -num_threads $PROCESSORS -query $query -db $db -perc_identity $SUSPID -soft_masking true -max_target_seqs 5000 -outfmt "6 qseqid sseqid evalue pident bitscore qstart qend qlen sstart send slen" -out $outblast
fi
done
cat $out/$ref"v"*".outblast" > $out/$ref".outblast" ; rm -f $out/$ref"v"*".outblast"
cat $out/$ref".outblast" | awk -v susplen=$SUSPLEN '{ if($5>=susplen){print $1} }' > $out/$ref".suspects_tmp"
cat $out/$ref".suspects_tmp" | sort | uniq > $out/$ref".suspects" ; rm -f $suspects"_tmp"
blastdbcmd -db $out/$ref".blastdb" -entry_batch $out/$ref".suspects" -outfmt %f -line_length 20000 -out $out/$ref".fasta_suspect_tmp"
cat $out/$ref".fasta_suspect_tmp" | sed 's/lcl|//g' | awk '{if($0 ~ /^>/){print $1} else{print}}' > $out/$ref".fasta_suspect"
rm -f $out/$ref".fasta_suspect_tmp"
echo -e "\ttotal suspects transcripts in $ref : "`cat $out/$ref".suspects" | wc -l`
done
# regroup all samples
touch $out/ALL_transcripts.fasta
touch $out/ALL_transcripts.suspects
touch $out/ALL_transcripts.fasta_suspects
for ref in "${orders[@]}"; do
cat $out/$ref.fasta_mod >> $out/ALL_transcripts.fasta
cat $out/$ref.suspects >> $out/ALL_transcripts.suspects
cat $out/$ref.fasta_suspect >> $out/ALL_transcripts.fasta_suspects
done
#!/usr/bin/env bash
### Cleaning reads if they map onto contaminant transcripts
for ref in "${orders[@]}"; do
InfosCtg=${fasta_array[$ref]}
echo -e "${ref} reads"
echo -n -e "\t${ref}\t"
# preparing dictionnaries for $ref (to be improved by checking file existence before "cat" them)
cat $out/$ref.dubious $out/$ref.contam $out/$ref.overexp | cut -f1 | sed "s/$ref|//g" > $out/$ref.contigstotrim
#cat $out/$ref.dubious $out/$ref.contam $out/$ref.overexp | cut -f1 | sed "s/$ref|//g" > $out/$ref.contigstotrim
grep -E 'dubious|contam|overexp' $out/$ref.all | cut -f1 | sed "s/$ref|//g" > $out/$ref.contigstotrim
# building a list of "unclean" transcripts
# building an index of "unclean" transcripts
declare -A index
i=1
cat $out/$ref.contigstotrim | while read badcontig; do
while read badcontig; do
index[$badcontig]=$i
echo -e "$badcontig\t${index[$badcontig]}" >> $out/$ref.badcontigs
i=$((i+1))
done
done < <(cat $out/$ref.contigstotrim)
#cat $out/$ref.contigstotrim | while read badcontig; do
# for elem in ${!index[*]} ; do
# echo "Key \"${elem}\" : Value : "${index[${elem}]} #> $out/$ref.contigindex.content
# done
#done
# setting fastq files
# setting fastq files (paired/unpaired and zipped/unzipped)
if [ $MODE == "u" ]; then
fastq=$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
else
fastq=" -1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
if [[ "$InfosCtg" == *".gz" ]]; then
fastq="<(zcat "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`")"
else
fastq=$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
fi
elif [ $MODE == "p" ]; then
if [[ "$InfosCtg" == *".gz" ]]; then
gunzip -c "$INDIR/"`echo $InfosCtg | cut -d';' -f2` > "$out/"`echo $InfosCtg | cut -d';' -f2`".gunziped"
gunzip -c "$INDIR/"`echo $InfosCtg | cut -d';' -f3` > "$out/"`echo $InfosCtg | cut -d';' -f3`".gunziped"
fastq="-1 "$out"/"`echo $InfosCtg | cut -d';' -f2`".gunziped -2 "$out"/"`echo $InfosCtg | cut -d';' -f3`".gunziped"
else
fastq="-1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
fi
fi
# setting bowtie idx
echo -n -e " | indexing"
bowtieindex=$out/$ref.index
mkdir $bowtieindex
fasta="$INDIR/"`echo $InfosCtg | cut -d';' -f1`
bowtie-build -q --offrate 3 $fasta $bowtieindex/$ref.index
# mapping reads, checking transcripts status and building list of "unclean" reads
# mapping reads, checking transcripts status and building a readindex of "unclean" reads
declare -A readindex
bowtie -p $PROCESSORS $ADDOPT --quiet -a --trim5 $TRIM5 --trim3 $TRIM3 --suppress 2,4,5,6,7,8 --chunkmbs 2000 $bowtieindex/$ref.index $fastq | \
echo -n " | mapping"
command="bowtie -p $PROCESSORS $ADDOPT --quiet -a --trim5 $TRIM5 --trim3 $TRIM3 --suppress 2,4,5,6,7,8 --chunkmbs 2000 $bowtieindex/$ref.index $fastq"
#echo ""; echo "$command"; echo ""
while read readname1 readname2 readname3 contig; do
echo -e "$readname1\t$readname2\t$readname3\t$contig\tindex[$contig]\t${index[$contig]}\t"${index[$contig]} >> $out/$ref.badreadmapped
if [[ "${index[$contig]}" -gt "0" <