Commit 7b065be1 authored by psimion's avatar psimion
Browse files

draft for read cleaning option

parent c4b5d1aa
# add sample name to all sequence names and build BLASTdb
#for (( j=0; j <i; j++ )); do
for ref in "${!fasta_array[@]}"
for ref in "${orders[@]}"; do
#ref=${fasta_array[$j]}
#awk '/^>/{ print $1 } ; /^[^>]/{ print $0 }' < $INDIR/${fasta_array[$j]}.fasta > $out/${fasta_array[$j]}".fasta_mod"
#sed -i "s/>/>$ref|/g" $out/${fasta_array[$j]}".fasta_mod"
......@@ -18,12 +18,12 @@ done
# "all pairwise BLAST and listing suspects (see $SUSPID and $SUSPLEN)"
echo -e "\n"
#for (( j=0; j <i; j++ )); do
for ref in "${!fasta_array[@]}"
for ref in "${orders[@]}"; do
#ref=${fasta_array[$j]};
suspects=$out/$ref".suspects"
#for (( k=0; k <i; k++ )); do
for target in "${!fasta_array[@]}"
for target in "${orders[@]}"; do
#target=${fasta_array[$k]};
if [ "$ref" != "$target" ]; then
outblast=$out/$ref"v"$target".outblast"
......@@ -44,7 +44,12 @@ done
# regroup all samples
cat $out/*.fasta_mod > $out/ALL_transcripts.fasta
cat $out/*.suspects > $out/ALL_transcripts.suspects
cat $out/*.fasta_suspect > $out/ALL_transcripts.fasta_suspects
touch $out/ALL_transcripts.fasta
touch $out/ALL_transcripts.suspects
touch $out/ALL_transcripts.fasta_suspects
for ref in "${orders[@]}"; do
cat $out/$ref.fasta_mod >> $out/ALL_transcripts.fasta
cat $out/$ref.suspects >> $out/ALL_transcripts.suspects
cat $out/$ref.fasta_suspect >> $out/ALL_transcripts.fasta_suspects
done
### Cleaning reads if they map onto contaminant transcripts
for ref in "${orders[@]}"; do
InfosCtg=${fasta_array[$ref]}
echo -e "${ref} reads"
# preparing dictionnaries for $ref (to be improved by checking file existence before "cat" them)
cat $out/$ref.dubious $out/$ref.contam $out/$ref.overexp | cut -f1 | sed "s/$ref|//g" > $out/$ref.contigstotrim
# building a list of "unclean" transcripts
declare -A index
i=1
cat $out/$ref.contigstotrim | while read badcontig; do
index[$badcontig]=$i
echo -e "$badcontig\t${index[$badcontig]}" >> $out/$ref.badcontigs
i=$((i+1))
done
# setting fastq files
if [ $MODE == "u" ]; then
fastq=$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
else
fastq=" -1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
fi
# setting bowtie idx
bowtieindex=$out/$ref.index
mkdir $bowtieindex
fasta="$INDIR/"`echo $InfosCtg | cut -d';' -f1`
bowtie-build -q --offrate 3 $fasta $bowtieindex/$ref.index
# mapping reads, checking transcripts status and building list of "unclean" reads
declare -A readindex
bowtie -p $PROCESSORS $ADDOPT --quiet -a --trim5 $TRIM5 --trim3 $TRIM3 --suppress 2,4,5,6,7,8 --chunkmbs 2000 $bowtieindex/$ref.index $fastq | \
while read readname1 readname2 readname3 contig; do
echo -e "$readname1\t$readname2\t$readname3\t$contig\tindex[$contig]\t${index[$contig]}\t"${index[$contig]} >> $out/$ref.badreadmapped
if [[ "${index[$contig]}" -gt "0" ]]; then # if contigs belongs to "*.contigstotrim"
readuniqID=`echo $readname1 | cut -d'.' -f2`
readindex["$readuniqID"]="bad" # corresponding read name is memorized
echo -e "\tto be removed\t($readuniqID mapped onto $contig; ${index[$contig]})"
fi
done
# Cleaning fastq files from "unclean" reads
# if fastq not zipped & paired
for fastqfile in `echo $InfosCtg | cut -d';' -f2` `echo $InfosCtg | cut -d';' -f3`; do
echo -e "\t$fastqfile"
k=0
contamflag=0
cat "$INDIR/"$fastqfile | while read line; do
k=$((k+1))
if [[ $((k%2)) -ne 0 ]]; then
#echo $line
#echo $line | cut -d' ' -f1
readID=`echo $line | cut -d' ' -f1 | cut -d'.' -f2`
echo -e "\t\t$readID (ligne impaire : k = $k) ; value = ${readindex[$readID]}"
if [[ -v ${readindex[$readID]} ]]; then
echo -e "\t\t$readID (contam !)"
contamflag=1
elif [[ ! -v ${readindex[$readID]} ]]; then
echo $line >> $fastqfile.clean
echo -e "\t\t$readID (clean)"
contamflag=0
else
echo -e "\t\tproblem with read name !"
fi
elif [[ $((k%2)) -eq 0 ]] && [[ $contamflag -eq 0 ]]; then
echo -e "\t\t$readID (ligne paire : k = $k)"
echo $line >> $fastqfile.clean
fi
done
done
echo -e ""
done
......
......@@ -55,7 +55,7 @@ function printAndUsageAndExit(){
number_re='^[0-9]+$'
float_re='^[0-9]+([.][0-9]+)?$'
ARGS=$(getopt -o k:m:i:f:x:y:c:t:n:p:l:g:a:u:v:s:r:w:d: --long cnf:,mode:,in:,fold-threshold:,trim5:,trim3:,minimum-coverage:,tool:,threads:,output-prefix:,output-level:,graph:,add-option:,frag-length:,frag-sd:,suspect-id:,recat:,suspect-len:,overexp: -n "$0" -- "$@");
ARGS=$(getopt -o k:m:i:f:x:y:c:t:n:p:l:g:a:u:v:s:r:w:d: --long cnf:,mode:,in:,fold-threshold:,trim5:,trim3:,minimum-coverage:,tool:,threads:,output-prefix:,output-level:,graph:,add-option:,frag-length:,frag-sd:,suspect-id:,recat:,suspect-len:,overexp:,readclean: -n "$0" -- "$@");
#Bad arguments
if [ $? -ne 0 ] || [ $# -eq 0 ];
......@@ -64,6 +64,7 @@ then
exit
fi
MODEFLAG=0
CONFFLAG=0
eval set -- "$ARGS";
while true; do
......@@ -71,6 +72,7 @@ while true; do
-k|--cnf)
shift;
if [ -n "$1" ]; then
CONFFLAG=1
CONFFILE=$1
else
printAndUsageAndExit "You have to set a non-empty value for option --cnf to list the files to be analyzed"
......@@ -307,14 +309,25 @@ while true; do
fi
shift;
;;
--readclean)
shift;
if [ -n "$1" ]; then
CLEANING=$1
else
printAndUsageAndExit "You have to set a non-empty value for option --readclean if you decide to use it"
fi
shift;
;;
--)
break;
;;
esac
done
if ((MODEFLAG == 0)) && [ $RECAT == "no" ]; then
printAndUsageAndExit "You must set the --mode option"
if (( MODEFLAG == 0 )) || (( CONFFLAG == 0 )) ; then
if [ $RECAT == "no" ]; then
printAndUsageAndExit "You must set both the --mode and the --conf options"
fi
fi
# alert for particular combinations of tool and mode
......
......@@ -15,6 +15,7 @@ if [ $RECAT != "no" ]; then
echo "output-level : $OUTPUTLEVEL"
echo "graph : $GRAPH"
echo "overexpression : $OVEREXP"
echo "cleaning reads : $CLEANING"
echo
elif [ $RECAT == "no" ]; then
echo
......@@ -40,5 +41,6 @@ elif [ $RECAT == "no" ]; then
echo "suspect_id : $SUSPID"
echo "suspect_len : $SUSPLEN"
echo "overexpression : $OVEREXP"
echo "cleaning reads : $CLEANING"
echo
fi
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment