#!/usr/bin/env bash
### Cleaning reads if they map onto contaminant transcripts
#
# For every reference listed in "orders" this stage
#   1. collects the transcripts flagged dubious/contam/overexp in
#      $out/$ref.all,
#   2. builds a bowtie index of the reference's fasta file,
#   3. maps the reference's reads onto that index and records every read
#      that hits a flagged transcript,
#   4. writes a copy of each fastq file without those reads
#      (<fastq name minus .gz>.clean, relative to the current directory).
#
# Variables read here but set elsewhere:
#   orders       array of reference names to process
#   fasta_array  associative array: reference -> "fasta;fastq1[;fastq2]"
#                (file names relative to $INDIR)
#   out, INDIR   output and input directories
#   MODE         "u" (unpaired reads) or "p" (paired reads)
#   PROCESSORS, ADDOPT, TRIM5, TRIM3   passed through to bowtie
#
# All working variables are local to clean_contaminated_reads.

# Print field number $2 (1-based) of the ';'-separated record $1.
# Prints an empty line when the record has fewer fields.
croco_infos_field() {
  local -a fields=()
  IFS=';' read -r -a fields <<< "$1"
  printf '%s\n' "${fields[$(( $2 - 1 ))]:-}"
}

# Print the path bowtie should read for fastq file $1 (relative to $INDIR).
# A gzipped file is first decompressed to $out/$1.gunziped, because a
# "<(zcat ...)" stored in a string is never run as a process substitution.
croco_bowtie_input() {
  if [[ $1 == *.gz ]]; then
    gunzip -c "$INDIR/$1" > "$out/$1.gunziped"
    printf '%s\n' "$out/$1.gunziped"
  else
    printf '%s\n' "$INDIR/$1"
  fi
}

# Copy a fastq stream from stdin to stdout, dropping every 4-line record
# that contains, as a whole word, one of the read names listed in file $1.
# An empty list file drops nothing.
croco_drop_listed_reads() {
  paste - - - - | grep -v -w -F -f "$1" | tr '\t' '\n'
}

# Run the whole cleaning stage (see the header of this file).
# Returns 1 without touching anything when "out" or "INDIR" is empty, so
# that the clean-up globs below can never expand at the filesystem root.
clean_contaminated_reads() {
  local ref infos fasta reads_1 reads_2 reads_file bowtie_index
  local contig read_name rank
  local -a bowtie_reads=() bowtie_cmd=() fastq_files=()
  local -A bad_contigs=() bad_reads=()

  if [[ -z ${out:-} || -z ${INDIR:-} ]]; then
    printf 'clean reads: "out" and "INDIR" must be set; nothing done\n' >&2
    return 1
  fi

  for ref in "${orders[@]}"; do
    infos=${fasta_array[$ref]:-}
    printf '\t%s\t' "$ref"

    fasta=$(croco_infos_field "$infos" 1)
    reads_1=$(croco_infos_field "$infos" 2)
    reads_2=$(croco_infos_field "$infos" 3)

    # preparing dictionaries for $ref: index of "unclean" transcripts.
    # The array is emptied for each reference so that transcripts flagged
    # for a previous reference cannot leak into this one.
    # The "$ref|" prefix is removed as a literal string, not as a regex.
    #cat $out/$ref.dubious $out/$ref.contam $out/$ref.overexp | cut -f1 | sed "s/$ref|//g" > $out/$ref.contigstotrim
    bad_contigs=()
    rank=1
    while read -r contig; do
      contig=${contig//"$ref|"/}
      [[ -n $contig ]] || continue
      bad_contigs[$contig]=$rank
      printf '%s\n' "$contig" >&3
      printf '%s\t%s\n' "$contig" "$rank" >&4
      rank=$((rank + 1))
    done < <(grep -E 'dubious|contam|overexp' "$out/$ref.all" | cut -f1) \
      3> "$out/$ref.contigstotrim" 4> "$out/$ref.badcontigs"

    # debugging aid: dump the transcript index
    #for contig in "${!bad_contigs[@]}"; do
    #  echo "Key \"${contig}\" : Value : ${bad_contigs[$contig]}" #> $out/$ref.contigindex.content
    #done

    # setting fastq files (paired/unpaired and zipped/unzipped)
    case "$MODE" in
      u)
        fastq_files=("$reads_1")
        bowtie_reads=("$(croco_bowtie_input "$reads_1")")
        ;;
      p)
        fastq_files=("$reads_1" "$reads_2")
        bowtie_reads=(
          -1 "$(croco_bowtie_input "$reads_1")"
          -2 "$(croco_bowtie_input "$reads_2")"
        )
        ;;
      *)
        printf '\n'
        printf 'clean reads: unknown MODE "%s" (expected "u" or "p"), skipping %s\n' \
          "$MODE" "$ref" >&2
        continue
        ;;
    esac

    # setting bowtie idx
    printf ' | indexing'
    bowtie_index=$out/$ref.index
    mkdir -p -- "$bowtie_index"
    bowtie-build -q --offrate 3 "$INDIR/$fasta" "$bowtie_index/$ref.index" \
      || printf 'clean reads: bowtie-build failed for %s\n' "$ref" >&2

    # mapping reads, checking transcripts status and building an index of
    # "unclean" reads (emptied for each reference, like bad_contigs)
    bad_reads=()
    printf ' | mapping'
    # ADDOPT is deliberately unquoted: it may hold zero or several options.
    # shellcheck disable=SC2086
    bowtie_cmd=(
      bowtie -p "$PROCESSORS" ${ADDOPT:-} --quiet -a
      --trim5 "$TRIM5" --trim3 "$TRIM3"
      --suppress 2,4,5,6,7,8 --chunkmbs 2000
      "$bowtie_index/$ref.index" "${bowtie_reads[@]}"
    )
    #printf '\n%s\n\n' "${bowtie_cmd[*]}"
    # With columns 2,4,5,6,7,8 suppressed bowtie prints the read name and
    # the transcript name separated by a tab. Splitting on the tab works
    # whatever the number of blank-separated words in the read name; only
    # the first word of the read name is kept as the key.
    while IFS=$'\t' read -r read_name contig _; do
      read_name=${read_name%%[[:space:]]*}
      [[ -n $read_name && -n $contig ]] || continue
      if [[ -n ${bad_contigs[$contig]:-} ]]; then
        bad_reads[$read_name]="bad"
      fi
    done < <("${bowtie_cmd[@]}")

    # The list is always (re)created, even when empty: a missing list would
    # make the grep filter fail and leave empty .clean files.
    printf ' | filtering reads'
    for read_name in "${!bad_reads[@]}"; do
      printf '%s\n' "$read_name"
    done > "$out/$ref.reads_to_discard"

    # Cleaning fastq files from "unclean" reads : WAY TOO LONG ??
    printf ' | writing clean files\n'
    for reads_file in "${fastq_files[@]}"; do
      [[ -n $reads_file ]] || continue
      if [[ $reads_file == *.gz ]]; then
        gunzip -c "$INDIR/$reads_file" \
          | croco_drop_listed_reads "$out/$ref.reads_to_discard" \
          > "${reads_file%.gz}.clean"
      else
        croco_drop_listed_reads "$out/$ref.reads_to_discard" \
          < "$INDIR/$reads_file" > "$reads_file.clean"
      fi
    done
  done

  # cleaning up files
  mv -- "$out"/*.index "$out"/*.reads_to_discard "$out/utility_files_CroCo"
  rm -f -- "$out"/*.gunziped "$out"/*.contigstotrim "$out"/*.badcontigs
}

clean_contaminated_reads

# Alternative line-by-line filter, kept for reference only (never executed):
#cat "$INDIR/"$fastqfile | while read line; do
#  k=$((k+1))
#  if [[ $((k%2)) -ne 0 ]]; then
#    readID=`echo $line | cut -d' ' -f1 | cut -d'.' -f2`
#    if [[ "${readindex[${readID}]}" == "bad" ]]; then
#      echo -e "\t\t$readID (contam !)"
#      contamflag=1
#    elif [[ "${readindex[${readID}]}" != "bad" ]]; then
#      echo $line >> $fastqfile.clean
#      contamflag=0
#    else
#      echo -e "\t\tproblem with read name !"
#    fi
#  elif [[ $((k%2)) -eq 0 ]] && [[ $contamflag -eq 0 ]]; then
#    echo $line >> $fastqfile.clean
#  fi
#done