output_cleaned_read_files.sh 2.65 KB
Newer Older
1
### Cleaning reads if they map onto contaminant transcripts
psimion's avatar
psimion committed
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
for ref in "${orders[@]}"; do
	InfosCtg=${fasta_array[$ref]}
	echo -e "${ref} reads"

	# preparing dictionnaries for $ref (to be improved by checking file existence before "cat" them)
	cat $out/$ref.dubious $out/$ref.contam $out/$ref.overexp | cut -f1 | sed "s/$ref|//g" > $out/$ref.contigstotrim

	# building a list of "unclean" transcripts
	declare -A index
	i=1
	cat $out/$ref.contigstotrim | while read badcontig; do
		index[$badcontig]=$i
		echo -e "$badcontig\t${index[$badcontig]}" >> $out/$ref.badcontigs
		i=$((i+1))
	done

	# setting fastq files
	if [ $MODE == "u" ]; then
		fastq=$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
	else
		fastq=" -1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
	fi

	# setting bowtie idx
	bowtieindex=$out/$ref.index
	mkdir $bowtieindex
	fasta="$INDIR/"`echo $InfosCtg | cut -d';' -f1`
	bowtie-build -q --offrate 3 $fasta $bowtieindex/$ref.index

	# mapping reads, checking transcripts status and building list of "unclean" reads
	declare -A readindex
	bowtie -p $PROCESSORS $ADDOPT --quiet -a --trim5 $TRIM5 --trim3 $TRIM3 --suppress 2,4,5,6,7,8 --chunkmbs 2000 $bowtieindex/$ref.index $fastq | \
	while read readname1 readname2 readname3 contig; do
		echo -e "$readname1\t$readname2\t$readname3\t$contig\tindex[$contig]\t${index[$contig]}\t"${index[$contig]} >> $out/$ref.badreadmapped
		if [[ "${index[$contig]}" -gt "0" ]]; then			# if contigs belongs to "*.contigstotrim"
			readuniqID=`echo $readname1 | cut -d'.' -f2`
			readindex["$readuniqID"]="bad"			# corresponding read name is memorized
			echo -e "\tto be removed\t($readuniqID mapped onto $contig; ${index[$contig]})"
		fi
	done

	# Cleaning fastq files from "unclean" reads
	# if fastq not zipped & paired
	for fastqfile in `echo $InfosCtg | cut -d';' -f2` `echo $InfosCtg | cut -d';' -f3`; do
		echo -e "\t$fastqfile"
		k=0
		contamflag=0
		cat "$INDIR/"$fastqfile | while read line; do
			k=$((k+1))
			if [[ $((k%2)) -ne 0 ]]; then
				#echo $line
				#echo $line | cut -d' ' -f1
				readID=`echo $line | cut -d' ' -f1 | cut -d'.' -f2`
				echo -e "\t\t$readID (ligne impaire : k = $k) ; value =  ${readindex[$readID]}"
				if [[ -v ${readindex[$readID]} ]]; then
					echo -e "\t\t$readID (contam !)"
					contamflag=1
				elif [[ ! -v ${readindex[$readID]} ]]; then
					echo $line >> $fastqfile.clean
					echo -e "\t\t$readID (clean)"
					contamflag=0
				else
					echo -e "\t\tproblem with read name !"
				fi
			elif [[ $((k%2)) -eq 0 ]] && [[ $contamflag -eq 0 ]]; then
				echo -e "\t\t$readID (ligne paire : k = $k)"
				echo $line >> $fastqfile.clean
			fi
		done
	done


	echo -e ""
done


78
79
80
81