output_cleaned_read_files.sh 4.13 KB
Newer Older
1
2
#!/usr/bin/env bash

3
### Cleaning reads if they map onto contaminant transcripts
psimion's avatar
psimion committed
4
5
for ref in "${orders[@]}"; do
	InfosCtg=${fasta_array[$ref]}
6
	echo -n -e "\t${ref}\t"
psimion's avatar
psimion committed
7
8

	# preparing dictionnaries for $ref (to be improved by checking file existence before "cat" them)
9
10
	#cat $out/$ref.dubious $out/$ref.contam $out/$ref.overexp | cut -f1 | sed "s/$ref|//g" > $out/$ref.contigstotrim
	grep -E 'dubious|contam|overexp' $out/$ref.all | cut -f1 | sed "s/$ref|//g" > $out/$ref.contigstotrim
psimion's avatar
psimion committed
11

12
	# building an index of "unclean" transcripts
psimion's avatar
psimion committed
13
14
	declare -A index
	i=1
15
16

	while read badcontig; do
psimion's avatar
psimion committed
17
18
19
		index[$badcontig]=$i
		echo -e "$badcontig\t${index[$badcontig]}" >> $out/$ref.badcontigs
		i=$((i+1))
20
21
22
23
24
25
26
	done < <(cat $out/$ref.contigstotrim)

	#cat $out/$ref.contigstotrim | while read badcontig; do
	#	for elem in ${!index[*]} ; do
	#		echo "Key \"${elem}\" : Value : "${index[${elem}]} 		#> $out/$ref.contigindex.content
	#	done
	#done
psimion's avatar
psimion committed
27

28
	# setting fastq files (paired/unpaired and zipped/unzipped)
psimion's avatar
psimion committed
29
	if [ $MODE == "u" ]; then
30
31
32
33
34
35
36
37
38
39
40
41
42
		if [[ "$InfosCtg" == *".gz" ]]; then
			fastq="<(zcat "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`")"
		else
			fastq=$INDIR"/"`echo $InfosCtg | cut -d';' -f2`
		fi
	elif [ $MODE == "p" ]; then
		if [[ "$InfosCtg" == *".gz" ]]; then
			gunzip -c "$INDIR/"`echo $InfosCtg | cut -d';' -f2` > "$out/"`echo $InfosCtg | cut -d';' -f2`".gunziped"
			gunzip -c "$INDIR/"`echo $InfosCtg | cut -d';' -f3` > "$out/"`echo $InfosCtg | cut -d';' -f3`".gunziped"
			fastq="-1 "$out"/"`echo $InfosCtg | cut -d';' -f2`".gunziped -2 "$out"/"`echo $InfosCtg | cut -d';' -f3`".gunziped"
		else
			fastq="-1 "$INDIR"/"`echo $InfosCtg | cut -d';' -f2`" -2 "$INDIR"/"`echo $InfosCtg | cut -d';' -f3`
		fi
psimion's avatar
psimion committed
43
44
45
	fi

	# setting bowtie idx
46
	echo -n -e " | indexing"
psimion's avatar
psimion committed
47
48
49
50
51
	bowtieindex=$out/$ref.index
	mkdir $bowtieindex
	fasta="$INDIR/"`echo $InfosCtg | cut -d';' -f1`
	bowtie-build -q --offrate 3 $fasta $bowtieindex/$ref.index

52
	# mapping reads, checking transcripts status and building a readindex of "unclean" reads
psimion's avatar
psimion committed
53
	declare -A readindex
54
55
56
57
	echo -n " | mapping"
	command="bowtie -p $PROCESSORS $ADDOPT --quiet -a --trim5 $TRIM5 --trim3 $TRIM3 --suppress 2,4,5,6,7,8 --chunkmbs 2000 $bowtieindex/$ref.index $fastq"
	#echo ""; echo "$command"; echo ""

psimion's avatar
psimion committed
58
	while read readname1 readname2 readname3 contig; do
59
60
61
62
63
64
		if [[ ${index[${contig}]} -ge "1" ]]; then readindex[$readname1]="bad"; fi
	done < <($command)

	echo -n " | filtering reads"
	for elem in ${!readindex[*]} ; do
		echo "${elem}" >> $out/$ref.reads_to_discard
psimion's avatar
psimion committed
65
66
	done

67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
	# Cleaning fastq files from "unclean" reads : 		WAY TOO LONG  ??
	echo " | writing clean files"
	if [ $MODE == "u" ]; then
		fastqfile=`echo $InfosCtg | cut -d';' -f2`
		if [[ "$fastqfile" == *".gz" ]]; then
			newfastqname=`echo $fastqfile | sed 's/.gz//'`
			zcat "$INDIR/"$fastqfile | paste - - - - | grep -v -w -F -f $out/$ref.reads_to_discard | tr "\t" "\n" > $newfastqname.clean
		else
			cat "$INDIR/"$fastqfile | paste - - - - | grep -v -w -F -f $out/$ref.reads_to_discard | tr "\t" "\n" > $fastqfile.clean
		fi
	elif [ $MODE == "p" ]; then
		for fastqfile in `echo $InfosCtg | cut -d';' -f2` `echo $InfosCtg | cut -d';' -f3`; do
			if [[ "$fastqfile" == *".gz" ]]; then
				newfastqname=`echo $fastqfile | sed 's/.gz//'`
				zcat "$INDIR/"$fastqfile | paste - - - - | grep -v -w -F -f $out/$ref.reads_to_discard | tr "\t" "\n" > $newfastqname.clean
			else
				cat "$INDIR/"$fastqfile | paste - - - - | grep -v -w -F -f $out/$ref.reads_to_discard | tr "\t" "\n" > $fastqfile.clean
psimion's avatar
psimion committed
84
85
			fi
		done
86
87
	fi
done
psimion's avatar
psimion committed
88

89
90
91
92
93
# cleaning up files
mv $out/*.index $out/*.reads_to_discard $out/utility_files_CroCo
rm -f $out/*.gunziped
rm -f $out/*.contigstotrim
rm -f $out/*.badcontigs
psimion's avatar
psimion committed
94
95
96



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
		#cat "$INDIR/"$fastqfile | while read line; do
		#	k=$((k+1))
		#	if [[ $((k%2)) -ne 0 ]]; then
		#		readID=`echo $line | cut -d' ' -f1 | cut -d'.' -f2`
		#		if [[ "${readindex[${readID}]}" == "bad" ]]; then
		#			echo -e "\t\t$readID (contam !)"
		#			contamflag=1
		#		elif [[ "${readindex[${readID}]}" != "bad" ]]; then
		#			echo $line >> $fastqfile.clean
		#			contamflag=0
		#		else
		#			echo -e "\t\tproblem with read name !"
		#		fi
		#	elif [[ $((k%2)) -eq 0 ]] && [[ $contamflag -eq 0 ]]; then
		#		echo $line >> $fastqfile.clean
		#	fi
		#done
114
115
116
117