output_fasta.sh 4.52 KB
Newer Older
khalid's avatar
khalid committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
### OUTPUT : fasta files sorted by categories (clean, contam, dubious, lowcov, overexp)
#
# Globals read:
#   OUTPUTLEVEL  '1' = write no fasta, '2' = clean + lowcov, '3' = all categories
#   out          directory that holds the <ref>.fasta_mod inputs, the contig
#                lists (<ref>.all, <ref>.lowcov, <ref>.contam, <ref>.dubious,
#                <ref>.overexp) and that receives <ref>_<category>.fasta
#
# Files written in $out for every <ref>.fasta_mod:
#   <ref>.togrep             one "identifier<TAB>sequence" line per record
#   <ref>.<category>         placeholder (single empty line) when the list is missing
#   <ref>_<category>.fasta   records appended, with every "<ref>|" removed from the header

#######################################
# Split one <ref>.fasta_mod file into per-category fasta files.
# Globals:   out (read)
# Arguments: $1   path to the .fasta_mod file
#            $2.. categories to test, in priority order; a contig matching
#                 none of them is written to <ref>_clean.fasta
# Outputs:   the reference name on stdout, diagnostics on stderr
# Returns:   0 on success, 1 when the fasta file could not be flattened
#######################################
output_fasta_split_ref() {
	local fasta=$1
	shift
	local ref=${fasta##*/}
	ref=${ref%.fasta_mod}
	local table="$out/$ref.togrep"
	local all_list="$out/$ref.all"
	local have_all=0
	local suffix category target ctg seq header

	printf '\t%s\n' "$ref"

	# Flatten the fasta: one line per record, identifier cut at the first
	# whitespace, sequence joined on a single line. Header and sequence are
	# separated at the first newline of the record, so a tab inside the header
	# text cannot be mistaken for the identifier/sequence separator.
	if ! awk '
		BEGIN { RS = ">" }
		NR > 1 {
			eol = index($0, "\n")
			if (eol > 0) {
				header = substr($0, 1, eol - 1)
				seq = substr($0, eol + 1)
			} else {
				header = $0
				seq = ""
			}
			gsub(/[ \t\n]/, "", seq)
			if (split(header, tok, " ") == 0) {
				print "warning : record without identifier skipped in " FILENAME > "/dev/stderr"
				next
			}
			print tok[1] "\t" seq
		}' "$fasta" > "$table"; then
		printf 'warning : could not read %s\n' "$fasta" >&2
		return 1
	fi

	# Make sure every category list exists so the lookups below never fail
	# on a missing file (placeholder content: one empty line).
	for suffix in lowcov contam dubious overexp; do
		if [ ! -f "$out/$ref.$suffix" ]; then echo "" > "$out/$ref.$suffix"; fi
	done

	# A category list may also contain non-suspect contigs (original author's
	# warning), so a contig is only categorized when it is listed in <ref>.all
	# as well. That test does not depend on the category: it is done once per
	# contig, and a missing <ref>.all is reported once instead of per lookup.
	if [ -f "$all_list" ]; then
		have_all=1
	else
		printf 'warning : %s not found, every contig of %s is written as clean\n' "$all_list" "$ref" >&2
	fi

	# NOTE(review): at output level 2 only "lowcov" is tested, so contigs listed
	# as contam/dubious/overexp end up in <ref>_clean.fasta. Behaviour kept from
	# the original script - confirm it is intended.
	while IFS=$'\t' read -r ctg seq; do
		target=clean
		if [ "$have_all" -eq 1 ] && LC_ALL=C grep -F -q -w -m1 -- "$ctg" "$all_list"; then
			for category in "$@"; do
				if LC_ALL=C grep -F -q -w -m1 -- "$ctg" "$out/$ref.$category"; then
					target=$category
					break
				fi
			done
		fi

		# Literal removal of "<ref>|" (the reference name is not a regex).
		header=${ctg//"$ref|"/}
		if [ -n "$seq" ]; then
			printf '>%s\n%s\n' "$header" "$seq" >> "$out/${ref}_${target}.fasta"
		else
			printf '>%s\n' "$header" >> "$out/${ref}_${target}.fasta"
		fi
	done < "$table"

	return 0
}

#######################################
# Entry point: write the categorized fasta files requested by OUTPUTLEVEL.
# Globals:   OUTPUTLEVEL (read), out (read)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0
#######################################
output_fasta_main() {
	local fasta
	local -a categories

	case "${OUTPUTLEVEL:-}" in
		1)
			printf '\n%s\n' "fasta files will not be written (output level set to '1')"
			return 0
			;;
		2)
			printf '\n%s\n' "Writing categorized transcriptomes - clean and low coverage only (it might take some time)"
			categories=(lowcov)
			;;
		3)
			printf '\n%s\n' "Writing categorized transcriptomes - all categories (it might take some time)"
			categories=(lowcov contam dubious overexp)
			;;
		*)
			printf '\n%s\n' "warning : output level value must be set to either '1', '2' or '3' (default = '2')"
			return 0
			;;
	esac

	for fasta in "$out"/*.fasta_mod; do
		# Without any match the glob stays literal: nothing to do.
		[ -e "$fasta" ] || continue
		output_fasta_split_ref "$fasta" "${categories[@]}" || continue
	done
	return 0
}

output_fasta_main