filtering.sh 5.61 KB
Newer Older
eboulanger's avatar
eboulanger committed
1
2
3
#### Filtering steps by dDocent
#           tutorial: https://www.ddocent.com/filtering/

eboulanger's avatar
eboulanger committed
4
5
# define arguments

eboulanger's avatar
eboulanger committed
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
FILE=$1
FOLDER=$2
SPCODE=$3

PATH=$PATH:~/programmes/vcflib/bin/
cd $FOLDER

# step 1: filter genotypes called below 50% (across all individuals), 
#         filter SNPs that have a minor allele count less than 3
#         filter SNPs with a minimum quality score of 30

vcftools --vcf ../$FILE --max-missing 0.5 --mac 3 --minQ 30 --recode --recode-INFO-all --out g5mac3

# step 2: minimum depth for a genotype call
#         minimum mean depth

vcftools --vcf g5mac3.recode.vcf --minDP 3 --recode --recode-INFO-all --out g5mac3dp3 

# step 3: asses individual levels of missing data and remove individuals accordingly

vcftools --vcf g5mac3dp3.recode.vcf --missing-indv
cat out.imiss

# make histogram of percentage missing data

awk '!/IN/' out.imiss | cut -f5 > totalmissing
gnuplot << \EOF 
set terminal dumb size 120, 30
set autoscale 
unset label
set title "Histogram of % missing data per individual"
set ylabel "Number of Occurrences"
set xlabel "% of missing data"
#set yr [0:100000]
binwidth=0.01
bin(x,width)=width*floor(x/width) + binwidth/2.0
plot 'totalmissing' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF

# create list of individuals with more than 50% missing data

awk '$5 > 0.5' out.imiss | cut -f1 > lowDP.indv

# remove individuals from list

vcftools --vcf g5mac3dp3.recode.vcf --remove lowDP.indv --recode --recode-INFO-all --out g5mac3dplm

# step 4 : restrict data to variants called in a high percentage of individuals and filter by mean depth of genotypes

vcftools --vcf g5mac3dplm.recode.vcf --max-missing 0.95 --maf 0.05 --recode --recode-INFO-all --out DP3g95maf05 --min-meanDP 5

###
# FreeBayes - specific filters

# look at header
awk '/#/' DP3g95maf05.recode.vcf

# step 5 : filter for allele balance
#          this requires vcflib

# population level: vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" DP3g95p5maf05.recode.vcf > DP3g95p5maf05.fil1.vcf
vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" DP3g95maf05.recode.vcf > DP3g95maf05.fil1.vcf
# check how many were removed
awk '!/#/' DP3g95maf05.recode.vcf | wc -l
awk '!/#/' DP3g95maf05.fil1.vcf | wc -l

# step 6 : filter out sites that have reads from both strands
# vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s DP3g95maf05.fil1.vcf > DP3g95maf05.fil2.vcf
# awk '!/#/' DP3g95maf05.fil2.vcf | wc -l
# SKIP THIS STEP BECAUSE REMOVES TOO MANY SNPs FOR MULLUS

# step 7 : look at ration of mapping qualities between reference and alternate alleles 
vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" DP3g95maf05.fil1.vcf > DP3g95maf05.fil3.vcf
awk '!/#/' DP3g95maf05.fil3.vcf | wc -l

# step 8 : check whether there is discrepancy in the properly paired status (SNAP IK NIET MAAR MAAKT BLIJKBAAR NIET UIT WANT GEEN SNPs WEGGEFILTERD)
vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s DP3g95maf05.fil3.vcf > DP3g95maf05.fil4.vcf
awk '!/#/' DP3g95maf05.fil4.vcf | wc -l

# step 9 : remove any locus that has a quality score below 1/4 of the depth
#          (read suggested blogposts to understand why)
vcffilter -f "QUAL / DP > 0.25" DP3g95maf05.fil4.vcf > DP3g95maf05.fil5.vcf
awk '!/#/' DP3g95maf05.fil5.vcf | wc -l

# step 10 : consists of multiple steps 
## 1 : create list of depth of each locus
cut -f8 DP3g95maf05.fil5.vcf | grep -oe "DP=[0-9]*" | sed 's/DP=//g' > DP3g95maf05.fil5.DEPTH
## 2 : create list of quality scores
awk '!/#/' DP3g95maf05.fil5.vcf | cut -f1,2,6 > DP3g95maf05.fil5.vcf.loci.qual
## 3 : calculate mean depth
MEAN_DP=`awk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' DP3g95maf05.fil5.DEPTH`
## 4 : take the mean depth plus 3X the square of the mean 
MEAN_DP_SQ=`python -c "print int($MEAN_DP+3*($MEAN_DP**0.5))"`
echo $MEAN_DP_SQ
## 5 : paste depth and quality files together and find the loci above the cutoff that do not have quality scores 2 times the depth
paste DP3g95maf05.fil5.vcf.loci.qual DP3g95maf05.fil5.DEPTH | awk -v x=$MEAN_DP_SQ '$4 > x' | awk '$3 < 2 * $4' > DP3g95maf05.fil5.lowQDloci
## 6 : remove those sites and recalculate depth across loci
vcftools --vcf DP3g95maf05.fil5.vcf --site-depth --exclude-positions DP3g95maf05.fil5.lowQDloci --out DP3g95maf05.fil5
## 7 : cut depth scores, calculate average depth and plot
NINDIV=`grep "#CHROM" DP3g95maf05.recode.vcf | tr "\t" "\n " | tail -n +10 | wc -l ` # get the number of individuals
cut -f3 DP3g95maf05.fil5.ldepth > DP3g95maf05.fil5.site.depth
awk '!/D/' DP3g95maf05.fil5.site.depth | awk -v x="$NINDIV" '{print $1/x}' > meandepthpersite
gnuplot << \EOF 
set terminal dumb size 120, 30
set autoscale
set xrange [10:150] 
unset label
set title "Histogram of mean depth per site"
set ylabel "Number of Occurrences"
set xlabel "Mean Depth"
binwidth=1
bin(x,width)=width*floor(x/width) + binwidth/2.0
set xtics 5
plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF
## 8 : combine all to get new vcf
vcftools --vcf  DP3g95maf05.fil5.vcf --recode-INFO-all --out DP3g95maf05.FIL --max-meanDP 102.5 --exclude-positions DP3g95maf05.fil5.lowQDloci --recode 

# step 11 : remove SNPs for which Ho > 0.6 and Fis < -0.5 or Fis > 0.5
# find to-remove (/ to-keep) SNP positions in R using package Hierfstat

Rscript --vanilla ../filterstep_Ho6_Fis5.R DP3g95maf05.FIL.recode.vcf positions_HoFis_$SPCODE.txt
vcftools --vcf DP3g95maf05.FIL.recode.vcf --positions positions_HoFis_$SPCODE.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.HFis

# step 13 : rename final filtered dataset

cp DP3g95maf05.FIL.HFis.recode.vcf "$SPCODE"_all_filtered.vcf