Commit cf783ade authored by eboulanger's avatar eboulanger
Browse files

PCAdapt scripts

parent ddaf119e
*.txt
*.pdf
*.jpeg
00-rawData
01-SNPfilters/01-Diplodus/
......
......@@ -15,8 +15,8 @@
| step 9 | remove sites quality score < 1/4 depth | | 17546 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 424 | 15466 | |
| step 11 | HWE 0.000001 | 424 | 14532 | 26.00 | DP3g95maf05.FIL.hwe.recode.vcf
| step 12 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 424 | | |
| step 13 | rename | | | | mullus_all_filtered.vcf
| step 12 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 424 | 14530 | 25 min |
| step 13 | rename | | | | mullus_all_filtered.vcf
# SNP filtering results for Diplodus sargus
### ddocent pipeline + additions
......@@ -27,13 +27,13 @@
| step 1 | call below 50%, mac < 3, min quality score = 30 | 297 | 13362 | 14.00 | diplodus.g5mac3.recode.vcf
| step 2 | min mean depth genotype call = 3 | 297 | 13362 | 14.00 | diplodus.g5mac3dp3.recode.vcf
| step 3 | individuals > 50% missing data | 297 | 13362 | 16.00 | diplodus.g5mac3dplm.recode.vcf
| step 4 | sites > 5% missing data, maf 0.05, min meanDP = 5 | 297 | 10342 | 11.00 | DP3g95maf05.recode.vcf
| step 5 | filter for allele balance | 297 | 10155 | | DP3g95maf05.fil1.vcf
| step 6 | filter out sites with reads from both strands | 297 | 9899 | |
| step 7 | ration mapping qualities ref vs alternate alleles | 297 | 9399 | | DP3g95maf05.fil3.vcf
| step 8 | paired status | 297 | 9399 | | DP3g95maf05.fil4.vcf
| step 9 | remove sites quality score < 1/4 depth | 297 | 9398 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 297 | 8077 |11.00 |
| step 11 | HWE 0.000001 | 297 | 7761 | 7.00 | DP3g95maf05.FIL.hwe.recode.vcf
| step 12 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 297 | | |
| step 4 | sites > 5% missing data, maf 0.05, min meanDP = 5 | 297 | 10389 | 11.00 | DP3g95maf05.recode.vcf
| step 5 | filter for allele balance | 297 | 10202 | | DP3g95maf05.fil1.vcf
| step 6 | filter out sites with reads from both strands | SKIP | **SKIP** | | SKIP
| step 7 | ration mapping qualities ref vs alternate alleles | 297 | 9689 | | DP3g95maf05.fil3.vcf
| step 8 | paired status | 297 | 9689 | | DP3g95maf05.fil4.vcf
| step 9 | remove sites quality score < 1/4 depth | 297 | 9688 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 297 | 8325 | 11.00 |
| step 11 | HWE 0.000001 | 297 | 7978 | 7.00 | DP3g95maf05.FIL.hwe.recode.vcf
| step 12 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 297 | 7973 | 8 min |
| step 13 | rename | | | | diplodus_all_filtered.vcf
\ No newline at end of file
......@@ -52,7 +52,7 @@ vcftools --vcf diplodus.g5mac3dp3.recode.vcf --remove lowDP.indv --recode --reco
# step 4 : restrict data to variants called in a high percentage of individuals and filter by mean depth of genotypes
vcftools --vcf diplodus.g5mac3dplm.recode.vcf --max-missing 0.95 --maf 0.05 --recode --recode-INFO-all --out DP3g95maf05 --min-meanDP 15
vcftools --vcf diplodus.g5mac3dplm.recode.vcf --max-missing 0.95 --maf 0.05 --recode --recode-INFO-all --out DP3g95maf05 --min-meanDP 5
###
# FreeBayes - specific filters
......@@ -63,18 +63,19 @@ awk '/#/' DP3g95maf05.recode.vcf
# step 5 : filter for allele balance
# this requires vcflib
vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" DP3g95p5maf05.recode.vcf > DP3g95p5maf05.fil1.vcf
# population level: vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" DP3g95p5maf05.recode.vcf > DP3g95p5maf05.fil1.vcf
vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" DP3g95maf05.recode.vcf > DP3g95maf05.fil1.vcf
# check how many were removed
awk '!/#/' DP3g95maf05.recode.vcf | wc -l
awk '!/#/' DP3g95maf05.fil1.vcf | wc -l
# step 6 : filter out sites that have reads from both strands
vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s DP3g95maf05.fil1.vcf > DP3g95maf05.fil2.vcf
awk '!/#/' DP3g95maf05.fil2.vcf | wc -l
# vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s DP3g95maf05.fil1.vcf > DP3g95maf05.fil2.vcf
# awk '!/#/' DP3g95maf05.fil2.vcf | wc -l
# SKIP THIS STEP BECAUSE REMOVES TOO MANY SNPs FOR MULLUS
# step 7 : look at ration of mapping qualities between reference and alternate alleles
vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" DP3g95maf05.fil2.vcf > DP3g95maf05.fil3.vcf
vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" DP3g95maf05.fil1.vcf > DP3g95maf05.fil3.vcf
awk '!/#/' DP3g95maf05.fil3.vcf | wc -l
# step 8 : check whether there is discrepancy in the properly paired status (SNAP IK NIET MAAR MAAKT BLIJKBAAR NIET UIT WANT GEEN SNPs WEGGEFILTERD)
......@@ -82,7 +83,7 @@ vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR
awk '!/#/' DP3g95maf05.fil4.vcf | wc -l
# step 9 : remove any locus that has a quality score below 1/4 of the depth
# (read suggested blogposts to understand why) -> removes a lot of SNPs so maybe skip?
# (read suggested blogposts to understand why)
vcffilter -f "QUAL / DP > 0.25" DP3g95maf05.fil4.vcf > DP3g95maf05.fil5.vcf
awk '!/#/' DP3g95maf05.fil5.vcf | wc -l
......@@ -134,7 +135,7 @@ vcftools --vcf DP3g95maf05.FIL.hwe.recode.vcf --positions positions_HoFis_dip.tx
# step 13 : rename final filtered dataset
cp DP3g95maf05.FIL.hwe.recode.vcf mullus_all_filtered.vcf
cp DP3g95maf05.FIL.hwe.recode.vcf diplodus_all_filtered.vcf
# step 14 : outliers
......@@ -6,7 +6,6 @@ cd ~/Documents/project_SEACONNECT/seaConnect--dataPrep/01-SNPfilters/
mkdir 02-Mullus
cd 02-Mullus
FILE=../../00-rawData/02-Mullus/mullus.vcf
PATH=$PATH:~/programmes/vcflib/bin/
......@@ -53,7 +52,7 @@ vcftools --vcf mullus.g5mac3dp3.recode.vcf --remove lowDP.indv --recode --recode
# step 4 : restrict data to variants called in a high percentage of individuals and filter by mean depth of genotypes
vcftools --vcf mullus.g5mac3dplm.recode.vcf --max-missing 0.95 --maf 0.05 --recode --recode-INFO-all --out DP3g95maf05 --min-meanDP 10
vcftools --vcf mullus.g5mac3dplm.recode.vcf --max-missing 0.95 --maf 0.05 --recode --recode-INFO-all --out DP3g95maf05 --min-meanDP 5
###
# FreeBayes - specific filters
......@@ -71,15 +70,15 @@ awk '!/#/' DP3g95maf05.recode.vcf | wc -l
awk '!/#/' DP3g95maf05.fil1.vcf | wc -l
# step 6 : filter out sites that have reads from both strands
vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s DP3g95maf05.fil1.vcf > DP3g95maf05.fil2.vcf
awk '!/#/' DP3g95maf05.fil2.vcf | wc -l
# SKIPPED STEP 6 FOR NOW
# vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s DP3g95maf05.fil1.vcf > DP3g95maf05.fil2.vcf
# awk '!/#/' DP3g95maf05.fil2.vcf | wc -l
# ONLY RETAINS 6 SNPS FOR MULLUS. SKIP THIS STEP FOR BOTH SPECIES.
# step 7 : look at ration of mapping qualities between reference and alternate alleles
vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" DP3g95maf05.fil1.vcf > DP3g95maf05.fil3.vcf
awk '!/#/' DP3g95maf05.fil3.vcf | wc -l
# step 8 : check whether there is discrepancy in the properly paired status (SNAP IK NIET MAAR MAAKT BLIJKBAAR NIET UIT WANT GEEN SNPs WEGGEFILTERD)
# step 8 : check whether there is discrepancy in the properly paired status
vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s DP3g95maf05.fil3.vcf > DP3g95maf05.fil4.vcf
awk '!/#/' DP3g95maf05.fil4.vcf | wc -l
......
......@@ -2,17 +2,17 @@ SNP filtering results for Diplodus sargus
ddocent pipeline + additions
filtering step filter for individuals retained SNPs retained run time (sec) output
step 0 ddocent output data 297 13362
step 1 call below 50%, mac < 3, min quality score = 30 297 13362 14.00
step 2 min mean depth genotype call = 3 297 13362 14.00
step 3 remove individuals > 50% missing data 297 13362 16.00
step 4 remove sites > 5% missing data, maf 0.05, min meanDP = 15 297 10342 11.00
step 5 filter for allele balance 297 10155
step 6 filter out sites with reads from both strands 297 9899
step 7 ration of mapping qualities reference vs alternate alleles 297 9399
step 8 paired status 297 9399
step 9 remove sites quality score < 1/4 depth 297 9398
step 10 depth x quality score cutoff 297 8077 11.00
step 11 HWE 0.000001 297 7761 7.00
step 12 He
step 13 Fis
\ No newline at end of file
step 0 ddocent output data 297 13362 sar_ddocent.vcf
step 1 call below 50%, mac < 3, min quality score = 30 297 13362 14.00 diplodus.g5mac3.recode.vcf
step 2 min mean depth genotype call = 3 297 13362 14.00 diplodus.g5mac3dp3.recode.vcf
step 3 remove individuals > 50% missing data 297 13362 16.00 diplodus.g5mac3dplm.recode.vcf
step 4 remove sites > 5% missing data, maf 0.05, min meanDP = 5 297 10389 11.00 DP3g95maf05.recode.vcf
step 5 filter for allele balance 297 10202 DP3g95maf05.fil1.vcf
step 6 filter out sites with reads from both strands SKIP SKIP SKIP
step 7 ration of mapping qualities reference vs alternate alleles 297 9689 DP3g95maf05.fil3.vcf
step 8 paired status 297 9689 DP3g95maf05.fil4.vcf
step 9 remove sites quality score < 1/4 depth 297 9688 DP3g95maf05.fil5.vcf
step 10 depth x quality score cutoff 297 8325 11.00 DP3g95maf05.FIL.recode.vcf
step 11 HWE 0.000001 297 7978 7.00 DP3g95maf05.FIL.hwe.recode.vcf
step 12 He > 0.6 & Fis > 0.5 & Fis < -0.5 297 7973 8 min DP3g95maf05.FIL.hwe.HFis.recode.vcf
step 13 rename diplodus_all_filtered.vcf
\ No newline at end of file
......@@ -7,12 +7,12 @@ step 1 call below 50%, mac < 3, min quality score = 30 431 49027 71.00 mullus.g5
step 2 min mean depth genotype call = 3 431 49027 74.00 mullus.g5mac3dp3.recode.vcf
step 3 remove individuals > 50% missing data 424 49027 70.00 mullus.g5mac3dplm.recode.vcf
step 4 remove sites > 5% missing data, maf 0.05, min mean DP = 5 424 18727 14.00 DP3g95maf05.recode.vcf
step 5 filter for allele balance 17965 DP3g95maf05.fil1.vcf
step 6 filter out sites with reads from both strands SKIP
step 5 filter for allele balance 424 17965 DP3g95maf05.fil1.vcf
step 6 filter out sites with reads from both strands SKIP SKIP SKIP
step 7 ration of mapping qualities reference vs alternate alleles 17546 DP3g95maf05.fil3.vcf
step 8 paired status 17546 DP3g95maf05.fil4.vcf
step 9 remove sites quality score < 1/4 depth DP3g95maf05.fil5.vcf
step 10 depth x quality score cutoff
step 11 HWE 0.000001
step 12 He & Fis 5063 191 DP3g95maf05.FIL.hwe.Hfis.recode.vcf
step 9 remove sites quality score < 1/4 depth 17546 DP3g95maf05.fil5.vcf
step 10 depth x quality score cutoff 15466 DP3g95maf05.FIL.recode.vcf
step 11 HWE 0.000001 14532 DP3g95maf05.FIL.hwe.recode.vcf
step 12 He > 0.6 & Fis > 0.5 & Fis < -0.5 14530 25 min DP3g95maf05.FIL.hwe.Hfis.recode.vcf
step 13 rename mullus_all_filtered.vcf
\ No newline at end of file
#----- PCAdapt -----#
# vignette: https://cran.r-project.org/web/packages/pcadapt/vignettes/pcadapt.html
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# load libraries
library(pcadapt)
library(vcfR)
# import vcf file to pcadapt format
path_to_file <- args[1]
dat <- read.pcadapt(path_to_file, type = "vcf")
# choose number K of principal components
x <- pcadapt(input = dat, K = 20)
# set plot output file name
pdf(file=paste0("RPlots_", args[2],".pdf"))
# scree plot of principal components
plot(x, option = "screeplot", K = 10)
# score plot of principal components
plot(x, option = "scores")
plot(x, option = "scores", i = 3, j = 4) # check PC axis 3 and 4
# compute outliers detection with K = 4
x <- pcadapt(dat, K = 4)
# Manhattan plot
#jpeg(filename = paste0(args[2],"_manhattan_pcadpt",".jpeg"))
plot(x, option = "manhattan")
#dev.off()
# QQ plot
plot(x , option = "qqplot", threshold = 0.05)
# histogram of p-values
#jpeg(filename = paste0(args[2],"_histP_pcadpt",".jpeg"))
hist(x$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "orange")
#dev.off()
# histogram of test statistic
plot(x, option = "stat.distribution")
# choose cutoff for outlier detection
# Bonferroni correction
padj <- p.adjust(x$pvalues, method="bonferroni")
alpha <- 0.05
outliers <- which(padj < alpha)
print(paste(length(outliers), "outliers detected"),quote=0)
# extract corresponding SNP names and positions
vcf <- read.vcfR(path_to_file, verbose=F)
loci <- as.data.frame(vcf@fix[,1:2])
outlier_loci <- loci[outliers,]
outlier_positions <- outlier_loci$POS
# output positions table
write.table(outlier_positions, file = paste0("outl_pos_pcadpt_",args[2],".txt"), sep = "\t", quote = F, row.names = F, col.names = F)
print(paste0("outlier positions table exported to outl_pos_pcadpt_", args[2]), quote = 0)
# detect outliers with PCAdapt in R and retrieve loci positions from vcf file
### Run the PCAdapt.R script
This script will load your vcf file, detect outlier loci using the package pcadapt and
output a list of outlier loci positions that can be used to subset your vcf file
set arguments : argument 1 = input file (vcf), argument 2 = species code
#### for mullus :
```
Rscript --vanilla PCAdapt.R ../01-SNPfilters/02-Mullus/mullus_all_filtered.vcf mul
```
see how many outliers were detected :
```
wc -l outl_pos_pcadpt_mul.txt
```
2078 outliers detected for mullus
#### for diplodus :
```
Rscript --vanilla PCAdapt.R ../01-SNPfilters/01-Diplodus/diplodus_all_filtered.vcf dip
wc -l outl_pos_pcadpt_dip.txt
```
289 outliers detected for diplodus
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment