Commit c21ba18a authored by eboulanger's avatar eboulanger
Browse files

update all

parent 1e09e4ad
...@@ -133,15 +133,16 @@ vcftools --vcf DP3g95maf05.FIL.recode.vcf --positions positions_HoFis_$SPCODE.tx ...@@ -133,15 +133,16 @@ vcftools --vcf DP3g95maf05.FIL.recode.vcf --positions positions_HoFis_$SPCODE.tx
# 1- calculate individual homozygosity with vcftools # 1- calculate individual homozygosity with vcftools
# 2- calculate individual heterozygosity in R # 2- calculate individual heterozygosity in R
# 3- inspect values and keep only individuals with non-extreme values # 3- inspect values and keep only individuals with non-extreme values
# extreme values are defined as those that fall outside of 9 times the interquartal range. (typical outliers are defined as 1.5IQR) this was set arbitrarily after visual inspection, and to allows the same criteria to be applied to both species # extreme values are defined as those that fall outside of 6 times the interquartal range. (typical outliers are defined as 1.5IQR)
# this was set arbitrarily after visual inspection, and to allow the same criteria to be applied to both species
# calculate individual homozygosity # calculate individual homozygosity
vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --het --out DP3g95maf05.FIL.HFis.indHo vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --het --out DP3g95maf05.FIL.HFis.indHo
cp DP3g95maf05.FIL.HFis.indHo.het DP3g95maf05.FIL.HFis.indHo.csv cp DP3g95maf05.FIL.HFis.indHo.het DP3g95maf05.FIL.HFis.indHo.csv
# calculate individual heterozygosity, asses extreme outliers and find individuals to keep # calculate individual heterozygosity, asses extreme outliers and find individuals to keep
Rscript --vanilla ../filterstep_indHetO_9IQR.R DP3g95maf05.FIL.HFis.indHo.csv indv_HETo_9IQR.txt Rscript --vanilla ../filterstep_indHetO_9IQR.R DP3g95maf05.FIL.HFis.indHo.csv indv_HETo_6IQR.txt
# remove individuals # remove individuals
vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --keep indv_HETo_9IQR.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.HFis.indHet vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --keep indv_HETo_6IQR.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.HFis.indHet
# step 13 : rename final filtered dataset # step 13 : rename final filtered dataset
cp DP3g95maf05.FIL.HFis.indHet.recode.vcf "$SPCODE"_all_filtered_origid.vcf cp DP3g95maf05.FIL.HFis.indHet.recode.vcf "$SPCODE"_all_filtered_origid.vcf
......
...@@ -28,9 +28,9 @@ hf <- genind2hierfstat(genind) ...@@ -28,9 +28,9 @@ hf <- genind2hierfstat(genind)
stats <- basic.stats(hf) stats <- basic.stats(hf)
# Filter SNPs for Ho and Fis # Filter SNPs for Ho and Fis
loc_H.6<- stats$perloc[which(stats$perloc$Ho<0.6),] loc_H.6 <- stats$perloc[which(stats$perloc$Ho<0.6),]
loc_H.6_Fis.5 <- loc_H.6[which(loc_H.6$Fis<0.5 & loc_H.6>-0.5),] loc_H.6_Fis.5 <- loc_H.6[which(loc_H.6$Fis<0.5 & loc_H.6$Fis>-0.5),]
locnames<-rownames(loc_H.6_Fis.5) locnames <-rownames(loc_H.6_Fis.5)
print(paste("# SNPs to keep =", length(locnames))) print(paste("# SNPs to keep =", length(locnames)))
# extract loci to remove from the dataset # extract loci to remove from the dataset
......
#!/usr/bin/env Rscript #!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE) args = commandArgs(trailingOnly=TRUE)
library(gridExtra)
start <- Sys.time() start <- Sys.time()
print(paste("run started at",start)) print(paste("run started at",start))
# Read homozygosity file # Read homozygosity file
dat <- read.csv(args[1], sep = "\t") dat <- read.csv(args[1], sep = "\t")
# dat <- read.csv("~/Documents/project_SEACONNECT/seaConnect--dataPrep/01-SNPfilters/02-Mullus/DP3g95maf05.FIL.HFis.indHo.csv", sep = "\t")
# dat <- read.csv("~/Documents/project_SEACONNECT/seaConnect--dataPrep/01-SNPfilters/01-Diplodus/DP3g95maf05.FIL.HFis.indHo.csv", sep = "\t")
# calculate heterozygosity # calculate heterozygosity
dat$O.HET. <- dat$N_SITES - dat$O.HOM. dat$O.HET. <- dat$N_SITES - dat$O.HOM.
# in interactive mode: visualise heterozygosity to determine # in interactive mode: visualise heterozygosity to determine
# - if individuals need to be removed # - if individuals need to be removed
# - adapted boxplot range # - adapted boxplot range
# plot(dat$O.HET.) pdf(file="plot_indv_HETo.pdf")
# boxplot(dat$O.HET.) plot(dat$O.HET., main = "observed heterozygosity per individual")
# boxplot(dat$O.HET., range = 9) boxplot(dat$O.HET., main = "observed heterozygosity per individual")
boxplot(dat$O.HET., range = 6, main = "o het with interquartal range = 6")
# extract non-extreme individuals
OutVals <- boxplot(dat$O.HET., range = 9)$out # extract extreme individuals
#OutVals <- boxplot(dat$O.HET., range = 9)$out
OutVals <- boxplot(dat$O.HET., range = 6)$out
plot.new()
grid.table(dat[dat$O.HET. %in% OutVals,])
dev.off()
OutVals.ind <- as.character(dat$INDV[which(dat$O.HET. %in% OutVals)]) OutVals.ind <- as.character(dat$INDV[which(dat$O.HET. %in% OutVals)])
print(paste("# Individuals to remove =", length(OutVals.ind))) print(paste("# Individuals to remove =", length(OutVals.ind)))
...@@ -27,7 +35,7 @@ indToKeep <- setdiff(as.character(dat$INDV),OutVals.ind) ...@@ -27,7 +35,7 @@ indToKeep <- setdiff(as.character(dat$INDV),OutVals.ind)
print(paste("# Individuals to keep =", length(indToKeep))) print(paste("# Individuals to keep =", length(indToKeep)))
# output individuals list # output individuals list
write.table(indToKeep, args[2], sep = "\t", quote = F, row.names = F, col.names = F) write.table(indToKeep, args[3], sep = "\t", quote = F, row.names = F, col.names = F)
# duration of script run # duration of script run
end <- Sys.time() end <- Sys.time()
......
...@@ -6,23 +6,10 @@ id <- read.table(paste0("id_",args,".txt"), stringsAsFactors=F) ...@@ -6,23 +6,10 @@ id <- read.table(paste0("id_",args,".txt"), stringsAsFactors=F)
# correct names # correct names
if(args[1] == "mul") {
# mullus
cell <- gsub("_.*","", id$V1)
cell <- paste0("C", cell)
ind <- gsub(".*_","", id$V1)
ind <- gsub("E.*", "", ind)
newid <- paste0(cell,"i", ind)
} else {
# dip
cell <- gsub("_.*","", id$V1) cell <- gsub("_.*","", id$V1)
ind <- gsub(".*i","", id$V1) ind <- gsub(".*i","", id$V1)
ind <- gsub("E.*", "", ind) ind <- gsub("E.*", "", ind)
newid <- paste0(cell, "i", ind) newid <- paste0(cell, "i", ind)
}
# check # check
bothid <- cbind(id, newid) bothid <- cbind(id, newid)
......
#----- BayeScan -----#
## Evaluate BayeScan model convergence and extract outlier results.
## Script adapted from http://evomicsorg.wpengine.netdna-cdn.com/wp-content/uploads/2016/01/BayeScan_BayeScEnv_exercises.pdf
# NOTE(review): avoid setwd() in scripts — run R from the 02-Bayescan directory
# instead; kept commented here for interactive use only.
# setwd("~/Documents/project_SEACONNECT/seaConnect--dataPrep/02-Bayescan/")
# load libraries
library(coda)
library(vcfR)
# set arguments
spCode <- "mul" # species: "mul" (Mullus) or "dip" (Diplodus)
runNum <- 1     # run number
inputName <- paste0(spCode, "_bayesc_2pop_run", runNum) # 2pop or 3pop analysis (3pop only for mullus)
# BUG FIX: vcfPath was only defined in comments, so read.vcfR(vcfPath) below
# errored with "object 'vcfPath' not found". Set it to the vcf matching spCode.
vcfPath <- "../01-SNPfilters/02-Mullus/mul_all_filtered.vcf"
# vcfPath <- "../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf"
## model convergence ####
chain1 <- read.table(paste0(inputName, "_ch1.sel"), header = TRUE)
# chain1 <- chain1[-c(1)] # remove first column (step number) if needed
# create a MCMC object with the correct thinning interval
chain1 <- mcmc(chain1, thin = 10)
# Plot traces and densities
plot(chain1)
# Summary statistics
summary(chain1)
# Test correlation between sampled parameters to verify that the sample is large enough
autocorr.diag(chain1)
# Effective size of the sample
effectiveSize(chain1)
# Statistical tests for convergence
#### Geweke's convergence diagnostic
# The diagnostic reports the z-scores for each parameter.
# With alpha = 0.05, we reject H0 (equality of means => convergence) if z < -1.96 or z > +1.96.
geweke <- geweke.diag(chain1, frac1 = 0.1, frac2 = 0.5)
geweke
which(abs(geweke$z) > 1.96) # parameters that do NOT converge (expect none)
#### Heidelberg and Welch's test
heidel.diag(chain1, eps = 0.1, pvalue = 0.05) # all parameters should pass
#### Test comparing two chains
chain2 <- read.table(paste0(inputName, "_ch2.sel"), header = TRUE)
# chain2 <- chain2[-c(1)]
chain2 <- mcmc(chain2, thin = 10)
plot(chain2)
combined <- mcmc.list(chain1, chain2)
plot(combined)
gelman.diag(combined) # point estimates should be close to 1
# BUG FIX: the original call gelman.plot(combined, ask) passed the undefined
# symbol `ask`, which errors at runtime; gelman.plot's ask argument has a
# sensible default, so drop it.
gelman.plot(combined)
## Bayescan results ####
## plot_bayescan() is defined in the script 'plot_R.r' shipped with BayeScan;
## adjust the path below to your BayeScan installation if needed.
source("plot_R.r")
results <- plot_bayescan(paste0(inputName, "_ch2_fst.txt"), FDR = 0.05)
# detect outliers
outliers_bayescan <- results$outliers # row indices of outlier loci
results$nb_outliers # 283 outliers for mullus, 126 for diplodus
# extract corresponding SNP names and positions from the filtered vcf
vcf <- read.vcfR(vcfPath, verbose = F)
loci <- as.data.frame(vcf@fix[, 1:2]) # CHROM and POS columns
outlier_loci <- loci[results$outliers, ]
# output positions table
write.table(outlier_loci, file = paste0("outl_pos_bayesc_", spCode, "_run", runNum, ".txt"),
            sep = "\t", quote = F, row.names = F, col.names = F)
print(paste0("outlier positions table exported to outl_pos_bayesc_", spCode, "_run", runNum, ".txt"), quote = 0)
#----- BayeScan -----#
## produce Bayescan input file
# setwd("~/Documents/project_SEACONNECT/seaConnect--dataPrep/02-Bayescan/")
#!/usr/bin/env Rscript
# Produce a BayeScan input file from a filtered vcf.
# Usage: Rscript <this script> <input.vcf> <speciesCode>
args = commandArgs(trailingOnly=TRUE)
#args <- c("../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf","dip")
#args <- c("../01-SNPfilters/02-Mullus/mul_all_filtered.vcf","mul")
start <- Sys.time()
# load libraries
library(vcfR)
library(adegenet)
library(radiator)
# define arguments
inputVcf <- args[1]
speciesCode <- args[2]
outputBayeScan <- paste0(speciesCode, "_bayesc_input_2pop")
# import vcf file to genind
vcfFile <- read.vcfR(inputVcf, verbose = F)
# BUG FIX: the original bare `!is.null(vcfFile)` expressions only printed
# TRUE/FALSE and never stopped the script on failure; make them real checks.
stopifnot(!is.null(vcfFile))
genind <- vcfR2genind(vcfFile)
stopifnot(!is.null(genind))
# use DAPC groups as pop
# BUG FIX: the stray literal `2` after this call was intended as the answer to
# find.clusters()' interactive "choose the number of clusters" prompt, but
# Rscript does not feed script lines to prompts; pass n.clust = 2 explicitly
# so the run is non-interactive and reproducible.
grp <- find.clusters(genind, max.n.clust = 40, n.pca = round(nrow(genind$tab)/3), n.clust = 2)
grp$size
pop(genind) <- grp$grp
pop(genind) <- paste0("pop", pop(genind))
# convert to bayescan format
tidy_genind <- tidy_genomic_data(genind)
write_bayescan(tidy_genind, pop.select = levels(pop(genind)), filename = outputBayeScan)
# duration of script run
end <- Sys.time()
print(paste("run ended at", end))
end - start
# This file is used to plot figures for the software Bayescan in R.
# This program, BayeScan, aims at detecting genetics markers under selection,
# based on allele frequency differences between population.
# Copyright (C) 2010 Matthieu Foll
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Arguments:
# - file is the name of your file ex: "output_fst.txt"
# - the q-value threshold corresponding to the target False Discovery Rate (FDR)
# - size is the size of the points and text labels for outliers
# - pos is the distance between the points and the labels
# - highlight is an optional list of marker indices to display in red.
# - name_highlighted allows to write the indices of highlighted markers instead of using a point like the other markers
# - add_text adds the indices of the outlier markers
# Output:
# This function returns different parameters in a list
# - outliers: the list of outliers
# - nb_outliers: the number of outliers
# Typical usage:
# - load this file into R (file/source R code)
# - in R, go to the directory where "output_fst.txt" is (file/change current dir)
# - at the R prompt, type
# > plot_bayescan("output_fst.txt",0,FDR=0.05)
# if you save the output in a variable, you can recall the different results:
# results<-plot_bayescan("output_fst.txt",0,FDR=0.05)
# results$outliers
# results$nb_outliers
#
# plotting posterior distribution is very easy in R with the output of BayeScan:
# first load the output file *.sel produced by BayeScan
# > mydata=read.table("bi.sel",colClasses="numeric")
# choose the parameter you want to plot by setting for example:
# parameter="Fst1"
# then this line will make the plot for:
# > plot(density(mydata[[parameter]]),xlab=parameter,main=paste(parameter,"posterior distribution"))
# you can plot population specific Fst coefficient by setting
# parameter="Fst1"
# if you have non-codominant data you can plot posterior for Fis coefficients in each population:
# parameter="Fis1"
# if you test for selection, you can plot the posterior for alpha coefficient for selection:
# parameter="alpha1"
# you also have access to the likelihood with:
# parameter="logL"
# if you have the package "boa" installed, you can very easily obtain Highest Probability
# Density Interval (HPDI) for your parameter of interest (example for the 95% interval):
# > boa.hpd(mydata[[parameter]],0.05)
# Plot BayeScan Fst results (Fst vs. log10 q-value) and return outlier loci.
#
# Arguments:
#   res: a data frame of BayeScan *_fst.txt output, or a path to such a file
#   FDR: q-value threshold corresponding to the target False Discovery Rate
#   size: size of the points and text labels for outliers
#   pos: distance between the points and their text labels
#   highlight: optional vector of marker indices to display in red
#   name_highlighted: if TRUE, plot highlighted markers as their indices
#     instead of points
#   add_text: if TRUE, label each outlier marker with its index
# Returns a list:
#   outliers: row indices of loci with q-value <= FDR
#   nb_outliers: number of outliers
plot_bayescan<-function(res,FDR=0.05,size=1,pos=0.35,highlight=NULL,name_highlighted=F,add_text=T)
{
# accept either a loaded data frame or a file path
if (is.character(res))
res=read.table(res)
# column layout of BayeScan fst output: Fst in column 5, q-value 2 columns before
colfstat=5
colq=colfstat-2
highlight_rows=which(is.element(as.numeric(row.names(res)),highlight))
non_highlight_rows=setdiff(1:nrow(res),highlight_rows)
outliers=as.integer(row.names(res[res[,colq]<=FDR,]))
ok_outliers=TRUE
if (sum(res[,colq]<=FDR)==0)
ok_outliers=FALSE;
# floor q-values at 0.0001 so log10() below stays finite
res[res[,colq]<=0.0001,colq]=0.0001
# plot
plot(log10(res[,colq]),res[,colfstat],xlim=rev(range(log10(res[,colq]))),xlab="log10(q value)",ylab=names(res[colfstat]),type="n")
points(log10(res[non_highlight_rows,colq]),res[non_highlight_rows,colfstat],pch=19,cex=size)
if (name_highlighted) {
if (length(highlight_rows)>0) {
text(log10(res[highlight_rows,colq]),res[highlight_rows,colfstat],row.names(res[highlight_rows,]),col="red",cex=size*1.2,font=2)
}
}
else {
points(log10(res[highlight_rows,colq]),res[highlight_rows,colfstat],col="red",pch=19,cex=size)
# add names of loci over p and vertical line
if (ok_outliers & add_text) {
# label x-positions are jittered randomly left/right of the point by +/- pos
text(log10(res[res[,colq]<=FDR,][,colq])+pos*(round(runif(nrow(res[res[,colq]<=FDR,]),1,2))*2-3),res[res[,colq]<=FDR,][,colfstat],row.names(res[res[,colq]<=FDR,]),cex=size)
}
}
# vertical line marking the FDR threshold
lines(c(log10(FDR),log10(FDR)),c(-1,1),lwd=2)
return(list("outliers"=outliers,"nb_outliers"=length(outliers)))
}
#!/bin/bash
# Run BayeScan outlier detection, two chains per run, for Mullus (mul_*) and
# Diplodus (dip_*) input files. Earlier/alternative runs are kept commented as
# a record of the parameter settings tried; only the uncommented commands run.
# BayeScan flags: -n iterations, -thin thinning interval, -nbp number of pilot
# runs, -pilot pilot run length, -burn burn-in length, -pr_odds prior odds.
# first run, two chains per run
# PCAdapt detected between two and three populations for mullus so we run the model for both options
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
# for diplodus: 2 populations detected
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
# second run, two chains per run
# here we try with a higher prior odds
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
# third run
# try again with prior odds = 100, and a greater number of iterations
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
\ No newline at end of file
...@@ -9,11 +9,13 @@ args = commandArgs(trailingOnly=TRUE) ...@@ -9,11 +9,13 @@ args = commandArgs(trailingOnly=TRUE)
# load libraries # load libraries
library(pcadapt) library(pcadapt)
library(qvalue)
library(vcfR) library(vcfR)
# define arguments # define arguments
path_to_file <- args[1] path_to_file <- args[1]
speciesCode <- args[2] speciesCode <- args[2]
numPC <- args[3]
# import vcf file to pcadapt format # import vcf file to pcadapt format
dat <- read.pcadapt(path_to_file, type = "vcf") dat <- read.pcadapt(path_to_file, type = "vcf")
...@@ -21,17 +23,23 @@ dat <- read.pcadapt(path_to_file, type = "vcf") ...@@ -21,17 +23,23 @@ dat <- read.pcadapt(path_to_file, type = "vcf")
# choose number K of principal components # choose number K of principal components
x <- pcadapt(input = dat, K = 20) x <- pcadapt(input = dat, K = 20)
# set plot output file name # set plot output file name
pdf(file=paste0("RPlots_", args[2],".pdf")) pdf(file=paste0("RPlots_", args[2],"_Kselection.pdf"))
# scree plot of principal components # scree plot of principal components
plot(x, option = "screeplot", K = 10) plot(x, option = "screeplot", K = 10)
# score plot of principal components # score plot of principal components
plot(x, option = "scores") plot(x, option = "scores")
plot(x, option = "scores", i = 3, j = 4) # check PC axis 3 and 4 plot(x, option = "scores", i = 3, j = 4) # check PC axis 3 and 4
plot(x, option = "scores", i = 1, j = 3)
plot(x, option = "scores", i = 2, j = 3)
dev.off()
# compute outliers detection with K = 4 # compute outliers detection with K = 1 (# of PC to retain) for M. surmuletus.
x <- pcadapt(dat, K = 4) # K = 1 for D. sargus
x <- pcadapt(dat, K = 1)
summary(x)
pdf(file=paste0("RPlots_", args[2],"_manhattan_qq_hist.pdf"))
# Manhattan plot # Manhattan plot
#jpeg(filename = paste0(args[2],"_manhattan_pcadpt",".jpeg")) #jpeg(filename = paste0("Rplots_",args[2],"_manhattan_pcadpt",".jpeg"))
plot(x, option = "manhattan") plot(x, option = "manhattan")
#dev.off() #dev.off()
# QQ plot # QQ plot
...@@ -42,13 +50,28 @@ hist(x$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "orange") ...@@ -42,13 +50,28 @@ hist(x$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "orange")
#dev.off() #dev.off()
# histogram of test statistic # histogram of test statistic
plot(x, option = "stat.distribution") plot(x, option = "stat.distribution")
dev.off()
# choose cutoff for outlier detection # choose cutoff for outlier detection
# q-values & FDR
qval <- qvalue(x$pvalues)$qvalues
alphaQ <- 0.005
outliersQ <- which(qval < alphaQ)
length(outliersQ)
# Bonferroni correction # Bonferroni correction
padj <- p.adjust(x$pvalues, method="bonferroni") padj <- p.adjust(x$pvalues, method="bonferroni")
alpha <- 0.05 alphaB <- 0.05
outliers <- which(padj < alpha) outliersB <- which(padj < alphaB)
print(paste(length(outliers), "outliers detected"),quote=0) print(paste(length(outliersB), "outliers detected"),quote=0)
# compare
setdiff(outliersB, outliersQ)
setdiff(outliersQ, outliersB)
# keep the ones inferred from FDR & q-values
outliers <- outliersQ
# extract corresponding SNP names and positions # extract corresponding SNP names and positions
vcf <- read.vcfR(path_to_file, verbose=F) vcf <- read.vcfR(path_to_file, verbose=F)
...@@ -57,4 +80,4 @@ outlier_loci <- loci[outliers,] ...@@ -57,4 +80,4 @@ outlier_loci <- loci[outliers,]
# output positions table # output positions table
write.table(outlier_loci, file = paste0("outl_pos_pcadpt_",speciesCode,".txt"), sep = "\t", quote = F, row.names = F, col.names = F) write.table(outlier_loci, file = paste0("outl_pos_pcadpt_",speciesCode,".txt"), sep = "\t", quote = F, row.names = F, col.names = F)
print(paste0("outlier positions table exported to outl_pos_pcadpt_",speciesCode), quote = 0) print(paste0(length(outliers), " outlier positions exported to outl_pos_pcadpt_",speciesCode), quote = 0)
...@@ -4,28 +4,20 @@ ...@@ -4,28 +4,20 @@
inputFile=$1 inputFile=$1
spCode=$2 spCode=$2
runNum=$3
#### adaptive SNPs #### #### adaptive SNPs ####
## step 1: create a final list of all outlier loci positions # step 1: rename list of outlier positions
cp ../03-PCAdapt/outl_pos_pcadpt_"$spCode".txt "$spCode"_outl_pos.txt
cat ../02-Bayescan/outl_pos_bayesc_"$spCode"_"$runNum".txt ../03-PCAdapt/outl_pos_pcadpt_"$spCode".txt > "$spCode"_outl_pos_combi.txt
# how many are detected by both methods?
nPosOutDup=`sort "$spCode"_outl_pos_combi.txt | uniq -d | wc -l`
echo ""$nPosOutDup" outlier loci detected with both methods"
# extract total number of unique outlier positions (for file naming) # extract total number of unique outlier positions (for file naming)
nPosOut=`sort "$spCode"_outl_pos_combi.txt | uniq -u | wc -l` nPosOut=`sort "$spCode"_outl_pos.txt | uniq -u | wc -l`
echo ""$nPosOut" unique outlier loci in total" echo ""$nPosOut" unique outlier loci in total"
# remove leading whitespaces (from here: https://www.cyberciti.biz/faq/bash-remove-whitespace-from-string/) # remove leading whitespaces (from here: https://www.cyberciti.biz/faq/bash-remove-whitespace-from-string/)
shopt -s extglob # turn it on shopt -s extglob # turn it on
nPosOut="${nPosOut##*( )}" nPosOut="${nPosOut##*( )}"
shopt -u extglob # turn it off shopt -u extglob # turn it off
# add number of outliers to file name
# extract file with unique positions mv "$spCode"_outl_pos.txt "$spCode"_outl_pos_"$nPosOut".txt
sort "$spCode"_outl_pos_combi.txt | uniq -u > "$spCode"_outl_pos_"$nPosOut".txt
#rm "$spCode"_outl_pos_combi.txt
## step 2: subset filtered vcf file by outlier positions ## step 2: subset filtered vcf file by outlier positions
......
...@@ -39,44 +39,6 @@ bash filtering.sh ../00-rawData/02-Mullus/mullus.vcf 02-Mullus mul ...@@ -39,44 +39,6 @@ bash filtering.sh ../00-rawData/02-Mullus/mullus.vcf 02-Mullus mul
The final step consists of removing individuals with (extreme) outlier heterozygosity values. Here we define outliers as falling outside 6 * interquartile range. The final step consists of removing individuals with (extreme) outlier heterozygosity values. Here we define outliers as falling outside 6 * interquartile range.
This theshold value is set in accordance to our data and it is advised to look at the outputted figures to validate this choice for your data. This theshold value is set in accordance to our data and it is advised to look at the outputted figures to validate this choice for your data.
### SNP filtering results for Mullus surmuletus
| filtering step | filter for | individuals retained | SNPs retained | run time (sec) | output |
| -------------- | ------------------------------------------------- | :------------------: | :-----------: | :------------: | ------ |
| step 0 | ddocent output data | 526 | | | mullus.vcf
| step 1 | call below 50%, mac < 3, min quality score = 30 | 526 | 549881 | 71.00 | mullus.g5mac3.recode.vcf
| step 2 | min mean depth genotype call = 3 | 482 | 549881 | 74.00 | mullus.g5mac3dp3.recode.vcf
| step 3 | individuals > 50% missing data | 482 | 549881 | 70.00 | mullus.g5mac3dplm.recode.vcf
| step 4 | sites > 5% missing data, maf 0.05, min meanDP = 5 | 482 | 3657 | 14.00 | DP3g95maf05.recode.vcf
| step 5 | filter for allele balance | 482 | 3421 | | DP3g95maf05.fil1.vcf
| step 6 | filter out sites with reads from both strands | 482 | **SKIP** | |
| step 7 | ratio of mapping qualities ref vs alternate alleles | 482 | 3284 | | DP3g95maf05.fil3.vcf
| step 8 | paired status | 482 | 3284 | | DP3g95maf05.fil4.vcf
| step 9 | remove sites quality score < 1/4 depth | 482 | 3263 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 482 | 2857 | |
| step 11 | He > 0.6 & Fis > 0.5 & Fis < -0.5 | 482 | 2794 | 25 min | DP3g95maf05.FIL.HFis.recode.vcf
| step 12 | remove extreme outliers individual O HET | 482 | 2794 | 23.00 | DP3g95maf05.FIL.HFis.indHet.recode.vcf
| step 13 | rename | | | | mul_all_filtered_origid.vcf
### SNP filtering results for Diplodus sargus
| filtering step | filter for | individuals retained | SNPs retained | run time (sec) | output |
| -------------- | ------------------------------------------------- | :------------------: | :-----------: | :------------: | ------ |
| step 0 | ddocent output data | 297 | 13362 | | sar_ddocent_.vcf
| step 1 | call below 50%, mac < 3, min quality score = 30 | 297 | 13362 | 14.00 | g5mac3.recode.vcf
| step 2 | min mean depth genotype call = 3 | 297 | 13362 | 14.00 | g5mac3dp3.recode.vcf
| step 3 | individuals > 50% missing data | 297 | 13362 | 16.00 | g5mac3dplm.recode.vcf
| step 4 | sites > 5% missing data, maf 0.05, min meanDP = 5 | 297 | 10389 | 11.00 | DP3g95maf05.recode.vcf
| step 5 | filter for allele balance | 297 | 10202 | | DP3g95maf05.fil1.vcf
| step 6 | filter out sites with reads from both strands | | **SKIP** | | SKIP
| step 7 | ratio of mapping qualities ref vs alternate alleles | 297 | 9689 | | DP3g95maf05.fil3.vcf
| step 8 | paired status | 297 | 9689 | | DP3g95maf05.fil4.vcf
| step 9 | remove sites quality score < 1/4 depth | 297 | 9688 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 297 | 8325 | 11.00 |
| step 11 | He > 0.6 & Fis > 0.5 & Fis < -0.5 | 297 | 8206 | 27 min | DP3g95maf05.FIL.HFis.recode.vcf
| step 12 | remove extreme outliers individual O HET | 297 | 8206 | | DP3g95maf05.FIL.HFis.indHet.recode.vcf
| step 13 | rename | | | | dip_all_filtered_origid.vcf
### rename individuals for conventional naming system and create population map ### rename individuals for conventional naming system and create population map
this is adapted for our case, review R script or skip according to naming needs this is adapted for our case, review R script or skip according to naming needs