Commit c21ba18a authored by eboulanger's avatar eboulanger
Browse files

update all

parent 1e09e4ad
......@@ -133,15 +133,16 @@ vcftools --vcf DP3g95maf05.FIL.recode.vcf --positions positions_HoFis_$SPCODE.tx
# 1- calculate individual homozygosity with vcftools
# 2- calculate individual heterozygosity in R
# 3- inspect values and keep only individuals with non-extreme values
# extreme values are defined as those that fall outside of 9 times the interquartal range. (typical outliers are defined as 1.5IQR) this was set arbitrarily after visual inspection, and to allows the same criteria to be applied to both species
# extreme values are defined as those that fall outside of 6 times the interquartal range. (typical outliers are defined as 1.5IQR)
# this was set arbitrarily after visual inspection, and to allow the same criteria to be applied to both species
# calculate individual homozygosity
vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --het --out DP3g95maf05.FIL.HFis.indHo
cp DP3g95maf05.FIL.HFis.indHo.het DP3g95maf05.FIL.HFis.indHo.csv
# calculate individual heterozygosity, asses extreme outliers and find individuals to keep
Rscript --vanilla ../filterstep_indHetO_9IQR.R DP3g95maf05.FIL.HFis.indHo.csv indv_HETo_9IQR.txt
Rscript --vanilla ../filterstep_indHetO_9IQR.R DP3g95maf05.FIL.HFis.indHo.csv indv_HETo_6IQR.txt
# remove individuals
vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --keep indv_HETo_9IQR.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.HFis.indHet
vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --keep indv_HETo_6IQR.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.HFis.indHet
# step 13 : rename final filtered dataset
cp DP3g95maf05.FIL.HFis.indHet.recode.vcf "$SPCODE"_all_filtered_origid.vcf
......
......@@ -28,9 +28,9 @@ hf <- genind2hierfstat(genind)
stats <- basic.stats(hf)
# Filter SNPs for Ho and Fis
loc_H.6<- stats$perloc[which(stats$perloc$Ho<0.6),]
loc_H.6_Fis.5 <- loc_H.6[which(loc_H.6$Fis<0.5 & loc_H.6>-0.5),]
locnames<-rownames(loc_H.6_Fis.5)
loc_H.6 <- stats$perloc[which(stats$perloc$Ho<0.6),]
loc_H.6_Fis.5 <- loc_H.6[which(loc_H.6$Fis<0.5 & loc_H.6$Fis>-0.5),]
locnames <-rownames(loc_H.6_Fis.5)
print(paste("# SNPs to keep =", length(locnames)))
# extract loci to remove from the dataset
......
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
library(gridExtra)
start <- Sys.time()
print(paste("run started at",start))
# Read homozygosity file
dat <- read.csv(args[1], sep = "\t")
# dat <- read.csv("~/Documents/project_SEACONNECT/seaConnect--dataPrep/01-SNPfilters/02-Mullus/DP3g95maf05.FIL.HFis.indHo.csv", sep = "\t")
# dat <- read.csv("~/Documents/project_SEACONNECT/seaConnect--dataPrep/01-SNPfilters/01-Diplodus/DP3g95maf05.FIL.HFis.indHo.csv", sep = "\t")
# calculate heterozygosity
dat$O.HET. <- dat$N_SITES - dat$O.HOM.
# in interactive mode: visualise heterozygosity to determine
# - if individuals need to be removed
# - adapted boxplot range
# plot(dat$O.HET.)
# boxplot(dat$O.HET.)
# boxplot(dat$O.HET., range = 9)
# extract non-extreme individuals
OutVals <- boxplot(dat$O.HET., range = 9)$out
pdf(file="plot_indv_HETo.pdf")
plot(dat$O.HET., main = "observed heterozygosity per individual")
boxplot(dat$O.HET., main = "observed heterozygosity per individual")
boxplot(dat$O.HET., range = 6, main = "o het with interquartal range = 6")
# extract extreme individuals
#OutVals <- boxplot(dat$O.HET., range = 9)$out
OutVals <- boxplot(dat$O.HET., range = 6)$out
plot.new()
grid.table(dat[dat$O.HET. %in% OutVals,])
dev.off()
OutVals.ind <- as.character(dat$INDV[which(dat$O.HET. %in% OutVals)])
print(paste("# Individuals to remove =", length(OutVals.ind)))
......@@ -27,7 +35,7 @@ indToKeep <- setdiff(as.character(dat$INDV),OutVals.ind)
print(paste("# Individuals to keep =", length(indToKeep)))
# output individuals list
write.table(indToKeep, args[2], sep = "\t", quote = F, row.names = F, col.names = F)
write.table(indToKeep, args[3], sep = "\t", quote = F, row.names = F, col.names = F)
# duration of script run
end <- Sys.time()
......
......@@ -6,23 +6,10 @@ id <- read.table(paste0("id_",args,".txt"), stringsAsFactors=F)
# correct names
if(args[1] == "mul") {
# mullus
cell <- gsub("_.*","", id$V1)
cell <- paste0("C", cell)
ind <- gsub(".*_","", id$V1)
ind <- gsub("E.*", "", ind)
newid <- paste0(cell,"i", ind)
} else {
# dip
cell <- gsub("_.*","", id$V1)
ind <- gsub(".*i","", id$V1)
ind <- gsub("E.*", "", ind)
newid <- paste0(cell, "i", ind)
}
# check
bothid <- cbind(id, newid)
......
setwd("~/Documents/project_SEACONNECT/seaConnect--dataPrep/02-Bayescan/")
#----- BayeScan -----#
## evaluate model convergence and extract results
## script adapted from http://evomicsorg.wpengine.netdna-cdn.com/wp-content/uploads/2016/01/BayeScan_BayeScEnv_exercises.pdf
# load libraries
library(coda)
library(vcfR)
# set arguments
spCode <- "mul" # species
runNum <- 1 # run number
inputName <- paste0(spCode,"_bayesc_2pop_run",runNum) # 2pop or 3pop analysis (only for mullus)
#vcfPath <- "../01-SNPfilters/02-Mullus/mul_all_filtered.vcf"
#vcfPath <- "../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf"
## model convergence ####
chain1 <- read.table(paste0(inputName,"_ch1.sel"),header=TRUE)
#chain<-chain[-c(1)] # remove first column (step number)
# create a MCMC object with the correct thinning interval
chain1 <- mcmc(chain1,thin=10)
# Plot
plot(chain1)
# Summary statistics
summary(chain1)
# Test correlation between sampled parameters to verify that the sample is large enough
autocorr.diag(chain1)
# Effective size of the sample
effectiveSize(chain1)
# Statistic test for convergence
#### Geweke's convergence diagnistic
#The diagnostic reports the z- scores for each parameter.
#With ?? = 0.05, We reject H0 (equality of means => convergence) if z < -1.96 or z > +1.96.
geweke <- geweke.diag(chain1, frac1=0.1, frac2=0.5)
geweke
which(abs(geweke$z)>1.96) # 0 parameter does not converge
#### Heidelberg and Welch's test
heidel.diag(chain1, eps=0.1, pvalue=0.05) # all parameters passed
#### Test comparing two chains
chain2 <- read.table(paste0(inputName,"_ch2.sel"),header=TRUE)
#chain2<-chain2[-c(1)]
chain2 <- mcmc(chain2,thin=10)
plot(chain2)
combined = mcmc.list(chain1,chain2)
plot(combined)
gelman.diag(combined) # should be close to 1
gelman.plot(combined,ask)
## Bayescan results ####
## Functions defined in the script 'plot_R.r'in the BayeScan folder
results<-plot_bayescan(paste0(inputName,"_ch2_fst.txt"),FDR=0.05)
# detect outliers
outliers_bayescan <-results$outliers # position of outliers
results$nb_outliers #283 outliers for mullus, 126 for diplodus
# extract corresponding SNP names and positions
vcf <- read.vcfR(vcfPath, verbose=F)
loci <- as.data.frame(vcf@fix[,1:2])
outlier_loci <- loci[results$outliers,]
# output positions table
write.table(outlier_loci, file = paste0("outl_pos_bayesc_",spCode,"_run",runNum,".txt"), sep = "\t", quote = F, row.names = F, col.names = F)
print(paste0("outlier positions table exported to outl_pos_bayesc_", spCode,"_run", runNum,".txt"), quote = 0)
#----- BayeScan -----#
## produce Bayescan input file
# setwd("~/Documents/project_SEACONNECT/seaConnect--dataPrep/02-Bayescan/")
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
#args <- c("../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf","dip")
#args <- c("../01-SNPfilters/02-Mullus/mul_all_filtered.vcf","mul")
start <- Sys.time()
# load libraries
library(vcfR)
library(adegenet)
library(radiator)
# define arguments
inputVcf <- args[1]
speciesCode <- args[2]
outputBayeScan <- paste0(speciesCode,"_bayesc_input_2pop")
# import vcf file to genind
vcfFile <- read.vcfR(inputVcf, verbose = F)
!is.null(vcfFile)
genind <- vcfR2genind(vcfFile)
!is.null(genind)
# use DAPC groups as pop
grp <- find.clusters(genind, max.n.clust = 40, n.pca = round(nrow(genind$tab)/3))
2
grp$size
pop(genind) <- grp$grp
pop(genind) <- paste0("pop", pop(genind))
# convert to bayescan format
tidy_genind <- tidy_genomic_data(genind)
write_bayescan(tidy_genind, pop.select = levels(pop(genind)), filename = outputBayeScan)
end <- Sys.time()
print(paste("run ended at", end))
end - start
# This file is used to plot figures for the software Bayescan in R.
# This program, BayeScan, aims at detecting genetics markers under selection,
# based on allele frequency differences between population.
# Copyright (C) 2010 Matthieu Foll
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# Arguments:
# - file is the name of your file ex: "output_fst.txt"
# - the q-value threshold corresponding to the target False Discovery Rate (FDR)
# - size is the size of the points and text labels for outliers
# - pos is the distance between the points and the labels
# - highlight is a optional list of marker indices to display in red.
# - name_highlighted alows to write the indices of highlighted markers instead of using a point like the other markers
# - add_text adds the indices of the outlier markers
# Output:
# This function returns different paremeters in a list
# - outliers: the list of outliers
# - nb_outliers: the number of outliers
# Typical usage:
# - load this file into R (file/source R code)
# - in R, go to the directory where "output_fst.txt" is (file/change current dir)
# - at the R prompt, type
# > plot_bayescan("output_fst.txt",0,FDR=0.05)
# if you save the output in a variable, you can recall the different results:
# results<-plot_bayescan("output_fst.txt",0,FDR=0.05)
# results$outliers
# results$nb_outliers
#
# plotting posterior distribution is very easy in R with the output of BayeScan:
# first load the output file *.sel produced by BayeScan
# > mydata=read.table("bi.sel",colClasses="numeric")
# choose the parameter you want to plot by setting for example:
# parameter="Fst1"
# then this line will make the plot for:
# > plot(density(mydata[[parameter]]),xlab=parameter,main=paste(parameter,"posterior distribution"))
# you can plot population specific Fst coefficient by setting
# parameter="Fst1"
# if you have non-codominant data you can plot posterior for Fis coefficients in each population:
# parameter="Fis1"
# if you test for selection, you can plot the posterior for alpha coefficient for selection:
# parameter="alpha1"
# you also have access to the likelihood with:
# parameter="logL"
# if you have the package "boa" installed, you can very easily obtain Highest Probability
# Density Interval (HPDI) for your parameter of interest (example for the 95% interval):
# > boa.hpd(mydata[[parameter]],0.05)
plot_bayescan<-function(res,FDR=0.05,size=1,pos=0.35,highlight=NULL,name_highlighted=F,add_text=T)
{
if (is.character(res))
res=read.table(res)
colfstat=5
colq=colfstat-2
highlight_rows=which(is.element(as.numeric(row.names(res)),highlight))
non_highlight_rows=setdiff(1:nrow(res),highlight_rows)
outliers=as.integer(row.names(res[res[,colq]<=FDR,]))
ok_outliers=TRUE
if (sum(res[,colq]<=FDR)==0)
ok_outliers=FALSE;
res[res[,colq]<=0.0001,colq]=0.0001
# plot
plot(log10(res[,colq]),res[,colfstat],xlim=rev(range(log10(res[,colq]))),xlab="log10(q value)",ylab=names(res[colfstat]),type="n")
points(log10(res[non_highlight_rows,colq]),res[non_highlight_rows,colfstat],pch=19,cex=size)
if (name_highlighted) {
if (length(highlight_rows)>0) {
text(log10(res[highlight_rows,colq]),res[highlight_rows,colfstat],row.names(res[highlight_rows,]),col="red",cex=size*1.2,font=2)
}
}
else {
points(log10(res[highlight_rows,colq]),res[highlight_rows,colfstat],col="red",pch=19,cex=size)
# add names of loci over p and vertical line
if (ok_outliers & add_text) {
text(log10(res[res[,colq]<=FDR,][,colq])+pos*(round(runif(nrow(res[res[,colq]<=FDR,]),1,2))*2-3),res[res[,colq]<=FDR,][,colfstat],row.names(res[res[,colq]<=FDR,]),cex=size)
}
}
lines(c(log10(FDR),log10(FDR)),c(-1,1),lwd=2)
return(list("outliers"=outliers,"nb_outliers"=length(outliers)))
}
#!/bin/bash
# first run, two chains per run
# PCAdapt detected between two and three populations for mullus so we run the model for both options
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
# for diplodus: 2 populations detected
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
# second run, two chains per run
# here we try with a higher prior odds
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
# third run
# try again with prior odds = 100, and a greater number of iterations
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
\ No newline at end of file
......@@ -9,11 +9,13 @@ args = commandArgs(trailingOnly=TRUE)
# load libraries
library(pcadapt)
library(qvalue)
library(vcfR)
# define arguments
path_to_file <- args[1]
speciesCode <- args[2]
numPC <- args[3]
# import vcf file to pcadapt format
dat <- read.pcadapt(path_to_file, type = "vcf")
......@@ -21,17 +23,23 @@ dat <- read.pcadapt(path_to_file, type = "vcf")
# choose number K of principal components
x <- pcadapt(input = dat, K = 20)
# set plot output file name
pdf(file=paste0("RPlots_", args[2],".pdf"))
pdf(file=paste0("RPlots_", args[2],"_Kselection.pdf"))
# scree plot of principal components
plot(x, option = "screeplot", K = 10)
# score plot of principal components
plot(x, option = "scores")
plot(x, option = "scores", i = 3, j = 4) # check PC axis 3 and 4
plot(x, option = "scores", i = 1, j = 3)
plot(x, option = "scores", i = 2, j = 3)
dev.off()
# compute outliers detection with K = 4
x <- pcadapt(dat, K = 4)
# compute outliers detection with K = 1 (# of PC to retain) for M. surmuletus.
# K = 1 for D. sargus
x <- pcadapt(dat, K = 1)
summary(x)
pdf(file=paste0("RPlots_", args[2],"_manhattan_qq_hist.pdf"))
# Manhattan plot
#jpeg(filename = paste0(args[2],"_manhattan_pcadpt",".jpeg"))
#jpeg(filename = paste0("Rplots_",args[2],"_manhattan_pcadpt",".jpeg"))
plot(x, option = "manhattan")
#dev.off()
# QQ plot
......@@ -42,13 +50,28 @@ hist(x$pvalues, xlab = "p-values", main = NULL, breaks = 50, col = "orange")
#dev.off()
# histogram of test statistic
plot(x, option = "stat.distribution")
dev.off()
# choose cutoff for outlier detection
# q-values & FDR
qval <- qvalue(x$pvalues)$qvalues
alphaQ <- 0.005
outliersQ <- which(qval < alphaQ)
length(outliersQ)
# Bonferroni correction
padj <- p.adjust(x$pvalues, method="bonferroni")
alpha <- 0.05
outliers <- which(padj < alpha)
print(paste(length(outliers), "outliers detected"),quote=0)
alphaB <- 0.05
outliersB <- which(padj < alphaB)
print(paste(length(outliersB), "outliers detected"),quote=0)
# compare
setdiff(outliersB, outliersQ)
setdiff(outliersQ, outliersB)
# keep the ones inferred from FDR & q-values
outliers <- outliersQ
# extract corresponding SNP names and positions
vcf <- read.vcfR(path_to_file, verbose=F)
......@@ -57,4 +80,4 @@ outlier_loci <- loci[outliers,]
# output positions table
write.table(outlier_loci, file = paste0("outl_pos_pcadpt_",speciesCode,".txt"), sep = "\t", quote = F, row.names = F, col.names = F)
print(paste0("outlier positions table exported to outl_pos_pcadpt_",speciesCode), quote = 0)
print(paste0(length(outliers), " outlier positions exported to outl_pos_pcadpt_",speciesCode), quote = 0)
......@@ -4,28 +4,20 @@
inputFile=$1
spCode=$2
runNum=$3
#### adaptive SNPs ####
## step 1: create a final list of all outlier loci positions
cat ../02-Bayescan/outl_pos_bayesc_"$spCode"_"$runNum".txt ../03-PCAdapt/outl_pos_pcadpt_"$spCode".txt > "$spCode"_outl_pos_combi.txt
# how many are detected by both methods?
nPosOutDup=`sort "$spCode"_outl_pos_combi.txt | uniq -d | wc -l`
echo ""$nPosOutDup" outlier loci detected with both methods"
# step 1: rename list of outlier positions
cp ../03-PCAdapt/outl_pos_pcadpt_"$spCode".txt "$spCode"_outl_pos.txt
# extract total number of unique outlier positions (for file naming)
nPosOut=`sort "$spCode"_outl_pos_combi.txt | uniq -u | wc -l`
nPosOut=`sort "$spCode"_outl_pos.txt | uniq -u | wc -l`
echo ""$nPosOut" unique outlier loci in total"
# remove leading whitespaces (from here: https://www.cyberciti.biz/faq/bash-remove-whitespace-from-string/)
shopt -s extglob # turn it on
nPosOut="${nPosOut##*( )}"
shopt -u extglob # turn it off
# extract file with unique positions
sort "$spCode"_outl_pos_combi.txt | uniq -u > "$spCode"_outl_pos_"$nPosOut".txt
#rm "$spCode"_outl_pos_combi.txt
# add number of outliers to file name
mv "$spCode"_outl_pos.txt "$spCode"_outl_pos_"$nPosOut".txt
## step 2: subset filtered vcf file by outlier positions
......
......@@ -39,44 +39,6 @@ bash filtering.sh ../00-rawData/02-Mullus/mullus.vcf 02-Mullus mul
The final step consists of removing individuals with (extreme) outlier heterozygosity values. Here we define outliers as falling outside 6 * interquartile range.
This theshold value is set in accordance to our data and it is advised to look at the outputted figures to validate this choice for your data.
### SNP filtering results for Mullus surmuletus
| filtering step | filter for | individuals retained | SNPs retained | run time (sec) | output |
| -------------- | ------------------------------------------------- | :------------------: | :-----------: | :------------: | ------ |
| step 0 | ddocent output data | 526 | | | mullus.vcf
| step 1 | call below 50%, mac < 3, min quality score = 30 | 526 | 549881 | 71.00 | mullus.g5mac3.recode.vcf
| step 2 | min mean depth genotype call = 3 | 482 | 549881 | 74.00 | mullus.g5mac3dp3.recode.vcf
| step 3 | individuals > 50% missing data | 482 | 549881 | 70.00 | mullus.g5mac3dplm.recode.vcf
| step 4 | sites > 5% missing data, maf 0.05, min meanDP = 5 | 482 | 3657 | 14.00 | DP3g95maf05.recode.vcf
| step 5 | filter for allele balance | 482 | 3421 | | DP3g95maf05.fil1.vcf
| step 6 | filter out sites with reads from both strands | 482 | **SKIP** | |
| step 7 | ration mapping qualities ref vs alternate alleles | 482 | 3284 | | DP3g95maf05.fil3.vcf
| step 8 | paired status | 482 | 3284 | | DP3g95maf05.fil4.vcf
| step 9 | remove sites quality score < 1/4 depth | 482 | 3263 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 482 | 2857 | |
| step 11 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 482 | 2794 | 25 min | DP3g95maf05.FIL.HFis.recode.vcf
| step 12 | remove extreme outliers individual O HET | 482 | 2794 | 23.00 | DP3g95maf05.FIL.HFis.indHet.recode.vcf
| step 13 | rename | | | | mul_all_filtered_origid.vcf
### SNP filtering results for Diplodus sargus
| filtering step | filter for | individuals retained | SNPs retained | run time (sec) | output |
| -------------- | ------------------------------------------------- | :------------------: | :-----------: | :------------: | ------ |
| step 0 | ddocent output data | 297 | 13362 | | sar_ddocent_.vcf
| step 1 | call below 50%, mac < 3, min quality score = 30 | 297 | 13362 | 14.00 | g5mac3.recode.vcf
| step 2 | min mean depth genotype call = 3 | 297 | 13362 | 14.00 | g5mac3dp3.recode.vcf
| step 3 | individuals > 50% missing data | 297 | 13362 | 16.00 | g5mac3dplm.recode.vcf
| step 4 | sites > 5% missing data, maf 0.05, min meanDP = 5 | 297 | 10389 | 11.00 | DP3g95maf05.recode.vcf
| step 5 | filter for allele balance | 297 | 10202 | | DP3g95maf05.fil1.vcf
| step 6 | filter out sites with reads from both strands | | **SKIP** | | SKIP
| step 7 | ration mapping qualities ref vs alternate alleles | 297 | 9689 | | DP3g95maf05.fil3.vcf
| step 8 | paired status | 297 | 9689 | | DP3g95maf05.fil4.vcf
| step 9 | remove sites quality score < 1/4 depth | 297 | 9688 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 297 | 8325 | 11.00 |
| step 11 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 297 | 8206 | 27 min | DP3g95maf05.FIL.HFis.recode.vcf
| step 12 | remove extreme outliers individual O HET | 297 | 8206 | | DP3g95maf05.FIL.HFis.indHet.recode.vcf
| step 13 | rename | | | | dip_all_filtered_origid.vcf
### rename individuals for conventional naming system and create population map
this is adapted for our case, review R script or skip according to naming needs
```
......@@ -89,7 +51,7 @@ output:
- mul_all_filtered.vcf
- mul_population_map_467ind.txt
## 03-PCAdapt
## 02-PCAdapt
detect outliers with [PCAdapt](https://bcm-uga.github.io/pcadapt/articles/pcadapt.html) in
R and retrieve loci positions from vcf file
......@@ -123,7 +85,7 @@ wc -l outl_pos_pcadpt_dip.txt
```
413 outliers detected for diplodus
## 04-finalPrep
## 03-finalPrep
Final steps to get neutral and adaptive SNP sets and correct file formats for both species.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment