Commit 59a1f911 authored by eboulanger's avatar eboulanger
Browse files

update and add scripts

parent 22131a46
*.txt
*.pdf
*.jpeg
*.tsv
*.sel
*.rad
.DS_Store
.Rhistory
00-rawData
01-SNPfilters/01-Diplodus/
01-SNPfilters/02-Mullus/
01-SNPfilters/filtering_dip.sh
01-SNPfilters/filtering_mul.sh
04-finalPrep/01-Diplodus/
04-finalPrep/02-Mullus/
......@@ -67,7 +67,7 @@ awk '/#/' DP3g95maf05.recode.vcf
# population level: vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" DP3g95p5maf05.recode.vcf > DP3g95p5maf05.fil1.vcf
vcffilter -s -f "AB > 0.25 & AB < 0.75 | AB < 0.01" DP3g95maf05.recode.vcf > DP3g95maf05.fil1.vcf
# check how many were removed
awk '!/#/' DP3g95maf05.recode.vcf | wc -l
DP3g95maf05.recode.vcf | wc -l
awk '!/#/' DP3g95maf05.fil1.vcf | wc -l
# step 6 : filter out sites that have reads from both strands
......@@ -129,7 +129,21 @@ vcftools --vcf DP3g95maf05.fil5.vcf --recode-INFO-all --out DP3g95maf05.FIL --m
Rscript --vanilla ../filterstep_Ho6_Fis5.R DP3g95maf05.FIL.recode.vcf positions_HoFis_$SPCODE.txt
vcftools --vcf DP3g95maf05.FIL.recode.vcf --positions positions_HoFis_$SPCODE.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.HFis
# step 12 : remove Individuals with extreme-outlier observed heterozygosity values
# 1- calculate individual homozygosity with vcftools
# 2- calculate individual heterozygosity in R
# 3- inspect values and keep only individuals with non-extreme values
# extreme values are defined as those that fall outside of 9 times the interquartal range. (typical outliers are defined as 1.5IQR) this was set arbitrarily after visual inspection, and to allows the same criteria to be applied to both species
# calculate individual homozygosity
vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --het --out DP3g95maf05.FIL.HFis.indHo
cp DP3g95maf05.FIL.HFis.indHo.het DP3g95maf05.FIL.HFis.indHo.csv
# calculate individual heterozygosity, asses extreme outliers and find individuals to keep
Rscript --vanilla ../filterstep_indHetO_9IQR.R DP3g95maf05.FIL.HFis.indHo.csv indv_HETo_9IQR.txt
# remove individuals
vcftools --vcf DP3g95maf05.FIL.HFis.recode.vcf --keep indv_HETo_9IQR.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.HFis.indHet
# step 13 : rename final filtered dataset
cp DP3g95maf05.FIL.HFis.indHet.recode.vcf "$SPCODE"_all_filtered_origid.vcf
cp DP3g95maf05.FIL.HFis.recode.vcf "$SPCODE"_all_filtered.vcf
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
start <- Sys.time()
print(paste("run started at",start))
# Read homozygosity file
dat <- read.csv(args[1], sep = "\t")
# calculate heterozygosity
dat$O.HET. <- dat$N_SITES - dat$O.HOM.
# in interactive mode: visualise heterozygosity to determine
# - if individuals need to be removed
# - adapted boxplot range
# plot(dat$O.HET.)
# boxplot(dat$O.HET.)
# boxplot(dat$O.HET., range = 9)
# extract non-extreme individuals
OutVals <- boxplot(dat$O.HET., range = 9)$out
OutVals.ind <- as.character(dat$INDV[which(dat$O.HET. %in% OutVals)])
print(paste("# Individuals to remove =", length(OutVals.ind)))
print(cbind(OutVals.ind, OutVals))
indToKeep <- setdiff(as.character(dat$INDV),OutVals.ind)
print(paste("# Individuals to keep =", length(indToKeep)))
# output individuals list
write.table(indToKeep, args[2], sep = "\t", quote = F, row.names = F, col.names = F)
# duration of script run
end <- Sys.time()
print(paste("run ended at", end))
end - start
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# read id file
id <- read.table(paste0("id_",args,".txt"), stringsAsFactors=F)
# correct names
if(args[1] == "mul") {
# mullus
cell <- gsub("_.*","", id$V1)
cell <- paste0("C", cell)
ind <- gsub(".*_","", id$V1)
ind <- gsub("E.*", "", ind)
newid <- paste0(cell,"i", ind)
} else {
# dip
cell <- gsub("_.*","", id$V1)
ind <- gsub(".*i","", id$V1)
ind <- gsub("E.*", "", ind)
newid <- paste0(cell, "i", ind)
}
# check
bothid <- cbind(id, newid)
print(bothid)
# export
#write.table(newid, file = paste0("new_id_", args,".txt"), quote = F, row.names = F, col.names = F)
write.table(bothid, file = paste0("new_id_", args,".txt"), sep = " ", quote = F, row.names = F, col.names = F)
# create popmap
popmap <- cbind(newid, cell)
colnames(popmap) <- c("INDIVIDUALS", "STRATA")
print(popmap)
nind <- nrow(popmap)
write.table(popmap, file = paste0(args,"_population_map_",nind,"ind.txt"), sep = "\t", quote = F, row.names = F)
# define arguments
FOLDER=$1
SPCODE=$2
cd $FOLDER
# extract sample / individual IDs
bcftools query -l "$SPCODE"_all_filtered_origid.vcf > id_"$SPCODE".txt
# rename in R and create population map (necessary for PGDSpider conversions later on)
Rscript --vanilla ../rename_id.R "$SPCODE"
# replace sample names in vcf file
bcftools reheader "$SPCODE"_all_filtered_origid.vcf --samples new_id_"$SPCODE".txt > "$SPCODE"_all_filtered.vcf
\ No newline at end of file
......@@ -8,11 +8,11 @@ library(coda)
library(vcfR)
# set arguments
spCode <- "dip" # species
spCode <- "mul" # species
runNum <- 1 # run number
inputName <- paste0(spCode,"_bayesc_2pop_run",runNum) # 2pop or 3pop analysis (only for mullus)
#vcfPath <- "../01-SNPfilters/02-Mullus/mul_all_filtered.vcf"
vcfPath <- "../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf"
#vcfPath <- "../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf"
## model convergence ####
chain1 <- read.table(paste0(inputName,"_ch1.sel"),header=TRUE)
......@@ -53,11 +53,11 @@ gelman.plot(combined,ask)
## Bayescan results ####
## Functions defined in the script 'plot_R.r'in the BayeScan folder
results<-plot_bayescan(paste0(inputName,"_ch1_fst.txt"),FDR=0.05)
results<-plot_bayescan(paste0(inputName,"_ch2_fst.txt"),FDR=0.05)
# detect outliers
outliers_bayescan <-results$outliers # position of outliers
results$nb_outliers #68 outliers for mullus, 126 for diplodus
results$nb_outliers #283 outliers for mullus, 126 for diplodus
# extract corresponding SNP names and positions
vcf <- read.vcfR(vcfPath, verbose=F)
......@@ -65,7 +65,7 @@ loci <- as.data.frame(vcf@fix[,1:2])
outlier_loci <- loci[results$outliers,]
# output positions table
write.table(outlier_loci, file = paste0("outl_pos_bayesc_",spCode,".txt"), sep = "\t", quote = F, row.names = F, col.names = F)
print(paste0("outlier positions table exported to outl_pos_bayesc_", spCode,".txt"), quote = 0)
write.table(outlier_loci, file = paste0("outl_pos_bayesc_",spCode,"_run",runNum,".txt"), sep = "\t", quote = F, row.names = F, col.names = F)
print(paste0("outlier positions table exported to outl_pos_bayesc_", spCode,"_run", runNum,".txt"), quote = 0)
#----- BayeScan -----#
## produce Bayescan input file
# setwd("~/Documents/project_SEACONNECT/seaConnect--dataPrep/02-Bayescan/")
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
......@@ -16,7 +17,7 @@ library(radiator)
# define arguments
inputVcf <- args[1]
speciesCode <- args[2]
outputBayeScan <- paste0(speciesCode,"_bayesc_input_3pop")
outputBayeScan <- paste0(speciesCode,"_bayesc_input_2pop")
# import vcf file to genind
vcfFile <- read.vcfR(inputVcf, verbose = F)
......@@ -25,14 +26,15 @@ genind <- vcfR2genind(vcfFile)
!is.null(genind)
# use DAPC groups as pop
grp <- find.clusters(genind, max.n.clust = 40)
400
grp <- find.clusters(genind, max.n.clust = 40, n.pca = round(nrow(genind$tab)/3))
2
grp$size
pop(genind) <- grp$grp
pop(genind) <- paste0("pop", pop(genind))
# convert to bayescan format
genomic_converter(genind, output="bayescan", filename = outputBayeScan)
tidy_genind <- tidy_genomic_data(genind)
write_bayescan(tidy_genind, pop.select = levels(pop(genind)), filename = outputBayeScan)
end <- Sys.time()
print(paste("run ended at", end))
......
......@@ -2,25 +2,26 @@
# first run, two chains per run
# PCAdapt detected between two and three populations for mullus so we run the model for both options
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
# for diplodus: 2 populations detected
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run1_ch1 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run1_ch2 -n 5000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
# second run, two chains per run
# here we try with a higher prior odds
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_2pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 10000
# third run
......@@ -29,8 +30,8 @@ BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run2_ch2 -n
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits mul_bayesc_input_2pop.txt -o mul_bayesc_2pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
\ No newline at end of file
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits mul_bayesc_input_3pop.txt -o mul_bayesc_3pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run3_ch1 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
#BayeScan2.1_macos64bits dip_bayesc_input_2pop.txt -o dip_bayesc_2pop_run3_ch2 -n 10000 -thin 10 -nbp 20 -pilot 5000 -burn 50000 -pr_odds 100
\ No newline at end of file
......@@ -2,6 +2,7 @@
# vignette: https://cran.r-project.org/web/packages/pcadapt/vignettes/pcadapt.html
#!/usr/bin/env Rscript
#setwd("/Users/eboulanger-admin/Documents/project_SEACONNECT/seaConnect--dataPrep/03-PCAdapt")
args = commandArgs(trailingOnly=TRUE)
#args <- c("../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf", "dip")
#args <- c("../01-SNPfilters/02-Mullus/mul_all_filtered.vcf", "mul")
......
......@@ -4,12 +4,13 @@
inputFile=$1
spCode=$2
runNum=$3
#### adaptive SNPs ####
## step 1: create a final list of all outlier loci positions
cat ../02-Bayescan/outl_pos_bayesc_"$spCode".txt ../03-PCAdapt/outl_pos_pcadpt_"$spCode".txt > "$spCode"_outl_pos_combi.txt
cat ../02-Bayescan/outl_pos_bayesc_"$spCode"_"$runNum".txt ../03-PCAdapt/outl_pos_pcadpt_"$spCode".txt > "$spCode"_outl_pos_combi.txt
# how many are detected by both methods?
nPosOutDup=`sort "$spCode"_outl_pos_combi.txt | uniq -d | wc -l`
......@@ -33,15 +34,17 @@ mv "$spCode"_adaptive_"$nPosOut".recode.vcf "$spCode"_adaptive_"$nPosOut".vcf
## step 3 : convert vcf files to necessary formats for downstream analyses
# convert .vcf files to .bep files for ADMIXTURE
# .vcf to .tped
vcftools --vcf "$spCode"_adaptive_"$nPosOut".vcf --plink-tped --out "$spCode"_adaptive_"$nPosOut"
# .tped to .bed
plink --tped "$spCode"_adaptive_"$nPosOut".tped --tfam "$spCode"_adaptive_"$nPosOut".tfam --make-bed --out "$spCode"_adaptive_"$nPosOut" --noweb
# convert .vcf files to PLINK 1 binary files
# writes .bed .bim and .fam files (ADMIXTURE will need .bed file)
plink --vcf "$spCode"_adaptive_"$nPosOut".vcf --out "$spCode"_adaptive_"$nPosOut" --allow-extra-chr --vcf-half-call missing
# convert to minor allele frequencies for RDA and other analyses
# .tped to .raw
plink --tped "$spCode"_adaptive_"$nPosOut".tped --tfam "$spCode"_adaptive_"$nPosOut".tfam --recodeA --out "$spCode"_adaptive_"$nPosOut" --noweb
# writes .raw file
plink --bfile "$spCode"_adaptive_"$nPosOut" --recodeA --out "$spCode"_adaptive_"$nPosOut" --allow-extra-chr
# convert to STRUCTURE format for assignPop
# writes .strct_in file
plink --bfile "$spCode"_adaptive_"$nPosOut" --recode structure --out "$spCode"_adaptive_"$nPosOut" --allow-extra-chr
#### neutral SNPs ####
......@@ -64,12 +67,36 @@ mv "$spCode"_neutral.log "$spCode"_neutral_"$nPosNtrl".log
## step 4 : convert vcf files to necessary formats for downstream analyses
# convert .vcf files to .bep files for ADMIXTURE
# .vcf to .tped
vcftools --vcf "$spCode"_neutral_"$nPosNtrl".vcf --plink-tped --out "$spCode"_neutral_"$nPosNtrl"
# .tped to .bed
plink --tped "$spCode"_neutral_"$nPosNtrl".tped --tfam "$spCode"_neutral_"$nPosNtrl".tfam --make-bed --out "$spCode"_neutral_"$nPosNtrl" --noweb
# convert .vcf files to PLINK 1 binary files
# writes .bed .bim and .fam files (ADMIXTURE will need .bed file)
plink --vcf "$spCode"_neutral_"$nPosNtrl".vcf --out "$spCode"_neutral_"$nPosNtrl" --allow-extra-chr --vcf-half-call missing
# convert to minor allele frequencies for RDA and other analyses
# writes .raw file
plink --bfile "$spCode"_neutral_"$nPosNtrl" --recodeA --out "$spCode"_neutral_"$nPosNtrl" --allow-extra-chr
# convert to STRUCTURE format for assignPop
# writes .strct_in file
plink --bfile "$spCode"_neutral_"$nPosNtrl" --recode structure --out "$spCode"_neutral_"$nPosNtrl" --allow-extra-chr
# convert to genepop format for use of GENODIVE (to calculate kinship)
# use PGDSpider GUI
# input: neutral.vcf and population map, output: neutral.gen.txt
# outputted .gen.txt file: add information on first line (otherwise genodive won't recognise format)
#### all SNPs ####
## convert vcf files to necessary formats for downstream analyses
# convert .vcf files to PLINK 1 binary files
# writes .bed .bim and .fam files (ADMIXTURE will need .bed file)
plink --vcf "$inputFile" --out "$spCode"_all_filtered --allow-extra-chr --vcf-half-call missing
# convert to minor allele frequencies for RDA and other analyses
# .tped to .raw
plink --tped "$spCode"_neutral_"$nPosNtrl".tped --tfam "$spCode"_neutral_"$nPosNtrl".tfam --recodeA --out "$spCode"_neutral_"$nPosNtrl" --noweb
# writes .raw file
plink --bfile "$spCode"_all_filtered --recodeA --out "$spCode"_all_filtered --allow-extra-chr
# convert to STRUCTURE format for assignPop
# writes .strct_in file
plink --bfile "$spCode"_all_filtered --recode structure --out "$spCode"_all_filtered --allow-extra-chr
......@@ -9,7 +9,7 @@ Scripts to prepare RADSeq data for analysis.
You will need to install the following software:
- [VCFtools](https://vcftools.github.io)
- [BCFtools](https://samtools.github.io/bcftools/)
- [PLINK](http://zzz.bwh.harvard.edu/plink/)
- [PLINK v1.9](https://www.cog-genomics.org/plink/1.9/)
You will need to have the following R packages:
......@@ -51,7 +51,7 @@ bash filtering.sh ../00-rawData/02-Mullus/mullus.vcf 02-Mullus mul
| step 10 | depth x quality score cutoff | 424 | 15466 | |
| step 11 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 424 | 15232 | 25 min | DP3g95maf05.FIL.HFis.recode.vcf
| step 12 | remove extreme outliers individual O HET | 413 | 15232 | 23.00 | DP3g95maf05.FIL.HFis.indHet.recode.vcf
| step 13 | rename | | | | mul_all_filtered.vcf
| step 13 | rename | | | | mul_all_filtered_origid.vcf
### SNP filtering results for Diplodus sargus
......@@ -69,12 +69,20 @@ bash filtering.sh ../00-rawData/02-Mullus/mullus.vcf 02-Mullus mul
| step 9 | remove sites quality score < 1/4 depth | 297 | 9688 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 297 | 8325 | 11.00 |
| step 11 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 297 | 8206 | 27 min | DP3g95maf05.FIL.HFis.recode.vcf
| step 12 | remove extreme outliers individual O HET | **to do** | | | DP3g95maf05.FIL.HFis.indHet.recode.vcf
| step 13 | rename | | | | dip_all_filtered.vcf
| step 12 | remove extreme outliers individual O HET | 297 | | | DP3g95maf05.FIL.HFis.indHet.recode.vcf
| step 13 | rename | | | | dip_all_filtered_origid.vcf
### manually rename individuals for conventional naming system
**to do**
### rename individuals for conventional naming system and create population map
this is adapted for our case, review R script or skip according to naming needs
```
bash renaming.sh 01-Diplodus/ dip
bash renaming.sh 02-Mullus/ mul
```
output:
- dip_all_filtered.vcf
- dip_popualtion_map_297ind.txt
- mul_all_filtered.vcf
- mul_population_map_413ind.txt
## 02-Bayescan
......@@ -140,7 +148,7 @@ see how many outliers were detected :
```
wc -l outl_pos_pcadpt_mul.txt
```
2632 outliers detected for mullus
1327 outliers detected for mullus
#### for diplodus :
```
......@@ -159,30 +167,46 @@ in 02-BayeScan and 03-PCAdapt.
It also subsets the same vcf file for the remaining neutral positions and applies a final
filter for HWE.
Finally, the script converts the final adaptive and neutral .vcf files in .tped, .tfam,
.bed and .raw format necessary for downstream analyses.r
Finally, the script converts the final adaptive and neutral .vcf files in .bed .bim .fam .raw
and .strct_in format necessary for downstream analyses.
The conversion to genepop format for use of GENODIVE (to calculate kinship) is done with
the PGDSpider GUI.
input: neutral.vcf and population map, output: neutral.gen.txt
outputted .gen.txt file: add information on first line (otherwise genodive won't recognise format)
set arguments:
To run the script, set arguments:
$1 = input file (vcf)
$2 = species code
$3 = bayescan run index
#### for diplodus
```
bash outlier_positions.sh ../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf dip run1
mkdir 01-Diplodus
cp ../01-SNPfilters/01-Diplodus/dip_population_map_*.txt 01-Diplodus/
cp ../01-SNPfilters/01-Diplodus/dip_all_filtered.vcf 01-Diplodus/
bash outlier_positions.sh 01-Diplodus/dip_all_filtered.vcf dip run1
```
In total, 494 outlier loci were detected, with 10 loci detected by both the BayeScan and PCAdapt method.
After HWE filter, 7570 neutral loci were retained.
#### for Mullus
```
bash outlier_positions.sh ../01-SNPfilters/02-Mullus/mul_all_filtered.vcf mul run1
mkdir 02-Mullus
cp ../01-SNPfilters/02-Mullus/mul_population_map_*.txt 02-Mullus/
cp ../01-SNPfilters/02-Mullus/mul_all_filtered.vcf 02-Mullus/
bash outlier_positions.sh 02-Mullus/mul_all_filtered.vcf mul run1
```
In total, 2680 adaptive loci were detected, with 10 loci detected by both the BayeScan and PCAdapt method.
After HWE filter, 12432 neutral loci were retained.
#### clean up files to separate directories
```
mkdir 01-Diplodus
mv dip_* 01-Diplodus/
mv mul_* 02-Mullus/
```
mkdir 02-Mullus
mv mul_* 02-Mullus
```
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment