Commit ddaf119e authored by eboulanger's avatar eboulanger
Browse files

script update

parent fdb2decf
......@@ -15,6 +15,25 @@
| step 9 | remove sites quality score < 1/4 depth | | 17546 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 424 | 15466 | |
| step 11 | HWE 0.000001 | 424 | 14532 | 26.00 | DP3g95maf05.FIL.hwe.recode.vcf
| step 12 | He | | | |
| step 13 | Fis | | | |
| step 14 | rename | | | | mullus_all_filtered.vcf
\ No newline at end of file
| step 12 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 424 | | |
| step 13 | rename | | | | mullus_all_filtered.vcf
# SNP filtering results for Diplodus sargus
### ddocent pipeline + additions
| filtering step | filter for | individuals retained | SNPs retained | run time (sec) | output |
| -------------- | ------------------------------------------------- | :------------------: | :-----------: | :------------: | ------ |
| step 0 | ddocent output data | 297 | 13362 | | diplodus.vcf
| step 1 | call below 50%, mac < 3, min quality score = 30 | 297 | 13362 | 14.00 | diplodus.g5mac3.recode.vcf
| step 2 | min mean depth genotype call = 3 | 297 | 13362 | 14.00 | diplodus.g5mac3dp3.recode.vcf
| step 3 | individuals > 50% missing data | 297 | 13362 | 16.00 | diplodus.g5mac3dplm.recode.vcf
| step 4 | sites > 5% missing data, maf 0.05, min meanDP = 5 | 297 | 10342 | 11.00 | DP3g95maf05.recode.vcf
| step 5 | filter for allele balance | 297 | 10155 | | DP3g95maf05.fil1.vcf
| step 6 | filter out sites with reads from both strands | 297 | 9899 | |
| step 7 | ration mapping qualities ref vs alternate alleles | 297 | 9399 | | DP3g95maf05.fil3.vcf
| step 8 | paired status | 297 | 9399 | | DP3g95maf05.fil4.vcf
| step 9 | remove sites quality score < 1/4 depth | 297 | 9398 | | DP3g95maf05.fil5.vcf
| step 10 | depth x quality score cutoff | 297 | 8077 |11.00 |
| step 11 | HWE 0.000001 | 297 | 7761 | 7.00 | DP3g95maf05.FIL.hwe.recode.vcf
| step 12 | He > 0.6 & Fis > 0.5 & Fix < -0.5 | 297 | | |
| step 13 | rename | | | | diplodus_all_filtered.vcf
\ No newline at end of file
......@@ -86,7 +86,7 @@ awk '!/#/' DP3g95maf05.fil4.vcf | wc -l
vcffilter -f "QUAL / DP > 0.25" DP3g95maf05.fil4.vcf > DP3g95maf05.fil5.vcf
awk '!/#/' DP3g95maf05.fil5.vcf | wc -l
# filter 10 : consists of multiple steps
# step 10 : consists of multiple steps
## 1 : create list of depth of each locus
cut -f8 DP3g95maf05.fil5.vcf | grep -oe "DP=[0-9]*" | sed 's/DP=//g' > DP3g95maf05.fil5.DEPTH
## 2 : create list of quality scores
......@@ -118,31 +118,23 @@ set xtics 5
plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF
## 8 : combine all to get new vcf
vcftools --vcf DP3g95maf05.fil5.vcf --recode-INFO-all --out DP3g95maf05.FIL --max-meanDP 102.5 --exclude-positions DP3g95maf05.fil5.lowQDloci --recode
# > After filtering, kept 297 out of 297 Individuals
# > Outputting VCF file...
# > After filtering, kept 5722 out of a possible 8871 Sites
# > Run Time = 6.00 seconds
# step 10 : HWE
# step 11 : HWE
# recommends by pop but since I have mostly one large pop: apply to all
vcftools --vcf DP3g95maf05.FIL.recode.vcf --hwe 0.000001 --recode --recode-INFO-all --out DP3g95maf05.FIL.hwe
# step 11 : He
# step 12 : remove SNPs for which Ho > 0.6 and Fis < -0.5 or Fis > 0.5
# find to-remove (/ to-keep) SNP positions in R using package Hierfstat
TO DO
# step 12 : Fis
TO DO
# step 3 : outliers
TO DO
Rscript --vanilla ../filterstep_Ho6_Fis5.R DP3g95maf05.FIL.hwe.recode.vcf positions_HoFis_dip.txt
vcftools --vcf DP3g95maf05.FIL.hwe.recode.vcf --positions positions_HoFis_dip.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.hwe.HFis
# step 13 : rename final filtered dataset
cp DP3g95maf05.FIL.hwe.recode.vcf mullus_all_filtered.vcf
# step 14 : outliers
......@@ -6,6 +6,7 @@ cd ~/Documents/project_SEACONNECT/seaConnect--dataPrep/01-SNPfilters/
mkdir 02-Mullus
cd 02-Mullus
FILE=../../00-rawData/02-Mullus/mullus.vcf
PATH=$PATH:~/programmes/vcflib/bin/
......@@ -52,7 +53,7 @@ vcftools --vcf mullus.g5mac3dp3.recode.vcf --remove lowDP.indv --recode --recode
# step 4 : restrict data to variants called in a high percentage of individuals and filter by mean depth of genotypes
vcftools --vcf mullus.g5mac3dplm.recode.vcf --max-missing 0.95 --maf 0.05 --recode --recode-INFO-all --out DP3g95maf05 --min-meanDP 5
vcftools --vcf mullus.g5mac3dplm.recode.vcf --max-missing 0.95 --maf 0.05 --recode --recode-INFO-all --out DP3g95maf05 --min-meanDP 10
###
# FreeBayes - specific filters
......@@ -72,8 +73,8 @@ awk '!/#/' DP3g95maf05.fil1.vcf | wc -l
# step 6 : filter out sites that have reads from both strands
vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 100" -s DP3g95maf05.fil1.vcf > DP3g95maf05.fil2.vcf
awk '!/#/' DP3g95maf05.fil2.vcf | wc -l
# SKIPPED STEP 6 FOR NOW
# step 7 : look at ration of mapping qualities between reference and alternate alleles
vcffilter -f "MQM / MQMR > 0.9 & MQM / MQMR < 1.05" DP3g95maf05.fil1.vcf > DP3g95maf05.fil3.vcf
awk '!/#/' DP3g95maf05.fil3.vcf | wc -l
......@@ -87,7 +88,7 @@ awk '!/#/' DP3g95maf05.fil4.vcf | wc -l
vcffilter -f "QUAL / DP > 0.25" DP3g95maf05.fil4.vcf > DP3g95maf05.fil5.vcf
awk '!/#/' DP3g95maf05.fil5.vcf | wc -l
# filter 10 : consists of multiple steps
# step 10 : consists of multiple steps
## 1 : create list of depth of each locus
cut -f8 DP3g95maf05.fil5.vcf | grep -oe "DP=[0-9]*" | sed 's/DP=//g' > DP3g95maf05.fil5.DEPTH
## 2 : create list of quality scores
......@@ -119,34 +120,23 @@ set xtics 5
plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF
## 8 : combine all to get new vcf
vcftools --vcf DP3g95maf05.fil5.vcf --recode-INFO-all --out DP3g95maf05.FIL --max-meanDP 102.5 --exclude-positions DP3g95maf05.fil5.lowQDloci --recode
# > After filtering, kept 297 out of 297 Individuals
# > Outputting VCF file...
# > After filtering, kept 5722 out of a possible 8871 Sites
# > Run Time = 6.00 seconds
# step 10 : HWE
# step 11 : HWE
# recommends by pop but since I have mostly one large pop: apply to all
vcftools --vcf DP3g95maf05.FIL.recode.vcf --hwe 0.000001 --recode --recode-INFO-all --out DP3g95maf05.FIL.hwe
# step 11 : He
TO DO
# step 12 : Fis
TO DO
# step 13 : outliers
TO DO
# step 12 : remove SNPs for which Ho > 0.6 and Fis < -0.5 or Fis > 0.5
# find to-remove (/ to-keep) SNP positions in R using package Hierfstat
# step 14 : rename final filtered dataset
Rscript --vanilla ../filterstep_Ho6_Fis5.R DP3g95maf05.FIL.hwe.recode.vcf positions_HoFis_mul.txt
vcftools --vcf DP3g95maf05.FIL.hwe.recode.vcf --positions positions_HoFis_mul.txt --recode --recode-INFO-all --out DP3g95maf05.FIL.hwe.HFis
cp DP3g95maf05.FIL.hwe.recode.vcf mullus_all_filtered.vcf
# step 13 : rename final filtered dataset
cp DP3g95maf05.FIL.hwe.HFis.recode.vcf mullus_all_filtered.vcf
# step 14 : outliers
......@@ -14,5 +14,5 @@ step 8 paired status 17546 DP3g95maf05.fil4.vcf
step 9 remove sites quality score < 1/4 depth DP3g95maf05.fil5.vcf
step 10 depth x quality score cutoff
step 11 HWE 0.000001
step 12 He
step 13 Fis
\ No newline at end of file
step 12 He & Fis 5063 191 DP3g95maf05.FIL.hwe.Hfis.recode.vcf
step 13 rename mullus_all_filtered.vcf
\ No newline at end of file
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# load libraries
library(adegenet)
library(polysat)
library(pegas)
library(vcfR)
library(hierfstat)
start <- Sys.time()
print(paste("run started at",start))
# Read vcf
dat <- read.vcfR(args[1], verbose=F)
# Convert to genind format
genind <- vcfR2genind(dat)
# Assign a sampling site to each indiv
ind <- rownames(genind@tab)
pop <-sub("(.*?)_.*", "\\1", ind)
pop(genind) <- pop
#### Filter SNPs
hf <- genind2hierfstat(genind)
stats <- basic.stats(hf)
# Filter SNPs for Ho and Fis
loc_H.6<- stats$perloc[which(stats$perloc$Ho<0.6),]
loc_H.6_Fis.5 <- loc_H.6[which(loc_H.6$Fis<0.5 & loc_H.6>-0.5),]
locnames<-rownames(loc_H.6_Fis.5)
print(paste("# SNPs to keep =", length(locnames)))
# extract loci to remove from the dataset
toremove <-which(locNames(genind) %in% locnames ==F)
print(paste("# SNPs to remove =", length(toremove)))
genind_filter<-genind[loc=-toremove]
posi_id <- levels(genind_filter@loc.fac)
# get positions of loci to keep
# needed to subset vcf file later
positions <- data.frame(row.names = 1:length(posi_id))
for (i in 1:length(posi_id)) {
positions[i,1] <- paste(strsplit(posi_id, "_")[[i]][1:2], collapse = "_")
}
for (i in 1:length(posi_id)) {
positions[i,2] <- strsplit(posi_id, "_")[[i]][3]
}
# output positions table
write.table(positions, file = args[2], sep = "\t", quote = F, row.names = F, col.names = F)
# duration of script run
end <- Sys.time()
print(paste("run ended at", end))
end - start
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment