filterstep_indHetO_9IQR.R 1.56 KB
Newer Older
eboulanger's avatar
eboulanger committed
1
2
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
eboulanger's avatar
eboulanger committed
3
library(gridExtra)
eboulanger's avatar
eboulanger committed
4
5
6
7
8
9

start <- Sys.time()
print(paste("run started at",start))

# Read homozygosity file
dat <- read.csv(args[1], sep = "\t")
eboulanger's avatar
eboulanger committed
10
11
# dat <- read.csv("~/Documents/project_SEACONNECT/seaConnect--dataPrep/01-SNPfilters/02-Mullus/DP3g95maf05.FIL.HFis.indHo.csv", sep = "\t")
# dat <- read.csv("~/Documents/project_SEACONNECT/seaConnect--dataPrep/01-SNPfilters/01-Diplodus/DP3g95maf05.FIL.HFis.indHo.csv", sep = "\t")
eboulanger's avatar
eboulanger committed
12
13
14
15
16
17
# calculate heterozygosity
dat$O.HET. <- dat$N_SITES - dat$O.HOM.

# in interactive mode: visualise heterozygosity to determine 
# - if individuals need to be removed
# - adapted boxplot range
eboulanger's avatar
eboulanger committed
18
19
20
21
22
23
24
25
26
27
28
pdf(file="plot_indv_HETo.pdf")
   plot(dat$O.HET., main = "observed heterozygosity per individual")
   boxplot(dat$O.HET., main = "observed heterozygosity per individual")
   boxplot(dat$O.HET., range = 6, main = "o het with interquartal range = 6")

# extract extreme individuals
#OutVals <- boxplot(dat$O.HET., range = 9)$out
OutVals <- boxplot(dat$O.HET., range = 6)$out
plot.new()
grid.table(dat[dat$O.HET. %in% OutVals,])
dev.off()
eboulanger's avatar
eboulanger committed
29
30
31
32
33
34
35
36
37

OutVals.ind <- as.character(dat$INDV[which(dat$O.HET. %in% OutVals)])
print(paste("# Individuals to remove =", length(OutVals.ind)))
print(cbind(OutVals.ind, OutVals))

indToKeep <- setdiff(as.character(dat$INDV),OutVals.ind)
print(paste("# Individuals to keep =", length(indToKeep)))

# output individuals list
eboulanger's avatar
eboulanger committed
38
write.table(indToKeep, args[3], sep = "\t", quote = F, row.names = F, col.names = F)
eboulanger's avatar
eboulanger committed
39
40
41
42
43

# duration of script run
end <- Sys.time()
print(paste("run ended at", end))
end - start