outlier_positions.sh 4.11 KB
Newer Older
eboulanger's avatar
eboulanger committed
1
2
3
4
5
6
7
8
9
## Final steps to get neutral and adaptive SNP set and correct file formats

# define arguments

inputFile=$1
spCode=$2

#### adaptive SNPs ####

eboulanger's avatar
eboulanger committed
10
11
# step 1: rename list of outlier positions
cp ../03-PCAdapt/outl_pos_pcadpt_"$spCode".txt "$spCode"_outl_pos.txt
eboulanger's avatar
eboulanger committed
12
# extract total number of unique outlier positions (for file naming)
eboulanger's avatar
eboulanger committed
13
nPosOut=`sort "$spCode"_outl_pos.txt | uniq -u | wc -l` 
eboulanger's avatar
eboulanger committed
14
15
16
17
18
echo ""$nPosOut" unique outlier loci in total"
# remove leading whitespaces (from here: https://www.cyberciti.biz/faq/bash-remove-whitespace-from-string/)
shopt -s extglob # turn it on
nPosOut="${nPosOut##*( )}"
shopt -u extglob # turn it off
eboulanger's avatar
eboulanger committed
19
20
# add number of outliers to file name
mv "$spCode"_outl_pos.txt "$spCode"_outl_pos_"$nPosOut".txt
eboulanger's avatar
eboulanger committed
21
22
23
24
25
26
27
28

## step 2: subset filtered vcf file by outlier positions

vcftools --vcf "$inputFile" --positions "$spCode"_outl_pos_"$nPosOut".txt --recode --recode-INFO-all --out "$spCode"_adaptive_"$nPosOut"
mv "$spCode"_adaptive_"$nPosOut".recode.vcf "$spCode"_adaptive_"$nPosOut".vcf

## step 3 : convert vcf files to necessary formats for downstream analyses

eboulanger's avatar
eboulanger committed
29
30
31
# convert .vcf files to PLINK 1 binary files
# writes .bed .bim and .fam files (ADMIXTURE will need .bed file)
plink --vcf "$spCode"_adaptive_"$nPosOut".vcf --out "$spCode"_adaptive_"$nPosOut" --allow-extra-chr --vcf-half-call missing
eboulanger's avatar
eboulanger committed
32
33

# convert to minor allele frequencies for RDA and other analyses
eboulanger's avatar
eboulanger committed
34
35
36
37
38
39
# writes .raw file
plink --bfile "$spCode"_adaptive_"$nPosOut" --recodeA --out "$spCode"_adaptive_"$nPosOut" --allow-extra-chr

# convert to STRUCTURE format for assignPop 
# writes .strct_in file
plink --bfile "$spCode"_adaptive_"$nPosOut" --recode structure --out "$spCode"_adaptive_"$nPosOut" --allow-extra-chr
eboulanger's avatar
eboulanger committed
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61

#### neutral SNPs ####

## step 1: create a final list of all neutral loci positions

# get list of all positions original vcf file
grep -v "^##" "$inputFile" | cut -f1-2 | sed '1d' > "$spCode"_all_pos.txt

# remove previously identified outlier positions to only retain neutral ones
cat "$spCode"_all_pos.txt "$spCode"_outl_pos_"$nPosOut".txt | sort | uniq -u > "$spCode"_ntrl_pos_preHWE.txt

## step 2 : subset filtered vcf file by neutral outlier positions
vcftools --vcf "$inputFile" --positions "$spCode"_ntrl_pos_preHWE.txt --recode --recode-INFO-all --out "$spCode"_neutral_preHWE

## step 3 : apply HWE filter
vcftools --vcf "$spCode"_neutral_preHWE.recode.vcf --hwe 0.000001 --recode --recode-INFO-all --out "$spCode"_neutral
nPosNtrl=`grep  -c -v "^#" "$spCode"_neutral.recode.vcf`
mv "$spCode"_neutral.recode.vcf "$spCode"_neutral_"$nPosNtrl".vcf
mv "$spCode"_neutral.log "$spCode"_neutral_"$nPosNtrl".log

## step 4 : convert vcf files to necessary formats for downstream analyses

eboulanger's avatar
eboulanger committed
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# convert .vcf files to PLINK 1 binary files
# writes .bed .bim and .fam files (ADMIXTURE will need .bed file)
plink --vcf "$spCode"_neutral_"$nPosNtrl".vcf --out "$spCode"_neutral_"$nPosNtrl" --allow-extra-chr --vcf-half-call missing

# convert to minor allele frequencies for RDA and other analyses
# writes .raw file
plink --bfile "$spCode"_neutral_"$nPosNtrl" --recodeA --out "$spCode"_neutral_"$nPosNtrl" --allow-extra-chr

# convert to STRUCTURE format for assignPop 
# writes .strct_in file
plink --bfile "$spCode"_neutral_"$nPosNtrl" --recode structure --out "$spCode"_neutral_"$nPosNtrl"  --allow-extra-chr

# convert to genepop format for use of GENODIVE (to calculate kinship)
# use PGDSpider GUI
# input: neutral.vcf and population map, output: neutral.gen.txt
# outputted .gen.txt file: add information on first line (otherwise genodive won't recognise format)


#### all SNPs ####

## convert vcf files to necessary formats for downstream analyses

# convert .vcf files to PLINK 1 binary files
# writes .bed .bim and .fam files (ADMIXTURE will need .bed file)
plink --vcf "$inputFile" --out "$spCode"_all_filtered --allow-extra-chr --vcf-half-call missing
eboulanger's avatar
eboulanger committed
87
88

# convert to minor allele frequencies for RDA and other analyses
eboulanger's avatar
eboulanger committed
89
90
91
92
93
94
# writes .raw file
plink --bfile "$spCode"_all_filtered --recodeA --out "$spCode"_all_filtered --allow-extra-chr

# convert to STRUCTURE format for assignPop 
# writes .strct_in file
plink --bfile "$spCode"_all_filtered --recode structure --out "$spCode"_all_filtered --allow-extra-chr