Commit b453f3e1 authored by peguerin's avatar peguerin
Browse files

Initial commit

parents
# exclude everything
02-raw_data/*tsv
02-raw_data/*txt
03-filtered_data/*
04-species_seq/*
05-species_alnt/*
06-species_alnt_cluster/*
07-master_matrices/*
08-genetic_diversity/*
10-figures/*pdf
brouillon/
# exception to the rule
!somefolder/.gitkeep
!02-raw_data/.gitkeep
!03-filtered_data/.gitkeep
!04-species_seq/.gitkeep
!05-species_alnt/.gitkeep
!06-species_alnt_cluster/.gitkeep
!07-master_matrices/.gitkeep
!08-genetic_diversity/.gitkeep
!10-figures/.gitkeep
#convert raw data into tsv format
sed 's/ /_/g' ./02-raw_data/seqbold_data.txt > ./02-raw_data/seqbold_data.tsv
#remove samples with no lat/lon information
awk '{ if( $34 != "_" && $35 != "_") print $0 }' ./02-raw_data/seqbold_data.tsv > ./03-filtered_data/ll_seqbold_data.tsv
#remove samples with no species name
awk '{ if($21 != "_") print $0 }' ./03-filtered_data/ll_seqbold_data.tsv > ./03-filtered_data/sll_seqbold_data.tsv
#remove samples with no sequences or sequences with IUAPC ambiguity i.e. polybase characters (e.g. "N")
awk '{ test= match( $45,"[RWSNYMKHBVD_]"); if( test ==0) print $0}' ./03-filtered_data/sll_seqbold_data.tsv > ./03-filtered_data/ssll_seqbold_data.tsv
#get information lat/lon with geonames for data with missing lat/lon information but region
./00-scripts/step1/get_geonames_coordinates.sh
#merge geonames-georeferenced sequences with already georeferenced sequences
cat ./03-filtered_data/new_geo_actinopterygii_sl ./03-filtered_data/ssll_seqbold_data.tsv > ./03-filtered_data/total_ssll_seqbold.tsv
#get locus name for each sample
awk '{if(NR>1)print}' ./03-filtered_data/total_ssll_seqbold.tsv | while read ligne;
do
process_id=`echo $ligne | awk '{print $1}'`
requete="http://www.boldsystems.org/index.php/Public_RecordView?processid="$process_id
#echo $process_id
locus_id=`lynx -dump $requete | grep Locus:`
echo $process_id" "$locus_id
done > ./03-filtered_data/locushead
#get sample's ID with a locus which is not "Cytochrome Oxidase Subunit 1 5' Region"
cat ./03-filtered_data/locushead | while read ligne;
do
locus_name=`echo $ligne | cut -f 3- -d " "`
locus_ID=`echo $ligne | cut -f 1 -d " "`
#echo $locus_name
if [ "$locus_name" != "Cytochrome Oxidase Subunit 1 5' Region" ]
then
echo $locus_ID
fi
done > ./03-filtered_data/wronglocusID
#remove samples with locus sequence that is not "Cytochrome Oxidase Subunit 1 5' Region"
grep -avf ./03-filtered_data/wronglocusID ./03-filtered_data/total_ssll_seqbold.tsv > ./03-filtered_data/co1_ssll_seqbold_data.tsv
#!/bin/sh
#get samples with no lat/lon information but region name
awk '{if ( ($34=="_" || $35=="_") && ($40!="_" )) print $0}' ./03-filtered_data/seqbold_data.tsv > ./03-filtered_data/no_geo_region
#sed -e 's/[\,.:;$~)(=*%?!-]//g'
cat ./03-filtered_data/no_geo_region | while read ligne;
do
deb_ligne=`echo $ligne | cut -f 1-33 -d " "`
fin_ligne=`echo $ligne | cut -f 36- -d " "`
recherche=`echo $ligne | awk '{print $38_$39_$40}' | sed 's/_/+/g' | sed -e 's/[\,.:;$~)(=*%?!-]/+/g' | sed -e 's/\([^[:blank:]]\)\([[:upper:]]\)/\1+\2/g' | sed -e 's/+$//g'`
requete="http://www.geonames.org/search.html?q="$recherche
#echo $requete
result=`lynx -dump $requete | grep "Lat/Lng" | head -1 | awk '{ print $3"\t"$5}' | sed 's/\[10\]//g'`
#echo $result
if [[ -z $result ]]
then
#echo "zut"
lat=`lynx -dump $requetelynx -dump $requete | egrep -o "[SN].[0-9]+°.[0-9]+'.[0-9]+''" | head -1 `
#echo $lat
lng=`lynx -dump $requetelynx -dump $requete | egrep -o "[WE].[0-9]+°.[0-9]+'.[0-9]+''" | head -1 `
#echo $lng
if [[ ( -z $lat ) || ( -z $lng ) ]]
then
:
else
result=`python2 00-scripts/step1/lat_long_DMS_DD_converter.py -dms "$lat $lng"`
echo -e "$deb_ligne $result $fin_ligne"
fi
else
result=`echo $result | sed -e 's/\t/ /g' `
echo -e "$deb_ligne $result $fin_ligne"
fi
done > ./03-filtered_data/new_geo_actinopterygii
#remove entries with no species name
awk '{ if($21 != "_") print $0 }' ./03-filtered_data/new_geo_actinopterygii > ./03-filtered_data/new_geo_actinopterygii_s
#remove samples with no sequences or remove sequences with IUAPC ambiguity i.e. polybase characters (e.g. "N")
awk '{ test= match( $45,"[RWSNYMKHBVD_]"); if( test ==0) print $0}' ./03-filtered_data/new_geo_actinopterygii_s > ./03-filtered_data/new_geo_actinopterygii_sl
# -*- coding: utf-8 -*-
# ================================================================================
# NOTICE
# ================================================================================
# convertir des coordonnées latitude longitude en DMS ou en DD
# ================================================================================
# MODULES
# ================================================================================
import re
import argparse
# ================================================================================
# FUNCTIONS
# ================================================================================
def dms2dd(degrees, minutes, seconds, direction):
dd = round(float(degrees) + float(minutes)/60 + float(seconds)/(60*60),3);
if direction == 'S' or direction == 'W':
dd *= -1
return dd;
def dd2dms(deg):
d = int(deg)
md = abs(deg - d) * 60
m = int(md)
sd = (md - m) * 60
return [d, m, sd]
def parse_dms(dms):
#format : N 45° 24' 5'' W 73° 48' 50''
#dms = dms.replace("''","' ")
parts = re.split('[^\d\w]+', dms)
#print parts
lat = dms2dd(parts[1], parts[2], parts[3], parts[0])
lng = dms2dd(parts[5], parts[6], parts[7], parts[4])
return (lat, lng)
'''
dd = parse_dms("36°57'9' N 110°4'21' W")
print(dd)
print(dd2dms(dd[0]))
'''
#================================================================================
#ARGUMENTS
#================================================================================
parser = argparse.ArgumentParser(description='remove AAA* suffix from fasta file')
parser.add_argument("-dms","--dms_coo",type=str)
parser.add_argument("-dd","--dd_coo", type=str)
args = parser.parse_args()
dms_coo = args.dms_coo
dd_coo = args.dd_coo
#================================================================================
#MAIN
#================================================================================
#dans le cas ou on donne les coos en DMS
dd_converted_coo = parse_dms(dms_coo)
print "{0} {1}".format(dd_converted_coo[0],dd_converted_coo[1])
#print(dms2dd(dd_converted_coo[0]))
#cluster .coords and .fasta files regarding the freshwater or marine habitat of the species
for ls_species in `ls ./05-species_alnt/*fasta`;
do
species_name=`basename $ls_species .fasta`
troncated_species=`basename $ls_species .fasta | sed 's/_/ /g' | awk '{ print $1}'`;
if grep "$troncated_species" ./01-infos/marine_actinopterygii_species.txt;
#marine
then
absolutepathspeciesfasta=`readlink -f ./05-species_alnt/$species_name".fasta"`;
ln -s $absolutepathspeciesfasta ./06-species_alnt_cluster/marine/$species_name".fasta";
absolutepathspeciescoords=`readlink -f ./05-species_alnt/$species_name".coords"`;
ln -s $absolutepathspeciescoords ./06-species_alnt_cluster/marine/$species_name".coords";
#freshwater
else
absolutepathspeciesfasta=`readlink -f ./05-species_alnt/$species_name".fasta"`;
ln -s $absolutepathspeciesfasta ./06-species_alnt_cluster/freshwater/$species_name".fasta";
absolutepathspeciescoords=`readlink -f ./05-species_alnt/$species_name".coords"`;
ln -s $absolutepathspeciescoords ./06-species_alnt_cluster/freshwater/$species_name".coords";
fi
ln -s $absolutepathspeciesfasta ./06-species_alnt_cluster/total/$species_name".fasta";
ln -s $absolutepathspeciescoords ./06-species_alnt_cluster/total/$species_name".coords";
done
##
## Get grid equal area ID in which a point (x,y) is located
##
## Author : Pierre-Edouard Guerin
## Montpellier, 2017
##
library(rgeos)
library(rgdal)
#list of the cluster of species ".coords" files on which we want to get "equal area ID" of samples geo-position
IN_FILES = c("./06-species_alnt_cluster/total","./06-species_alnt_cluster/freshwater","./06-species_alnt_cluster/marine")
species_to_remove=c()
# convert ".coords files" of a species into "grid-equalarea ID" files
getEqualAreaSpecies <- function(nom_f){
test = read.table(nom_f,header=F)
colnames(test)=c("lat","lon")
test2 = cbind(test$lon,test$lat)
testS = SpatialPoints(test2)
proj4string(testS) <- CRS('+init=epsg:4326')
SS = spTransform(testS, proj4string(grid))
coorS = SS@coords
is_into=apply(coorS,1,function(x) grid@data$IDcell[which(grid@data$right > x[1] & grid@data$left < x[1] & grid@data$bottom < x[2] & grid@data$top > x[2])])
if(length(is_into)!=0) {
#nouvelle stat coords
final_coords= na.omit(cbind(test$lon,test$lat,as.integer(is_into)))
new_nom_f=gsub("coords","equalareacoords",nom_f)
write.table(final_coords,new_nom_f,col.names=F,row.names=F,sep="\t")
}
else{
print(nom_f)
file.remove(gsub("coords","fasta",nom_f))
file.remove(nom_f)
}
}
# apply getEqualAreaSpecies to each species in a cluster of species
equalAreaCluster <- function(liste_in_files){
coords_files = Sys.glob(paste(liste_in_files,"/*.coords",sep=""))
lapply(coords_files, function(x) getEqualAreaSpecies(x))
}
### load shapefile grid
grid <- readOGR("./01-infos/grid_equalarea200km","gridFish.b_260418")
grid$Id <- 1:nrow(grid@data)
### apply getEqualAreaSpecies to each cluster of species
lapply(IN_FILES, function (x) equalAreaCluster(x))
#================================================================================
#NOTICE
#================================================================================
'''
create 2 files from the database file co1_ssll.tsv
for each specie i :
specie_i.coords
specie_i.fasta
specie_i.coords description :
(indv 1) lat lon
(indv 2) lat lon
...
specie_i.fasta description :
(indv 1) aligned_seq1
( indv 2) aligned_seq2
...
aligned_seq description :
alignment produced by muscle3
FORMAT: 0 1 2 3 4 where 0=gap 1=A 2=C 3=G 4=T
'''
#================================================================================
#MODULES
#================================================================================
import re
import sys
import subprocess
import argparse
import os.path
#================================================================================
#CLASS
#================================================================================
class Individu:
def __init__(self,lat,lon,seq):
self.lat = lat
self.lon = lon
self.seq = seq
class Species:
def __init__(self,name):
self.name = name
self.listOfIndv = []
self.counter = 0
def indv_append(self,lat,lon,seq):
self.listOfIndv.append(Individu(lat,lon,seq))
#================================================================================
#FUNCTIONS
#================================================================================
#================================================================================
#ARGUMENTS
#================================================================================
parser = argparse.ArgumentParser(description='specie seq geo')
parser.add_argument("-o","--output", type=str)
parser.add_argument("-f","--inputFile",type=str)
#================================================================================
#MAIN
#================================================================================
args = parser.parse_args()
resultsPath = args.output
inputFile = args.inputFile
dicOfSpecies={}
#read input file and create species and indv objects
with open(inputFile,'r') as readFile:
for ligne in readFile.readlines()[1:]:
ligneSplit=ligne.split()
speciesName=ligneSplit[20]
indvLat=ligneSplit[33]
indvLon=ligneSplit[34]
indvSeq=ligneSplit[44]
if speciesName not in dicOfSpecies:
dicOfSpecies[speciesName] = Species(speciesName)
dicOfSpecies[speciesName].indv_append(indvLat,indvLon,indvSeq)
dicOfSpecies[speciesName].counter+=1
readFile.close()
#write species.coords and species.fasta files into resultsPath
for key, val in dicOfSpecies.items():
if val.counter > 1:
with open("{0}/{1}.coords".format(resultsPath, key),'w') as coordsFile:
coordsText=""
for indv in val.listOfIndv:
coordsText+="{0}\t{1}\n".format(indv.lat,indv.lon)
coordsFile.write(coordsText)
coordsFile.close()
with open("{0}/{1}.fasta".format(resultsPath, key),'w') as fastaFile:
fastaText=""
id_indv=1
for indv in val.listOfIndv:
fastaText+=">{0}\n".format(id_indv)
fastaText+="{0}\n".format(indv.seq)
id_indv+=1
fastaFile.write(fastaText)
fastaFile.close()
else:
print val.name, val.listOfIndv, "only one indiv"
#fin du script
#directory for .coords and .fasta files
#04-species_seq
#write .coords and .fasta files from CO1 georeferenced sequences file into 04-species_seq
python2 ./00-scripts/step2/fasta_coords_files_species_generator.py -o ./04-species_seq/ -f ./03-filtered_data/co1_ssll_seqbold_data.tsv
#directory for muscle3 intra-species sequences alignment results
#05-species_alnt
#align raw fasta sequences of the same species with MUSCLE3 then convert it into ODD-format fasta
for species_path in `ls ./04-species_seq/*fasta`;
do
species_name=`basename $species_path .fasta | tr -d "."`;
echo $species_name;
muscle3 -in $species_path -out ./05-species_alnt/$species_name.tmp;
sed -e 's/A/1/g' -e 's/C/2/'g -e 's/G/3/g' -e 's/T/4/g' -e 's/-/0/g' -e 's/>[0-9]\+/J/g' ./05-species_alnt/$species_name.tmp | tr -d '\n' | tr 'J' '\n' | tail -n +2 | sed -e 's/./& /g' -e 's/[ \t]$//g' > ./05-species_alnt/$species_name.fasta;
printf "\n" >> ./05-species_alnt/$species_name.fasta;
done
rm ./05-species_alnt/*.tmp
#symbolic links from coords files to 05-species_alnt
for filecoords in `ls ./04-species_seq/*.coords`;
do
linknamefilecoords=`basename $filecoords .coords | tr -d "."`;
absolutepathfilecoords=`readlink -f $filecoords`;
ln -s $absolutepathfilecoords ./05-species_alnt/$linknamefilecoords".coords";
done
########################################################################
# Codes for the paper:
# An Anthropocene Map of Genetic Diversity
#
# Andreia Miraldo, Sen Li, Michael K. Borregaard, Alexander Floréz-Rodriguéz, Shyam Gopalakrishnan, Mirneza Risvanovic, Zhiheng Wang, Carsten Rahbek, Katharine A. Marske & David Nogués-Bravo
#
# Submitted to Science, 2016
# Code in this file by Sen Li and Michael K. Borregaard
#########################################################################
# load the necessary library
using DataFrames
"""
A function to compute the GD (Genetic Diversity) value from a set of sequences. The algorithm compares all pairwise combinations of sequences and calculates the proportion of loci that differ between the pair. Returns a DataFrame with the identity of sequences, the lengths of each sequence, the overlap, and the computed pairwise divergence values.
**Parameters**
* 'species_seqs': A matrix where the rows are aligned genetic sequences, and columns are loci. Basepairs must be coded as 1, 2, 3 or 4, or with a 0 signifying that the locus is absent from the alignment.
"""
function compare_pairwise(species_seqs::Matrix{Int})
num_seqs = size(species_seqs,1) # count the number of sequences
if(num_seqs < 2) # return an empty state if there is only one sequence
return(DataFrame(seq1 = -1, seq2 = -1, length_seq1 = -1, length_seq2 = -1, overlap = -1., commons = -1, num_per_bp = -1.))
end
#Preinitialize containers
seq1 = Int[]
seq2 = Int[]
length_seq1 = Int[]
length_seq2 = Int[]
overlap = Float64[]
commons = Int[]
num_per_bp = Float64[]
for iter = 1:(num_seqs-1)
for iter_sub = (iter + 1):num_seqs # For each pairwise combination of sequences
pair_seqs = species_seqs[[iter, iter_sub], :]
common_sites = minimum(pair_seqs, 1) .> 0 #Ensure that only sites existing in both sequences are compared (i.e. ignore loci missing from the alignment)
if sum(common_sites) == 0.
push!(num_per_bp, 0.)
else
num_segregating = sum(diff(pair_seqs[:, vec(common_sites)], 1) .!= 0)
push!(num_per_bp, num_segregating/sum(common_sites))
end
push!(seq1, iter)
push!(seq2, iter_sub)
ls1 = sum(species_seqs[iter, :] .!= 0)
ls2 = sum(species_seqs[iter_sub, :] .!= 0)
push!(length_seq1, ls1)
push!(length_seq2, ls2)
push!(overlap, sum(common_sites)/max(ls1, ls2)) #Compute the overlap between the sequences
push!(commons, sum(common_sites))
end
end
DataFrame(seq1 = seq1, seq2 = seq2, length_seq1 = length_seq1, length_seq2 = length_seq2, overlap = overlap, commons = commons, num_per_bp = num_per_bp)
end
########################################################################
# Codes for the paper:
# Map Marine Genetic Diversity
#
# Pierre-Edouard Guerin
# Montpellier, 2017
#
#########################################################################
# Load the necessary library and functions
include("Lib_Compare_Pairwise.jl")
using DataFrames
"""
A function to create master data matrices that are used to compute genetic diversity, assess data quality and do sensitivity analyses.
**Parameters**
* 'foldername' : The name of of a folder containing the data files. There must be files of two types (file extension 'fasta' and file extension 'coords') with the same filename, e.g. the species names (e.g. folder contents could be 'Bufo_bufo.fasta, Bufo_bufo.coords, Rana_arvalis.fasta, Rana_arvalis.coords', etc.). The .fasta files contain the sequences as an m x n integer matrix, where m is the number of sequences and n is the length of the alignments. The .coords files contain the geographic coordinates of the sequences, as an m x 2 floating point matrix with latitude in the first column and longitude in the second.
"""
function create_master_matrices(foldername::ASCIIString)
cluster_name = basename(foldername)
species_list = [x[1:(end-6)] for x in filter(st->contains(st, ".fasta"), readdir(foldername))] #identify unique file names ignoring the extension
num_files = size(species_list,1)
equalarea = latbands = gridcells = DataFrame(species = String[], cell = String[], seq1 = Int[], seq2 = Int[], length_seq1 = Int[], length_seq2 = Int[], overlap = Float64[], commons = Int[], num_per_bp = Float64[]) # Pre-initialize the DataFrame to ensure correct element types
for iter_file = 1:num_files
# read in the sequence data for one species
file_name_seq = joinpath(foldername, species_list[iter_file] * ".fasta")
species_seqs = readdlm(file_name_seq, Int)
length_seq = size(species_seqs,2)
# read the coordinates for one species
file_name_coords = joinpath(foldername, species_list[iter_file] * ".coords")
species_coords = readdlm(file_name_coords)
#LATITUDINAL BANDS
latbandi = 10 * floor(Int,species_coords[:,1]/10)
latband = [String("$a") for a in latbandi]
latbands = vcat(latbands, calcspecies(species_list[iter_file], species_seqs, latband))
#EQUAL AREA
file_name_equal = joinpath(foldername, species_list[iter_file] * ".equalareacoords")
equ = floor(Int, readdlm(file_name_equal)[:,3])
equ = [String("$a") for a in equ]
equalarea = vcat(equalarea, calcspecies(String(species_list[iter_file]), species_seqs, equ))
end
writetable("./07-master_matrices/$(cluster_name)_pairwise_equalarea.csv", equalarea)
writetable("./07-master_matrices/$(cluster_name)_pairwise_latbands.csv", latbands)
end
"""
A function to calculate the summary statistic for all sites (e.g. grid cell or biome) for one species.
**Parameters**
* 'species': A string with the name of the species
* 'species_seqs': A matrix where the rows are aligned genetic sequences, and columns are loci. Basepairs must be coded as 1, 2, 3 or 4, or with a 0 signifying that the locus is absent from the alignment.
* 'sitenames': A vector of strings with the names of the sites
"""
function calcspecies(species::String, species_seqs::Matrix{Int}, sitenames::Vector{String})
res = DataFrame(species = String[], cell = String[], seq1 = Int[], seq2 = Int[], length_seq1 = Int[], length_seq2 = Int[], overlap = Float64[], commons = Int[], num_per_bp = Float64[])
uniq_grid = unique(sitenames, 1)
for iter_grid = 1:size(uniq_grid,1) # Go through each site in turn
seqs_grid = species_seqs[findin(sitenames, uniq_grid[iter_grid, :]), :]
tot_muts = compare_pairwise(seqs_grid)
lns = size(tot_muts, 1)
tmp = DataFrame(species = fill(species, lns), cell = fill(uniq_grid[iter_grid, :][1], lns)) #Expand species and cell names to the length of the resulting DataFrame
tot_muts = hcat(tmp, tot_muts)
res = vcat(res, tot_muts)
end
for sym in [:seq1, :seq2, :length_seq1, :length_seq2, :overlap, :commons, :num_per_bp]
res[sym][res[sym] .< 0.] = NA #rep empty values with an NA identifier
end
res
end
include("Lib_Create_Master_Matrices.jl")
for cluster in ["total", "freshwater", "marine"]
create_master_matrices("./06-species_alnt_cluster/$(cluster)")
end
########################################################################
# Codes for the paper:
# An Anthropocene Map of Genetic Diversity
#
# Andreia Miraldo, Sen Li, Michael K. Borregaard, Alexander Floréz-Rodriguéz, Shyam Gopalakrishnan, Mirneza Risvanovic, Zhiheng Wang, Carsten Rahbek, Katharine A. Marske & David Nogués-Bravo
#
# Submitted to Science, 2016
# Code in this file by Michael K. Borregaard
# modified by P.E. GUERIN
# Montpellier 2017
#########################################################################
#Load the necessary library
using DataFrames, DataFramesMeta, StatsBase
"""
chaque ligne représente une cellule (chaque cellule est représentée par une seule ligne) et contient les coordonnées de le cellule, la moyenne (peut être aussi calculer la médiane) de la diversité génétique, et aussi à ajouter pour chaque ligne 1) le nombre d'espèces dans la cellule 2) le nombre moyen (et median) d'individu par espèces et (3) avec la standard deviation.
"""
function calcGDmeanmednumberind(dats)
#nb_indv
nb_ind = by(unique(dats[[:cell, :species, :seq1]]),[:cell, :species]) do df
tt = unique(df[:seq1])
size(tt,1)+1
end
ind_mean = by(nb_ind, :cell, x -> mean(x[:x1]))
ind_med = by(nb_ind, :cell, x -> median(x[:x1]))
ind_std = by(nb_ind, :cell, x -> std(x[:x1]))
#nb_species
richness = by(dats, :cell, totalspecies)
#GD
meanpaspec = by(dats, [:cell, :species], x -> mean(x[:num_per_bp]))
GD_mean = by(meanpaspec, :cell, x -> mean(x[:x1]))
GD_med = by(meanpaspec, :cell, x -> median(x[:x1]))
#total
total = hcat(GD_mean,GD_med[:x1], richness[:x1], ind_mean[:x1], ind_med[:x1],ind_std[:x1])
names!(total, [:site, :GD_mean, :GD_median,:nb_species, :nb_indv_mean, :nb_indv_median, :nb_indv_sd])
total
end
function calcGDspeciespersite(dats)
nb_ind = by(unique(dats[[:cell, :species, :seq1]]),[:cell, :species]) do df
tt = unique(df[:seq1])
size(tt,1)+1
end