Commit f401718a authored by Romain Feron's avatar Romain Feron
Browse files

Initial commit

parents
import os
from collections import defaultdict
from utils import get_info, parse_header, clean_split
MALES = 'males'
FEMALES = 'females'
HIGH = 'high'
LOW = 'low'
HALF_L = 'half_low'
HALF_H = 'half_high'
def write_output_file(data, file, correspondance):
# Refactorize this shit
file.write('#MALES\n')
for i, locus in enumerate(data[MALES]):
file.write(locus + '\t' + correspondance[locus] + '\n')
file.write('\n\n')
file.write('#FEMALES\n')
for i, locus in enumerate(data[FEMALES]):
file.write(locus)
if i < len(data[FEMALES]) - 1:
file.write(', ')
def analyse(file_path, catalog_path, output_dir, margin_ratios):
infos = get_info(file_path)
file = open(file_path)
o_frequencies_p = os.path.join(output_dir,
infos['file_base'] + '_frequencies.tsv')
o_specific_p = os.path.join(output_dir,
infos['file_base'] + '_sex_specific.txt')
o_polymorphic_p = os.path.join(output_dir,
infos['file_base'] + '_sex_polymorphic.txt')
o_environmental_p = os.path.join(output_dir,
infos['file_base'] + '_environmental.txt')
o_frequencies_f = open(o_frequencies_p, 'w')
o_specific_f = open(o_specific_p, 'w')
o_polymorphic_f = open(o_polymorphic_p, 'w')
o_environmental_f = open(o_environmental_p, 'w')
# Parse header to extract the names, sexes, and column number of samples
header = file.readline()
names, sexes, pos = parse_header(header)
total_males = len(sexes[MALES])
total_females = len(sexes[FEMALES])
# Calculate margins for each sex based on margin ratios defined by user
# Encapsulate this shit
margins = {MALES:
{LOW: round(int(margin_ratios[LOW] * total_males)),
HIGH: round(int(margin_ratios[HIGH] * total_males)),
HALF_L: round(int(margin_ratios[HALF_L] * total_males)),
HALF_H: round(int(margin_ratios[HALF_H] * total_males))},
FEMALES:
{LOW: round(int(margin_ratios[LOW] * total_females)),
HIGH: round(int(margin_ratios[HIGH] * total_females)),
HALF_L: round(int(margin_ratios[HALF_L] * total_females)),
HALF_H: round(int(margin_ratios[HALF_H] * total_females))}
}
sex_specific = {MALES: set(), FEMALES: set()}
sex_polymorphic = {MALES: set(), FEMALES: set()}
alleles_frequencies = {}
environmental = {MALES: set(), FEMALES: set()}
for line in file:
tabs = clean_split(line)
locus_id = tabs[0]
samples = tabs[pos[0]:pos[1]]
sex_count = {MALES: 0, FEMALES: 0}
sex_poly = {MALES: defaultdict(int), FEMALES: defaultdict(int)}
alleles_frequencies[locus_id] = tabs[1]
# Encapsulate this shit
for i, sample in enumerate(samples):
if sample != '-':
if names[i] in sexes[MALES]:
sex_count[MALES] += 1
sex_poly[MALES][sample] += 1
elif names[i] in sexes[FEMALES]:
sex_count[FEMALES] += 1
sex_poly[FEMALES][sample] += 1
# Refactorise this shit
if sex_count[MALES] >= margins[MALES][HIGH]:
if sex_count[FEMALES] <= margins[FEMALES][LOW]:
sex_specific[MALES].add(locus_id)
for site, count in sex_poly[MALES].items():
if (site not in sex_poly[FEMALES].keys() or
sex_poly[FEMALES][site] <= margins[FEMALES][LOW]):
if count >= margins[MALES][HIGH]:
sex_polymorphic[MALES].add(locus_id)
elif (count >= margins[MALES][HALF_L] and
count <= margins[MALES][HALF_H]):
environmental[MALES].add(locus_id)
if sex_count[FEMALES] >= margins[FEMALES][HIGH]:
if sex_count[MALES] <= margins[MALES][LOW]:
sex_specific[FEMALES].add(locus_id)
for site, count in sex_poly[FEMALES].items():
if (site not in sex_poly[MALES].keys() or
sex_poly[MALES][site] <= margins[MALES][LOW]):
if count >= margins[FEMALES][HIGH]:
sex_polymorphic[FEMALES].add(locus_id)
elif (count >= margins[FEMALES][HALF_L] and
count <= margins[FEMALES][HALF_H]):
environmental[FEMALES].add(locus_id)
for locus, frequency in alleles_frequencies.items():
o_frequencies_f.write(locus + '\t' + frequency + '\n')
catalog = open(catalog_path)
catalog.readline()
correspondance = {}
for line in catalog:
tabs = clean_split(line)
locus_id = tabs[2]
if locus_id in sex_specific[MALES]:
correspondance[locus_id] = tabs[8]
write_output_file(sex_specific, o_specific_f, correspondance)
# write_output_file(sex_polymorphic, o_polymorphic_f)
# write_output_file(environmental, o_environmental_f)
import os
import re
from collections import OrderedDict
from utils import list_files, get_info, next_batch, list_species
from analyses import analyse
HIGH = 'high'
LOW = 'low'
HALF_L = 'half_low'
HALF_H = 'half_high'
root_dir = '/home/rferon/work/analyses/multi_species/rad_seq/denovo_map'
haplotype_data_dir = os.path.join(root_dir, 'data')
results_dir = os.path.join(root_dir, 'results')
batch = next_batch(results_dir)
print(' - Batch number: ' + batch)
batch_dir = os.path.join(results_dir, 'batch_' + batch)
os.mkdir(batch_dir)
haplotype_files = list_files(haplotype_data_dir)
species_list = list_species(haplotype_files)
os.mkdir(os.path.join(batch_dir, 'species'))
for species in species_list:
os.mkdir(os.path.join(batch_dir, 'species', species))
os.mkdir(os.path.join(batch_dir, 'frequencies'))
os.mkdir(os.path.join(batch_dir, 'sex_specific'))
os.mkdir(os.path.join(batch_dir, 'sex_polymorphic'))
os.mkdir(os.path.join(batch_dir, 'environmental'))
margins = OrderedDict([(LOW, 0.05),
(HIGH, 0.95),
(HALF_L, 0.4),
(HALF_H, 0.6)])
margins_file = open(os.path.join(batch_dir, 'margins.txt'), 'w')
for margin, value in margins.items():
margins_file.write(margin + '\t' + str(value) + '\n')
print(' - Parsing files: ')
for f in haplotype_files:
infos = get_info(f)
print(' > ' + infos['species'] + '_' + infos['parameter'])
species_output = os.path.join(batch_dir, 'species', infos['species'])
analyse(f, species_output, margins)
for species in species_list:
species_output = os.path.join(batch_dir, 'species', species)
output_files = list_files(species_output)
for o in output_files:
reg = re.search(r'(\D)_(\D+)_(\d+)_(\D+).([tsvx]{3})', o)
if reg and len(reg.groups()) == 5:
o_type = reg.group(4)
os.system('ln -sf ' + o + ' ' + os.path.join(batch_dir, o_type))
from visualize.sex_variable import sex_variable
from analyze.analyses import analyse
spec_p = ('/home/rferon/work/analyses/multi_species/rad_seq/denovo_map/results/' +
'batch_1/species/a_melas/a_melas_10_sex_specific.txt')
poly_p = ('/home/rferon/work/analyses/multi_species/rad_seq/denovo_map/results/' +
'batch_1/sex_polymorphic/a_melas_10_sex_polymorphic.txt')
env_p = ('/home/rferon/work/analyses/multi_species/rad_seq/denovo_map/results/' +
'batch_1/environmental/a_melas_10_environmental.txt')
output_dir = '/home/rferon/work/analyses/multi_species/rad_seq/denovo_map/figures/'
data_dir = '/home/rferon/work/analyses/multi_species/rad_seq/denovo_map/data/tags/'
# frequencies(file_p, output_dir, 50)
sex_variable(spec_p, poly_p, env_p, data_dir, output_dir)
HIGH = 'high'
LOW = 'low'
HALF_L = 'half_low'
HALF_H = 'half_high'
margins = {LOW: 0.05,
HIGH: 0.95,
HALF_L: 0.4,
HALF_H: 0.6}
# analyse('./data/haplotypes/a_melas_denovo_10.tsv', './data/haplotypes/batch_1.catalog.tags.tsv', './results/batch_1/species/a_melas/', margins)
import os
import re
def clean_split(line, sep='\t'):
return line.rstrip('\n').split(sep)
def get_info(file):
base = os.path.splitext(os.path.split(file)[-1])[0]
path = '/'.join(os.path.split(file)[:-1])
tabs = base.split('_')
infos = {'species': tabs[0] + '_' + tabs[1],
'species_short': tabs[0][0] + '_' + tabs[1],
'parameter': tabs[3],
'file_base': '_'.join([tabs[0][0], tabs[1], tabs[3]]),
'path': path}
return infos
def list_files(directory, end=('.tsv', '.txt')):
files = [os.path.join(directory, f) for f in os.listdir(directory)
if os.path.isfile(os.path.join(directory, f)) and
f.endswith(end)]
return files
def parse_header(line):
tabs = clean_split(line)
names = []
sexes = {'males': [], 'females': []}
pos = [-1, -1]
for i, tab in enumerate(tabs):
individuals = re.search(r'(.+)_(.+)_(\D+)(\d+)', tab)
if individuals:
if pos[0] == -1:
pos[0] = i
else:
pos[1] = i
names.append(tab)
if individuals.group(3) == 'M':
sexes['males'].append(tab)
elif individuals.group(3) == 'F':
sexes['females'].append(tab)
return names, sexes, pos
def next_batch(directory):
dirs = [d for d in os.listdir(directory)
if os.path.isdir(os.path.join(directory, d))]
b = -1
for d in dirs:
s = d.split('_')
if 'batch' in s:
c = int(s[-1])
if c > b:
b = c
return(str(b + 1))
def list_species(files):
species = set()
for f in files:
species.add(get_info(f)['species'])
return species
import os
def frequencies(file, output_dir, n_individuals):
scripts_d = ''.join(os.path.split(os.path.realpath(__file__))[:-1])
cmd = ('Rscript ' + os.path.join(scripts_d, 'ggplot.R') + ' ' + file + ' ' +
output_dir + ' ' + str(n_individuals))
os.system(cmd)
#!/usr/bin/env Rscript
args = commandArgs(trailingOnly=TRUE)
# test if there is at least one argument: if not, return an error
if (length(args)==0) {
stop("At least one argument must be supplied (input file).n", call.=FALSE)
} else if (length(args) != 3){
stop("Usage: R ggplot.R input_file.tsv output_dir n_individuals")
}
suppressMessages(library(readr))
suppressMessages(library(ggplot2))
suppressMessages(library(svglite))
suppressMessages(library(scales))
file = args[1]
name = strsplit(file, '/')[[1]][length(strsplit(file, '/')[[1]])]
output_dir = args[2]
n_max = strtoi(args[3])
split = strsplit(name, '_')[[1]]
species_name = paste(split[1], "_", split[2], sep="")
png_name = paste(species_name, "_", split[4], ".png", sep='')
data = suppressMessages(read_delim(file, "\t", escape_double = FALSE, col_names = FALSE, trim_ws = TRUE))
species = rep(species_name, dim(data)[1])
m_value = rep(split[4],dim(data)[1])
colnames(data) = c("Stack_ID", "Frequency")
data = subset(data, data$Frequency <= n_max)
g = ggplot(data, aes(x=Frequency)) +
geom_bar(aes(y = (..count..)/sum(..count..)), colour="black", fill="#CCCCCC") +
geom_vline(xintercept = mean(data$Frequency), colour = "red") +
geom_vline(xintercept = median(data$Frequency), colour = "blue") +
ggtitle(paste("Total tags: ", dim(data)[1], sep='')) + theme(plot.title = element_text(hjust = 0.5)) +
xlab("Number of individuals in which a tag is present") + ylab("Frequency (%)") +
scale_y_continuous(labels=percent)
png(paste(output_dir, gsub(".tsv", "", name), ".png", sep=''), width=1600, height=800, res=100)
print(g)
x = dev.off()
import os
import gzip
from collections import defaultdict
from utils import clean_split, list_files
class File_infos:
def __init__(self, path):
self.path = path
self.base = (os.path.split(path)[-1]).split(os.extsep)[0]
self.tabs = self.base.split('_')
self.par = self.tabs[-1]
self.species = '_'.join(self.tabs[:1])
self.species_short = self.tabs[0][0] + '_' + self.tabs[1]
self.id = self.tabs[2]
self.sex = self.id[0]
self.number = self.id[1:]
def output_files(infos, output_dir, suffix):
male_p = infos['file_base'] + '_male_' + suffix + '.txt'
female_p = infos['file_base'] + '_female_' + suffix + '.txt'
male_f = open(os.path.join(output_dir, male_p), 'w')
female_f = open(os.path.join(output_dir, female_p), 'w')
return male_f, female_f
def load_tags(file):
f = open(file)
male = True
male_tags = defaultdict(lambda: defaultdict(str))
female_tags = defaultdict(lambda: defaultdict(str))
for line in f:
if "#MALES" in line:
male = True
elif "#FEMALES" in line:
male = False
elif line.strip() != '':
tabs = clean_split(line)
catalog_locus = tabs[0]
for i in tabs[1].split(','):
individual = i.split('_')[0]
individual_locus = i.split('_')[1]
if male:
male_tags[individual][individual_locus] = catalog_locus
else:
female_tags[individual][individual_locus] = catalog_locus
return male_tags, female_tags
def find_tag_files(infos, data_dir):
species = {'a_melas': 'ameirus_melas',
'a_gigas': 'arapaima_gigas'}
print(' - Finding individual tags files')
tags_dir = os.path.join(data_dir, species[infos['species_short']])
all_files = list_files(tags_dir, 'gz')
tag_files = set()
for t in all_files:
base = (os.path.split(t)[-1]).split(os.extsep)[0]
par = base.split('_')[-1]
if par == infos['parameter'] and 'batch' not in base:
tag_files.add(t)
return tag_files
def show_loci(loci_dict):
for k, v in loci_dict.items():
print('Locus: ', k)
for individual, data in v.items():
print('ID: ', individual)
print('Consensus: ', data['consensus'])
for allele_id, allele_data in data['alleles'].items():
print('Allele: ', allele_id)
print('Sequence: ', allele_data['sequence'])
print('Coverage: ', str(allele_data['coverage']))
print('\n')
print('\n\n')
def export_loci(loci_dict, file):
for k, v in loci_dict.items():
file.write('Locus: ' + k + '\n')
for individual, data in v.items():
file.write('ID: ' + individual + '\n')
file.write('Consensus: ' + data['consensus'] + '\n')
for allele_id, allele_data in data['alleles'].items():
file.write('Allele: ' + allele_id + '\n')
file.write('Sequence: ' + allele_data['sequence'] + '\n')
file.write('Coverage: ' + str(allele_data['coverage']) + '\n')
file.write('\n')
file.write('\n\n')
def extract_info(tabs, locus, id_n, d):
if id_n not in d[locus].keys():
d[locus][id_n] = {'consensus': '',
'alleles': {}}
seq_id = tabs[6] + tabs[7]
sequence = tabs[9]
if seq_id == 'consensus':
d[locus][id_n]['consensus'] = sequence
elif seq_id != 'model':
if seq_id not in d[locus][id_n]['alleles'].keys():
d[locus][id_n]['alleles'][seq_id] = {'sequence': sequence,
'coverage': 1}
else:
d[locus][id_n]['alleles'][seq_id]['coverage'] += 1
def process_file(file_infos, spec_dict, poly_dict, env_dict, spec_tags, poly_tags, env_tags):
g = gzip.open(file_infos.path, 'rt')
g.readline()
for line in g:
tabs = clean_split(line)
id_n = tabs[1]
locus = tabs[2]
if locus in spec_tags[id_n].keys():
extract_info(tabs, spec_tags[id_n][locus], file_infos.id, spec_dict)
# if locus in poly_tags:
# extract_info(tabs, locus, file_infos.id, poly_dict)
# if locus in poly_tags:
# extract_info(tabs, locus, file_infos.id, env_dict)
def sex_variable(file_spec, file_poly, file_env, data_dir, output_dir):
base = os.path.splitext(os.path.split(file_spec)[-1])[0]
path = '/'.join(os.path.split(file_spec)[:-1])
tabs = base.split('_')
infos = {'species': tabs[0] + '_' + tabs[1],
'species_short': tabs[0][0] + '_' + tabs[1],
'parameter': tabs[2],
'file_base': '_'.join(base.split('_')[:3]),
'path': path}
male_spec_f, female_spec_f = output_files(infos, output_dir, 'spec')
# male_poly_f, female_poly_f = output_files(infos, output_dir, 'poly')
# male_env_f, female_env_f = output_files(infos, output_dir, 'env')
print(' - Loading sex-specific tags')
male_spec_tags, female_spec_tags = load_tags(file_spec)
# print(' - Loading sex-polymorphic tags')
# male_poly_tags, female_poly_tags = load_tags(file_poly)
# print(' - Loading sex-polymorphic tags')
# male_env_tags, female_env_tags = load_tags(file_env)
male_poly_tags = {}
female_poly_tags = {}
male_env_tags = {}
female_env_tags = {}
tag_files = find_tag_files(infos, data_dir)
male_spec_loci = defaultdict(lambda: dict())
female_spec_loci = defaultdict(lambda: dict())
male_poly_loci = defaultdict(lambda: dict())
female_poly_loci = defaultdict(lambda: dict())
male_env_loci = defaultdict(lambda: dict())
female_env_loci = defaultdict(lambda: dict())
print(' - Extracting data from individual tags files')
n_files = str(len(tag_files))
for i, t in enumerate(tag_files):
file_infos = File_infos(t)
if file_infos.sex == 'M':
print(' - File ' + str(i + 1) + '/' + n_files + ' (Male) : ' + file_infos.base)
process_file(file_infos, male_spec_loci, male_poly_loci, male_env_loci,
male_spec_tags, male_poly_tags, male_env_tags)
elif file_infos.sex == 'F':
process_file(file_infos, female_spec_loci, female_poly_loci, female_env_loci,
female_spec_tags, female_poly_tags, female_env_loci)
print(' - File ' + str(i + 1) + '/' + n_files + ' (Female) : ' + file_infos.base)
else:
pass
print(' - Exporting results')
export_loci(male_spec_loci, male_spec_f)
# export_loci(female_spec_loci, female_spec_f)
# export_loci(male_poly_loci, male_poly_f)
# export_loci(female_poly_loci, female_poly_f)
# export_loci(male_env_loci, male_env_f)
# export_loci(female_env_loci, female_env_f)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment