Commit e5035dc0 authored by Romain Feron's avatar Romain Feron
Browse files

Moved file detection to parameters object. Streamlined function parameters,...

Moved file detection to parameters object. Streamlined function parameters, most things are stored in global parameters object

Tested quickly on frequencies, haplotypes, and heatmap
parent 51973f9b
......@@ -3,7 +3,7 @@ from radseq_analysis.shared.commons import *
from radseq_analysis.file_handler.file_open import open_all
def get_haplotypes(haplotypes_file_path, global_parameters, haplotypes=True, numbers=True):
def get_haplotypes(parameters, haplotypes=True, numbers=True):
'''
Extract haplotypes information, sorted by sex, from a haplotypes file
......@@ -16,7 +16,7 @@ def get_haplotypes(haplotypes_file_path, global_parameters, haplotypes=True, num
{ Locus ID: { Haplotype: { Males: N, Females: M } } }
'''
haplotypes_file = open_all(haplotypes_file_path)
haplotypes_file = open_all(parameters.haplotypes_file_path)
line = haplotypes_file.readline()
names = line[:-1].split('\t')[2:]
......@@ -36,10 +36,10 @@ def get_haplotypes(haplotypes_file_path, global_parameters, haplotypes=True, num
if numbers:
tags = defaultdict(lambda: {MALES: 0, FEMALES: 0})
for individual, haplotype in temp.items():
if individual in global_parameters.popmap.keys():
if global_parameters.popmap[individual] is 'M':
if individual in parameters.popmap.keys():
if parameters.popmap[individual] is 'M':
tags[haplotype][MALES] += 1
elif global_parameters.popmap[individual] is 'F':
elif parameters.popmap[individual] is 'F':
tags[haplotype][FEMALES] += 1
haplotypes_numbers[locus_id] = tags
......
from radseq_analysis.file_handler.file_open import open_all
def get_individuals_order(log_path, global_parameters):
def get_individuals_order(log_path):
'''
Parse the denovo_map.log to find the order in which individuals were processed,
......@@ -34,4 +34,4 @@ def get_individuals_order(log_path, global_parameters):
individuals_order[str(order)] = line.split(':')[0]
order += 1
global_parameters.order = individuals_order
return individuals_order
......@@ -10,6 +10,4 @@ def get_markers(markers_file_path):
markers_file = open_all(markers_file_path)
markers = [line[:-1] for line in markers_file if line[:-1]]
print(markers)
return markers
......@@ -2,14 +2,11 @@ from radseq_analysis.file_handler.file_open import open_all
# TODO: verifications
def load_popmap(popmap_file_path, global_parameters):
def load_popmap(popmap_file_path):
popmap_file = open_all(popmap_file_path)
popmap = {line.split('\t')[0]: line.split('\t')[1][:-1] for
line in popmap_file if line[:-1]}
global_parameters.popmap = popmap
global_parameters.n_males = list(popmap.values()).count('M')
global_parameters.n_females = list(popmap.values()).count('F')
return [popmap, list(popmap.values()).count('M'), list(popmap.values()).count('F')]
from radseq_analysis.file_handler.file_open import open_all
def load_positions_list(positions_file_path, global_parameters):
def load_positions_list(positions_file_path):
positions = []
......@@ -15,4 +15,4 @@ def load_positions_list(positions_file_path, global_parameters):
print('Error: cannot read positions file')
exit(1)
global_parameters.positions_list = positions
return positions
import os
from radseq_analysis.shared import Parameters
from radseq_analysis.modules import sex_linked_haplotypes
from radseq_analysis.modules import loci_matrix
......@@ -6,12 +5,9 @@ from radseq_analysis.modules import stacks_privacy
from radseq_analysis.modules import rescue
from radseq_analysis.modules import individual_coverage
from radseq_analysis.modules import visualization
from radseq_analysis.file_handler import load_popmap
from radseq_analysis.file_handler import load_positions_list
def analysis(input_dir=None,
input_file_path=None,
popmap_file_path=None,
output_file_path=None,
positions_file_path=None,
......@@ -21,38 +17,24 @@ def analysis(input_dir=None,
analysis=None):
parameters = Parameters(files_dir=input_dir,
output_file_path=output_file_path)
popmap_file_path=popmap_file_path,
output_file_path=output_file_path,
positions_file_path=positions_file_path,
sequences_file_path=sequences_file_path,
coverage_file_path=coverage_file_path,
markers_file_path=markers_file_path)
species = None
if popmap_file_path:
load_popmap(popmap_file_path, parameters)
species = os.path.split(popmap_file_path)[1]
species = os.path.splitext(species)[0]
if len(species.split('_')) > 1:
species = '_'.join(s for s in species.split('_')[:-1])
parameters.species = species
if positions_file_path:
load_positions_list(positions_file_path, parameters)
# TODO: support more batch numbers + move this inside analyses?
if input_dir:
haplotypes_file_path = os.path.join(input_dir, 'batch_0.haplotypes.tsv')
catalog_file_path = os.path.join(input_dir, 'batch_0.catalog.tags.tsv.gz')
individual_files_paths = [os.path.join(input_dir, f) for
f in os.listdir(input_dir) if
'tags' in f and 'catalog' not in f]
parameters.get_files(input_dir)
if analysis == 'heatmap':
loci_matrix(haplotypes_file_path, parameters)
loci_matrix(parameters)
elif analysis == 'haplotypes':
sex_linked_haplotypes(haplotypes_file_path, catalog_file_path, parameters)
sex_linked_haplotypes(parameters)
elif analysis == 'frequencies':
stacks_privacy(catalog_file_path, parameters)
stacks_privacy(parameters)
elif analysis == 'rescue':
rescue(sequences_file_path, catalog_file_path, individual_files_paths, coverage_file_path, parameters)
rescue(parameters)
elif analysis == 'coverage':
individual_coverage(markers_file_path, catalog_file_path, individual_files_paths, coverage_file_path, parameters)
individual_coverage(parameters)
elif analysis == 'visualize':
visualization(input_file_path, popmap_file_path, output_file_path, parameters)
visualization(parameters)
......@@ -68,22 +68,21 @@ def fill_individual_data(markers, individual_data, coverage):
markers[stack_id].haplotypes[haplotype_id].individuals = temp
def analysis(markers_file_path, catalog_file_path,
individual_files_paths, coverage_file_path, global_parameters):
def analysis(parameters):
print(' - Loading extracted markers and catalog data ...')
coverage = None
if coverage_file_path:
coverage = file_handler.get_coverage(coverage_file_path)
markers_list = file_handler.get_markers(markers_file_path)
correspondance = file_handler.get_info_from_catalog(catalog_file_path,
if parameters.coverage_file_path:
coverage = file_handler.get_coverage(parameters.coverage_file_path)
markers_list = file_handler.get_markers(parameters.markers_file_path)
correspondance = file_handler.get_info_from_catalog(parameters.catalog_file_path,
loci_list=markers_list,
consensus=False,
correspondance=True)
individual_names = get_individual_names(individual_files_paths)
individual_names = get_individual_names(parameters.individual_files_paths)
print(' - Creating stacks ...')
markers = initialize_markers(markers_list)
individual_data = get_individual_data(individual_files_paths, correspondance)
individual_data = get_individual_data(parameters.individual_files_paths, correspondance)
print(' - Merging individual data in stacks ...')
fill_individual_data(markers, individual_data, coverage)
output.markers(global_parameters.output_file_path, markers, individual_names)
output.markers(parameters.output_file_path, markers, individual_names)
......@@ -3,13 +3,13 @@ from radseq_analysis import file_handler
from radseq_analysis import output
def fill_loci_matrix(haplotypes_file_path, global_parameters):
def fill_loci_matrix(parameters):
print(' - Loading haplotypes from file ...')
numbers = file_handler.get_haplotypes(haplotypes_file_path, global_parameters, haplotypes=False, numbers=True)
numbers = file_handler.get_haplotypes(parameters, haplotypes=False, numbers=True)
loci_matrix = [[0 for x in range(int(global_parameters.n_males) + 1)] for
y in range(int(global_parameters.n_females) + 1)]
loci_matrix = [[0 for x in range(int(parameters.n_males) + 1)] for
y in range(int(parameters.n_females) + 1)]
print(' - Generating loci matrix ...')
......@@ -23,7 +23,7 @@ def fill_loci_matrix(haplotypes_file_path, global_parameters):
return loci_matrix
def analysis(haplotypes_file_path, global_parameters):
def analysis(parameters):
loci_matrix = fill_loci_matrix(haplotypes_file_path, global_parameters)
output.loci_matrix(global_parameters.output_file_path, loci_matrix)
loci_matrix = fill_loci_matrix(parameters)
output.loci_matrix(parameters.output_file_path, loci_matrix)
......@@ -66,22 +66,21 @@ def fill_individual_data(stacks, individual_data, coverage):
stacks[stack_id].haplotypes[haplotype_id].individuals = temp
def analysis(sequences_file_path, catalog_file_path,
individual_files_paths, coverage_file_path, global_parameters):
def analysis(parameters):
print(' - Loading extracted sequences and catalog data ...')
coverage = None
if coverage_file_path:
coverage = file_handler.get_coverage(coverage_file_path)
sequences = file_handler.get_sequences(sequences_file_path)
consensus, correspondance = file_handler.get_info_from_catalog(catalog_file_path,
if parameters.coverage_file_path:
coverage = file_handler.get_coverage(parameters.coverage_file_path)
sequences = file_handler.get_sequences(parameters.sequences_file_path)
consensus, correspondance = file_handler.get_info_from_catalog(parameters.catalog_file_path,
consensus=True,
correspondance=True)
print(' - Finding similar sequences with blast ...')
blast_results = blast.get_matching_sequences(sequences, consensus, global_parameters.species)
blast_results = blast.get_matching_sequences(sequences, consensus, parameters.species)
print(' - Creating stacks ...')
stacks = make_stacks(blast_results, consensus)
individual_data = get_individual_data(individual_files_paths, correspondance)
individual_data = get_individual_data(parameters.individual_files_paths, correspondance)
print(' - Merging individual data in stacks ...')
fill_individual_data(stacks, individual_data, coverage)
output.stacks(global_parameters.output_file_path, stacks, global_parameters.popmap)
output.stacks(parameters.output_file_path, stacks, parameters.popmap)
......@@ -34,38 +34,38 @@ def locus_in_position_list(tags, positions_list):
return False
def filter(haplotypes_file_path, global_parameters):
def filter(parameters):
print(' - Loading haplotypes file ...')
haplotypes, numbers = file_handler.get_haplotypes(haplotypes_file_path, global_parameters)
haplotypes, numbers = file_handler.get_haplotypes(parameters)
loci_to_extract = {}
print(' - Filtering loci ...')
for locus_id, data in numbers.items():
if locus_in_position_list(data, global_parameters.positions_list):
if locus_in_position_list(data, parameters.positions_list):
locus = Locus()
locus.n_males = data['consensus'][MALES]
locus.n_females = data['consensus'][FEMALES]
male_majority, female_majority = get_majority_haplotypes(data)
locus.outliers[MALES] = set(i for i, v in haplotypes[locus_id].items() if
v != male_majority and global_parameters.popmap[i] is 'M')
v != male_majority and parameters.popmap[i] is 'M')
locus.outliers[FEMALES] = set(i for i, v in haplotypes[locus_id].items() if
v != female_majority and global_parameters.popmap[i] is 'F')
v != female_majority and parameters.popmap[i] is 'F')
loci_to_extract[locus_id] = locus
return loci_to_extract
def analysis(haplotypes_file_path, catalog_file_path, global_parameters):
def analysis(parameters):
loci_to_extract = filter(haplotypes_file_path, global_parameters)
consensus = file_handler.get_info_from_catalog(catalog_file_path, loci_to_extract)
loci_to_extract = filter(parameters)
consensus = file_handler.get_info_from_catalog(parameters.catalog_file_path, loci_to_extract)
for locus_id, sequence in consensus.items():
loci_to_extract[locus_id].sequence = sequence
output.sex_linked_haplotypes(global_parameters.output_file_path,
output.sex_linked_haplotypes(parameters.output_file_path,
loci_to_extract)
return loci_to_extract
......@@ -2,10 +2,10 @@ from radseq_analysis import file_handler
from radseq_analysis import output
def analysis(catalog_file_path, global_parameters):
def analysis(parameters):
frequencies = file_handler.get_info_from_catalog(catalog_file_path,
frequencies = file_handler.get_info_from_catalog(parameters.catalog_file_path,
consensus=False,
frequencies=True)
output.stacks_privacy(global_parameters.output_file_path, frequencies)
output.stacks_privacy(parameters.output_file_path, frequencies)
import re
import os
from radseq_analysis.file_handler import load_popmap
from radseq_analysis.file_handler import load_positions_list
from radseq_analysis.file_handler import get_individuals_order
class Parameters:
def __init__(self,
files_dir='',
positions_list=None,
output_individuals=False,
output_file_path=None,
frequencies_output_file='frequencies.tsv',
loci_matrix_output_file='haplotypes_matrix.tsv',
haplotypes_output_file='extracted_haplotypes.tsv',
individuals_output_file='individual_sequences.tsv',
positions_list=None,
popmap={},
order={},
n_males=0,
n_females=0,
species=None,
m_value=None
m_value=None,
popmap_file_path=None,
output_file_path=None,
positions_file_path=None,
sequences_file_path=None,
coverage_file_path=None,
markers_file_path=None
):
self.files_dir = files_dir
......@@ -31,3 +42,50 @@ class Parameters:
self.n_females = n_females
self.species = species
self.m_value = m_value
self.haplotypes_file_path = None
self.catalog_file_path = None
self.individual_files_paths = []
self.logs_file_path = None
self.popmap_file_path = popmap_file_path
self.output_file_path = output_file_path
self.positions_file_path = positions_file_path
self.sequences_file_path = sequences_file_path
self.coverage_file_path = coverage_file_path
self.markers_file_path = markers_file_path
def get_files(self, input_dir):
if not os.path.isdir(input_dir):
print('Error: invalid input directory (' + input_dir + ')\n')
exit()
files = [f for f in os.listdir(input_dir)]
for file in files:
haplotypes_file_re = re.search(r'batch_\d+\.haplotypes\.tsv', file)
if haplotypes_file_re:
self.haplotypes_file_path = os.path.join(input_dir, file)
catalog_file_re = re.search(r'batch_\d+\.catalog\.tags\.tsv', file)
if catalog_file_re:
self.catalog_file_path = os.path.join(input_dir, file)
individual_file_re = re.search(r'(.+)\.tags\.tsv', file)
if individual_file_re and 'catalog' not in individual_file_re.groups()[0]:
self.individual_files_paths.append(os.path.join(input_dir, file))
self.logs_file_path = os.path.join(input_dir, 'denovo_map.log')
if self.popmap_file_path and os.path.isfile(self.popmap_file_path):
self.popmap, self.n_males, self.n_females = load_popmap(self.popmap_file_path)
if self.positions_file_path and os.path.isfile(self.positions_file_path):
self.positions_list = load_positions_list(self.positions_file_path)
if self.logs_file_path and os.path.isfile(self.logs_file_path):
self.order = get_individuals_order(self.logs_file_path)
self.guess_species()
def guess_species(self):
if self.popmap_file_path:
self.species = os.path.split(self.popmap_file_path)[1]
self.species = os.path.splitext(self.species)[0]
if len(self.species.split('_')) > 1:
self.species = '_'.join(s for s in self.species.split('_')[:-1])
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment