Commit b7551cc2 authored by Romain Feron's avatar Romain Feron
Browse files

Re-implemented haplotypes extraction from the catalog file.

For now, this is only implemented in loci_matrix; it will be added to other components in the future. Eventually, there won't be a need for the haplotypes file anymore.
parent e5035dc0
from radseq_analysis.file_handler.catalog import get_info_from_catalog
from radseq_analysis.file_handler.haplotypes import get_haplotypes
from radseq_analysis.file_handler.catalog import get_haplotypes
from radseq_analysis.file_handler.individual_files import get_individual_sequences
from radseq_analysis.file_handler.popmap import load_popmap
from radseq_analysis.file_handler.positions import load_positions_list
......
......@@ -65,3 +65,34 @@ def get_info_from_catalog(catalog_path,
return correspondance_data
elif frequencies:
return frequencies_data
def get_haplotypes(parameters):
    '''
    Extract haplotypes information from a catalog file
    Input:
        - parameters: object whose catalog_file_path attribute is the path to
          a catalog file (batch_X.catalog.tsv)
    Output:
        - for each locus, its sequence and the individuals it was found in
        { Locus ID: { sequence: sequence, individuals: { individual_id: individual_locus_id } }}
    Notes:
        - the first line of the file is always discarded
          (presumably a header line -- confirm against the catalog format)
        - blank lines and lines starting with '#' are ignored
        - if a locus lists the same individual several times, the last
          individual_locus_id wins
    '''
    catalog = open_all(parameters.catalog_file_path)
    haplotypes_data = {}
    try:
        # Discard the first line before parsing loci
        catalog.readline()
        for line in catalog:
            # Comment or empty lines carry no locus and would otherwise
            # fail on the column lookups below
            if not line.strip() or line.startswith('#'):
                continue
            # Drop the line terminator so it can never leak into the last field
            tabs = line.rstrip('\r\n').split('\t')
            individuals = {}
            for individual in tabs[8].split(','):
                # Each entry is '<individual_id>_<individual_locus_id>'
                fields = individual.split('_')
                individuals[fields[0]] = fields[1]
            haplotypes_data[tabs[2]] = {'sequence': tabs[9],
                                        'individuals': individuals}
    finally:
        # Release the file handle even when a malformed line raises
        catalog.close()
    return haplotypes_data
from radseq_analysis.shared.commons import *
from radseq_analysis import file_handler
from radseq_analysis import output
......@@ -6,17 +5,19 @@ from radseq_analysis import output
def fill_loci_matrix(parameters):
print(' - Loading haplotypes from file ...')
numbers = file_handler.get_haplotypes(parameters, haplotypes=False, numbers=True)
haplotypes = file_handler.get_haplotypes(parameters)
loci_matrix = [[0 for x in range(int(parameters.n_males) + 1)] for
y in range(int(parameters.n_females) + 1)]
print(' - Generating loci matrix ...')
for locus_id, data in numbers.items():
for tag, tag_numbers in data.items():
if tag != '-':
loci_matrix[tag_numbers[FEMALES]][tag_numbers[MALES]] += 1
for locus_id, data in haplotypes.items():
n_males = len({i for i in data['individuals'].keys() if
parameters.popmap[parameters.order[i]] is 'M'})
n_females = len({i for i in data['individuals'].keys() if
parameters.popmap[parameters.order[i]] is 'F'})
loci_matrix[n_females][n_males] += 1
print(' - Generating output ...')
......
from radseq_analysis.parser.parser import Parser
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment