Commit dfb00de3 authored by Romain Feron's avatar Romain Feron
Browse files

Implemented individual coverage module

parent 74e9d630
......@@ -4,4 +4,5 @@ from radseq_analysis.file_handler.individual_files import get_individual_sequenc
from radseq_analysis.file_handler.popmap import load_popmap
from radseq_analysis.file_handler.positions import load_positions_list
from radseq_analysis.file_handler.sequences import get_sequences
from radseq_analysis.file_handler.markers import get_markers
from radseq_analysis.file_handler.coverage import get_coverage
def get_markers(markers_file_path):
'''
Extract information from a markers file (list of markers ID)
'''
markers_file = open(markers_file_path)
markers = [line[:-1] for line in markers_file if line[:-1]]
return markers
......@@ -2,5 +2,6 @@ from radseq_analysis.modules.sex_linked_haplotypes import analysis as sex_linked
from radseq_analysis.modules.loci_matrix import analysis as loci_matrix
from radseq_analysis.modules.stacks_privacy import analysis as stacks_privacy
from radseq_analysis.modules.rescue import analysis as rescue
from radseq_analysis.modules.individual_coverage import analysis as individual_coverage
from radseq_analysis.modules.visualize import analysis as visualization
from radseq_analysis.modules.analysis import analysis
......@@ -4,6 +4,7 @@ from radseq_analysis.modules import sex_linked_haplotypes
from radseq_analysis.modules import loci_matrix
from radseq_analysis.modules import stacks_privacy
from radseq_analysis.modules import rescue
from radseq_analysis.modules import individual_coverage
from radseq_analysis.modules import visualization
from radseq_analysis.file_handler import load_popmap
from radseq_analysis.file_handler import load_positions_list
......@@ -16,6 +17,7 @@ def analysis(input_dir=None,
positions_file_path=None,
sequences_file_path=None,
coverage_file_path=None,
markers_file_path=None,
analysis=None):
parameters = Parameters(files_dir=input_dir,
......@@ -50,5 +52,7 @@ def analysis(input_dir=None,
stacks_privacy(catalog_file_path, parameters)
elif analysis == 'rescue':
rescue(sequences_file_path, catalog_file_path, individual_files_paths, coverage_file_path, parameters)
elif analysis == 'coverage':
individual_coverage(markers_file_path, catalog_file_path, individual_files_paths, coverage_file_path, parameters)
elif analysis == 'visualize':
visualization(input_file_path, popmap_file_path, output_file_path, parameters)
from radseq_analysis import file_handler
from radseq_analysis import output
from radseq_analysis.shared import Stack
import os
from collections import defaultdict
def get_individual_names(individual_files_paths):
return [os.path.split(file)[1].split('.')[0] for file in individual_files_paths]
def initialize_markers(markers_list):
markers = {}
for marker_id in markers_list:
s = Stack()
s.add_haplotype(marker_id)
markers[marker_id] = s
return markers
def get_individual_data(individual_files_paths, correspondance, bar=False):
individual_data = {}
try:
from progress.bar import Bar
bar = True
except ImportError:
bar = False
if bar:
progress_bar = Bar(' - Extracting individual data :', max=len(individual_files_paths))
else:
print(' - Extracting individual data ...')
for individual_file_path in individual_files_paths:
if bar:
progress_bar.next()
name = os.path.split(individual_file_path)[1].split('.')[0]
data = file_handler.get_individual_sequences(individual_file_path,
correspondance)
individual_data[name] = data
if bar:
print()
return individual_data
def fill_individual_data(markers, individual_data, coverage):
for stack_id, stack in markers.items():
for haplotype_id, haplotype in stack.haplotypes.items():
temp = defaultdict(int)
for name, data in individual_data.items():
if haplotype_id in data.keys():
if coverage:
temp[name] = int(int(data[haplotype_id]) / coverage[name])
else:
temp[name] = data[haplotype_id]
else:
temp[name] = 0
markers[stack_id].haplotypes[haplotype_id].individuals = temp
def analysis(markers_file_path, catalog_file_path,
individual_files_paths, coverage_file_path, global_parameters):
print(' - Loading extracted markers and catalog data ...')
coverage = None
if coverage_file_path:
coverage = file_handler.get_coverage(coverage_file_path)
markers_list = file_handler.get_markers(markers_file_path)
correspondance = file_handler.get_info_from_catalog(catalog_file_path,
loci_list=markers_list,
consensus=False,
correspondance=True)
individual_names = get_individual_names(individual_files_paths)
print(' - Creating stacks ...')
markers = initialize_markers(markers_list)
individual_data = get_individual_data(individual_files_paths, correspondance)
print(' - Merging individual data in stacks ...')
fill_individual_data(markers, individual_data, coverage)
output.markers(global_parameters.output_file_path, markers, individual_names)
......@@ -2,3 +2,4 @@ from radseq_analysis.output.loci_matrix import loci_matrix
from radseq_analysis.output.sex_linked_haplotypes import sex_linked_haplotypes
from radseq_analysis.output.stacks_privacy import stacks_privacy
from radseq_analysis.output.stacks import stacks
from radseq_analysis.output.markers import markers
# TODO: sort output by marker and individuals ?
def markers(output_file_path, markers_data, individual_names):
'''
Output markers data in the following format:
TODO
'''
output_file = open(output_file_path, 'w')
output_file.write('Marker_ID' + '\t')
output_file.write('\t'.join(individual_names) + '\n')
for marker_id, marker in markers_data.items():
output_file.write(marker_id + '\t')
output_file.write('\t'.join([str(marker.haplotypes[marker_id].individuals[name]) for
name in individual_names]) + '\n')
......@@ -15,6 +15,7 @@ Command: heatmap\tGenerates a matrix of haplotypes sex distribution
\t haplotypes\tExtract haplotypes present in a given number of males and females
\t frequencies\tCalculate haplotypes frequencies distribution in the population
\t rescue\tRegroup stacks into alleles after analysis
\t coverage\tExtract individual coverage for a set of markers
\t visualize\tVisualize analyses results using R
'''
)
......@@ -169,6 +170,43 @@ Options: -i\t--input-folder\tPath to a folder containing the output of denovo_m
coverage_file_path=args.coverage_file,
analysis='rescue')
def coverage(self):
parser = argparse.ArgumentParser(
description='Extract individual coverage for a set of markers',
usage='''python3 radseq_analysis.py coverage -i input_folder -a markers_file [-c coverage_file -o output_file]
Options: -i\t--input-folder\tPath to a folder containing the output of denovo_map
\t -a\t--markers\tPath to markers file (list of markers to extract)
\t -c\t--coverage-file\tPath to a coverage file (result of coverage analysis)
\t -o\t--output-file\tPath to output file (default: markers_coverage.tsv)
''')
parser.add_argument('--input-folder', '-i',
help='Path to a folder containing the output of denovo_map')
parser.add_argument('--markers', '-a',
help='Path to markers file')
parser.add_argument('--coverage-file', '-c',
help='Path to coverage file', nargs='?')
parser.add_argument('--output-file', '-o',
help='Path to output file', nargs='?',
default='extracted_alleles.tsv')
args = parser.parse_args(sys.argv[2:])
if not args.input_folder or not os.path.isdir(args.input_folder):
print('\nError: no valid input folder specified\n')
parser.print_usage()
print()
exit(1)
if not args.markers or not os.path.isfile(args.markers):
print('\nError: no valid markers file specified\n')
parser.print_usage()
print()
exit(1)
analysis(input_dir=args.input_folder,
markers_file_path=args.markers,
output_file_path=args.output_file,
coverage_file_path=args.coverage_file,
analysis='coverage')
def visualize(self):
parser = argparse.ArgumentParser(
description='Visualize analyses results using R',
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment