Commit 2d1d170a authored by Romain Feron's avatar Romain Feron
Browse files

Added command line with number of markers to the markers table. First...

Added command line with number of markers to the markers table. First re-implementation of 'loci'. Added zenodo DOI to readme
parent f3ac4bdb
[![DOI](https://zenodo.org/badge/86720601.svg)](https://zenodo.org/badge/latestdoi/86720601)
# RADSex
Current pre-release : 0.2.0
The RADSex pipeline is **currently under development** and has not been officially released yet.
Missing features are been implemented, some bugs are to be expected in this current development version, and parameters are subject to change.
Please contact me by email or on Github, or open an issue if you encounter bugs or would like to discuss a feature !
......
#include "depth.h"
#include <iostream>
void depth(Parameters& parameters) {
......@@ -9,21 +10,35 @@ void depth(Parameters& parameters) {
if (input_file.is_open()) {
// First line is the header. The header is parsed to get the sex of each field in the table.
std::vector<std::string> line, header;
std::vector<std::string> line;
std::string temp = "";
// First line is a comment with number of markers in the table
std::getline(input_file, temp);
line = split(temp, " : ");
if (line.size() == 2) uint n_markers = static_cast<uint>(std::stoi(line[1]));
// Second line is the header. The header is parsed to get the sex of each field in the table.
std::getline(input_file, temp);
header = split(temp, "\t");
line = split(temp, "\t");
// Map with column number --> index of sex_count (0 = male, 1 = female, 2 = no sex)
std::unordered_map<uint, uint> sex_columns = get_column_sex(popmap, line);
// Total number of reads in each individual
std::unordered_map<std::string, uint> individual_depths;
// Individual depths for a single marker
std::unordered_map<std::string, ulong> marker_depths;
// Table of depth frequencies
std::unordered_map<std::string, std::vector<ulong>> median_depth;
// Define variables used to read the file
char buffer[65536];
uint k = 0, field_n = 0;
sd_table results;
ulong depth = 0;
uint n_individuals = 0;
do {
......@@ -37,16 +52,33 @@ void depth(Parameters& parameters) {
switch(buffer[i]) {
case '\t': // New field
if (sex_columns[field_n] != 2) individual_depths[header[field_n]] += std::stoul(temp); // Increment the appropriate counter
if (sex_columns[field_n] != 2) {
depth = std::min(static_cast<ulong>(500), std::stoul(temp));
individual_depths[line[field_n]] += depth;
marker_depths[line[field_n]] = depth;
if(depth > 0) ++n_individuals;
}
temp = "";
++field_n;
break;
case '\n': // New line (also a new field)
if (sex_columns[field_n] != 2) individual_depths[header[field_n]] += std::stoul(temp); // Increment the appropriate counter
if (sex_columns[field_n] != 2) {
depth = std::min(static_cast<ulong>(500), std::stoul(temp));
individual_depths[line[field_n]] += depth;
marker_depths[line[field_n]] = depth;
if(depth > 1) ++n_individuals;
}
if (n_individuals == marker_depths.size()) { // Only keep markers present in all individuals to estimate expected depths
for (auto m: marker_depths) {
median_depth[m.first].push_back(m.second);
}
}
// Reset variables
temp = "";
field_n = 0;
n_individuals = 0;
break;
default:
......@@ -64,12 +96,36 @@ void depth(Parameters& parameters) {
if (output_file.is_open()) {
output_file << "Individual\tSex\tMarkers\n";
output_file << "Individual\tSex\tMarkers\tDepth\n";
for (auto i: median_depth) {
for (auto i: individual_depths) {
output_file << i.first << "\t";
(popmap[i.first]) ? output_file << "M" : output_file << "F";
output_file << "\t" << i.second << '\n';
output_file << "\t" << individual_depths[i.first] << "\t";
std::sort(i.second.begin(), i.second.end()); // Sort depth vector for easy median finding
if (i.second.size() % 2 == 0) { // Find median for an even sized vector (average of two possible median points)
const auto median_it1 = i.second.begin() + i.second.size() / 2 - 1;
const auto median_it2 = i.second.begin() + i.second.size() / 2;
std::nth_element(i.second.begin(), median_it1 , i.second.end());
const auto e1 = *median_it1;
std::nth_element(i.second.begin(), median_it2 , i.second.end());
const auto e2 = *median_it2;
output_file << (e1 + e2) / 2 << "\n";
} else { // Find median for odd sized vector
const auto median_it = i.second.begin() + i.second.size() / 2;
std::nth_element(i.second.begin(), median_it , i.second.end());
output_file << *median_it << "\n";
}
}
}
}
......
#pragma once
#include <algorithm>
#include <fstream>
#include <string>
#include <unordered_map>
......
......@@ -19,9 +19,15 @@ void distrib(Parameters& parameters) {
if (input_file) {
// First line is the header. The header is parsed to get the sex of each field in the table.
std::vector<std::string> line;
std::string temp = "";
// First line is a comment with number of markers in the table
std::getline(input_file, temp);
line = split(temp, " : ");
if (line.size() == 2) uint n_markers = static_cast<uint>(std::stoi(line[1]));
// Second line is the header. The header is parsed to get the sex of each field in the table.
std::getline(input_file, temp);
line = split(temp, "\t");
......@@ -59,6 +65,7 @@ void distrib(Parameters& parameters) {
field_n = 0;
sex_count[0] = 0;
sex_count[1] = 0;
sex_count[2] = 0;
break;
default:
......
......@@ -14,9 +14,17 @@ void freq(Parameters &parameters) {
std::map<uint, uint> results; // Results --> {Frequency: count}
// First line of input file is the header, which is not used in this analysis
std::vector<std::string> line;
std::string temp = "";
// First line is a comment with number of markers in the table
std::getline(input_file, temp);
line = split(temp, " : ");
if (line.size() == 2) uint n_markers = static_cast<uint>(std::stoi(line[1]));
// Second line is the header, not used in this analysis
std::getline(input_file, temp);
line = split(temp, "\t");
// Define variables used to read the file
char buffer[65536];
......@@ -66,4 +74,5 @@ void freq(Parameters &parameters) {
output_file.close();
input_file.close();
}
}
#include "loci.h"
//#include "loci.h"
void loci(Parameters& parameters) {
//void loci(Parameters& parameters) {
std::mutex results_mutex;
// std::mutex results_mutex;
std::unordered_map<std::string, std::vector<Locus>> results;
// std::unordered_map<std::string, std::vector<Locus>> results;
std::vector<std::string> header;
// std::vector<std::string> header;
// std::cout << " - Loading subset of loci :" << std::endl;
// par = "input_file_path";
// std::string input_file_path = parameters.get_value_from_name<std::string>(par);
// std::vector<Locus> sequences = load_coverage_matrix(input_file_path, 1, false, header);
//// std::cout << " - Loading subset of loci :" << std::endl;
//// par = "input_file_path";
//// std::string input_file_path = parameters.get_value_from_name<std::string>(par);
//// std::vector<Locus> sequences = load_coverage_matrix(input_file_path, 1, false, header);
// std::cout << " - Loading coverage table :" << std::endl;
//// std::cout << " - Loading coverage table :" << std::endl;
// std::vector<Locus> coverage_matrix = load_coverage_matrix(coverage_matrix_path, min_cov, true, header);
//// std::vector<Locus> coverage_matrix = load_coverage_matrix(coverage_matrix_path, min_cov, true, header);
// par = "max_distance";
// int max_distance = parameters.get_value_from_name<int>(par) + 1; // +1 to compare with < instead of <= later
//// par = "max_distance";
//// int max_distance = parameters.get_value_from_name<int>(par) + 1; // +1 to compare with < instead of <= later
// std::cout << std::endl << " - Starting alignments" << std::endl;
//// std::cout << std::endl << " - Starting alignments" << std::endl;
// auto sequence = sequences.begin();
//// auto sequence = sequences.begin();
// // Create and run all sequence processors
// par = "n_threads";
// std::vector<std::thread> threads;
// for (int i=0; i<parameters.get_value_from_name<int>(par); ++i) {
// threads.push_back(std::thread(sequence_processor, std::ref(sequence), std::ref(sequences), std::ref(coverage_matrix), std::ref(results), std::ref(results_mutex), max_distance));
// }
// for (auto &t : threads) t.join();
// par = "output_file_path";
// std::string output_file_path = parameters.get_value_from_name<std::string>(par);
// output_group_loci(output_file_path, results, header);
}
//// // Create and run all sequence processors
//// par = "n_threads";
//// std::vector<std::thread> threads;
//// for (int i=0; i<parameters.get_value_from_name<int>(par); ++i) {
//// threads.push_back(std::thread(sequence_processor, std::ref(sequence), std::ref(sequences), std::ref(coverage_matrix), std::ref(results), std::ref(results_mutex), max_distance));
//// }
//// for (auto &t : threads) t.join();
//// par = "output_file_path";
//// std::string output_file_path = parameters.get_value_from_name<std::string>(par);
//// output_group_loci(output_file_path, results, header);
//}
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print, std::vector<std::string>& header, float freq_het, float freq_hom, float range_het, float range_hom) {
/* Load a coverage matrix in memory
*/
std::ifstream input_file(file_path);
std::vector<Locus> coverage_matrix;
//std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print, std::vector<std::string>& header, float freq_het, float freq_hom, float range_het, float range_hom) {
if (input_file) {
// /* Load a coverage matrix in memory
// */
// First line is the header. The header is used when printing the output.
std::string temp;
std::getline(input_file, temp);
header = split(temp,"\t");
temp = "";
// std::ifstream input_file(file_path);
// std::vector<Locus> coverage_matrix;
// Define variables used to read the file
char buffer[65536];
uint k = 0, field_n = 0, seq_count = 0;
Locus locus;
bool keep_locus = false;
// if (input_file) {
do {
// // First line is the header. The header is used when printing the output.
// std::string temp;
// std::getline(input_file, temp);
// header = split(temp,"\t");
// temp = "";
// Read a chunk of size given by the buffer
input_file.read(buffer, sizeof(buffer));
k = static_cast<uint>(input_file.gcount());
// // Define variables used to read the file
// char buffer[65536];
// uint k = 0, field_n = 0, seq_count = 0;
// Locus locus;
// bool keep_locus = false;
for (uint i=0; i<k; ++i) {
// do {
// Read the buffer character by character
switch(buffer[i]) {
// // Read a chunk of size given by the buffer
// input_file.read(buffer, sizeof(buffer));
// k = static_cast<uint>(input_file.gcount());
case '\t': // New field
// for (uint i=0; i<k; ++i) {
switch(field_n) {
case 0:
locus.id = temp;
break;
case 1:
locus.sequence =temp;
break;
default:
locus.coverage.push_back(temp);
break;
}
// // Read the buffer character by character
// switch(buffer[i]) {
temp = "";
++field_n;
break;
// case '\t': // New field
case '\n': // New line (also a new field)
// switch(field_n) {
// case 0:
// locus.id = temp;
// break;
// case 1:
// locus.sequence =temp;
// break;
// default:
// locus.coverage.push_back(temp);
// break;
// }
locus.coverage.push_back(temp);
keep_locus = false;
// temp = "";
// ++field_n;
// break;
for (auto c: locus.coverage) {
if (std::stoi(c) > min_cov) {
keep_locus = true;
break;
}
}
// case '\n': // New line (also a new field)
// locus.coverage.push_back(temp);
// keep_locus = false;
// for (auto c: locus.coverage) {
// if (std::stoi(c) > min_cov) {
// keep_locus = true;
// break;
// }
// }
if (keep_locus) coverage_matrix.push_back(locus);
if (seq_count % 1000000 == 0 and seq_count > 0 and print) std::cout << " - Processed " << seq_count / 1000000 <<
" M. lines and stored " << coverage_matrix.size() << " sequences\n";
// Reset variables
locus.id = "";
locus.sequence = "";
locus.coverage.resize(0);
temp = "";
field_n = 0;
++seq_count;
break;
// if (keep_locus) coverage_matrix.push_back(locus);
// if (seq_count % 1000000 == 0 and seq_count > 0 and print) std::cout << " - Processed " << seq_count / 1000000 <<
// " M. lines and stored " << coverage_matrix.size() << " sequences\n";
// // Reset variables
// locus.id = "";
// locus.sequence = "";
// locus.coverage.resize(0);
// temp = "";
// field_n = 0;
// ++seq_count;
// break;
default:
temp += buffer[i];
break;
}
}
} while (input_file);
}
return coverage_matrix;
}
// default:
// temp += buffer[i];
// break;
// }
// }
// } while (input_file);
// }
// return coverage_matrix;
//}
void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix,
std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
//void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix,
// std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
while(sequence != sequences.end()) {
// while(sequence != sequences.end()) {
results_mutex.lock();
Locus locus = *sequence;
++sequence;
results_mutex.unlock();
process_sequence(locus, coverage_matrix, results, results_mutex, max_distance);
}
// results_mutex.lock();
// Locus locus = *sequence;
// ++sequence;
// results_mutex.unlock();
// process_sequence(locus, coverage_matrix, results, results_mutex, max_distance);
// }
}
//}
void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
//void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
uint seq_len = static_cast<uint>(locus.sequence.size());
std::unordered_map<std::string, std::vector<Locus>> temp_results;
const char* seq = locus.sequence.c_str();
// uint seq_len = static_cast<uint>(locus.sequence.size());
// std::unordered_map<std::string, std::vector<Locus>> temp_results;
// const char* seq = locus.sequence.c_str();
EdlibAlignResult alignment_result;
for (auto l: coverage_matrix) {
alignment_result = edlibAlign(seq, seq_len, l.sequence.c_str(), seq_len, edlibDefaultAlignConfig());
if (alignment_result.status == EDLIB_STATUS_OK and alignment_result.editDistance < max_distance) {
temp_results[locus.id].push_back(l);
}
edlibFreeAlignResult(alignment_result);
}
// EdlibAlignResult alignment_result;
// for (auto l: coverage_matrix) {
// alignment_result = edlibAlign(seq, seq_len, l.sequence.c_str(), seq_len, edlibDefaultAlignConfig());
// if (alignment_result.status == EDLIB_STATUS_OK and alignment_result.editDistance < max_distance) {
// temp_results[locus.id].push_back(l);
// }
// edlibFreeAlignResult(alignment_result);
// }
results_mutex.lock();
for (auto l: temp_results) results[l.first] = l.second;
results_mutex.unlock();
}
// results_mutex.lock();
// for (auto l: temp_results) results[l.first] = l.second;
// results_mutex.unlock();
//}
void filter(std::unordered_map<std::string, std::vector<Locus>>& results) {
//void filter(std::unordered_map<std::string, std::vector<Locus>>& results) {
}
//}
#pragma once
#include <fstream>
#include <mutex>
#include <numeric>
#include <string>
#include <vector>
#include <thread>
......@@ -13,13 +14,3 @@
// Main function implementing the analysis
void loci(Parameters& parameters);
// Load the coverage matrix in memory
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_depth, bool print, std::vector<std::string>& header, float freq_het, float freq_hom, float range_het, float range_hom);
// Pick the next sequence from the sequences vector and process it
void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix,
std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance);
// Process a sequence (look for similar sequences)
void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance);
#include "loci.h"
void loci(Parameters& parameters) {
std::ofstream output_file;
output_file.open(parameters.output_file_path);
// Load popmap
std::unordered_map<std::string, bool> popmap = load_popmap(parameters);
uint n_indiv_total[2] {0, 0};
for (auto i: popmap) ++n_indiv_total[i.second];
uint min_indiv[2] {uint(n_indiv_total[0] * parameters.loci_min_individual_frequency), uint(n_indiv_total[1] * parameters.loci_min_individual_frequency)};
// Load individual depths from depth file
std::vector<std::string> header;
std::unordered_map<std::string, float> individual_depths_table;
std::ifstream depths_file;
depths_file.open(parameters.depths_file_path);
std::string temp = "";
std::getline(depths_file, temp); // Header line not used in this analysis
if (depths_file) {
while (std::getline(depths_file, temp)) {
header = split(temp, "\t");
if (header.size() == 4) individual_depths_table[header[0]] = std::stof(header[2]);
}
}
float sum = 0;
for (auto i: individual_depths_table) sum += i.second;
sum /= (n_indiv_total[0] + n_indiv_total[1]);
for (auto& i: individual_depths_table) i.second /= sum;
// Load subset of markers
std::ifstream markers_subset_file;
std::vector<std::string> fields;
markers_subset_file.open(parameters.subset_file_path);
std::vector<Locus> markers_subset;
bool heterogametic_sex = 0; // 0 --> females, 1 --> males
uint sex_count[2] {0, 0};
float sex_bias = 0;
uint n_markers = 0;
std::getline(markers_subset_file, temp); // Header line
header = split(temp, "\t");
while(std::getline(markers_subset_file, temp)) {
fields = split(temp, "\t");
Locus locus;
locus.id = fields[0];
locus.sequence = fields[1];
for (uint i=2; i< fields.size(); ++i) locus.coverage.push_back(fields[i]);
locus.coverage = std::vector<std::string>(fields.begin() + 2, fields.end());
markers_subset.push_back(locus);
for (uint i=0; i< locus.coverage.size(); ++i) {
sex_count[popmap[header[i+2]]] += (std::stoul(locus.coverage[i]) > parameters.min_depth);
}
sex_bias += float(sex_count[1]) / float(n_indiv_total[1]) - float(sex_count[0]) / float(n_indiv_total[0]); // Average sex bias : > 0 when male-biased and < 0 when female-biased
++n_markers;
sex_count[0] = 0;
sex_count[1] = 0;
}
sex_bias /= n_markers;
heterogametic_sex = (sex_bias < 0); // We don't handle sex-bias = 0 for now.
// Get alleles with sex-biased depths from the markers depth table
std::ifstream markers_table_file;
markers_table_file.open(parameters.markers_table_path);
std::vector<Locus> potential_markers;
float rel_depth = 0.0;
if (markers_table_file) {
// First line is a comment with number of markers in the table
std::getline(markers_table_file, temp);
header = split(temp, " : ");
if (header.size() == 2) uint n_markers = static_cast<uint>(std::stoi(header[1]));
// Second line is the header. The header is used when printing the output.
std::getline(markers_table_file, temp);
output_file << temp << std::endl;
header = split(temp,"\t");
std::vector<ulong> individual_depths(header.size());
for (uint i=0; i<header.size(); ++i) { // Transform individual depths table in a vector with indexing corresponding to field index in the markers table
(individual_depths_table.find(header[i]) != individual_depths_table.end()) ? individual_depths[i] = individual_depths_table[header[i]] : individual_depths[i] = 0;
}
// Map with column number --> index of sex_count (0 = male, 1 = female, 2 = no sex)
std::unordered_map<uint, uint> sex_columns = get_column_sex(popmap, header);
// Define variables used to read the file
temp = "";
char buffer[65536];
uint k = 0, field_n = 0, seq_count = 0;
Locus locus;
float relative_depth[2] {0.0, 0.0};
uint n_individuals_marker[2] {0, 0};
do {
// Read a chunk of size given by the buffer
markers_table_file.read(buffer, sizeof(buffer));
k = static_cast<uint>(markers_table_file.gcount());
for (uint i=0; i<k; ++i) {
// Read the buffer character by character
switch(buffer[i]) {
case '\t': // New field