Commit 2d1d170a authored by Romain Feron's avatar Romain Feron
Browse files

Added command line with number of markers to the markers table. First...

Added command line with number of markers to the markers table. First re-implementation of 'loci'. Added zenodo DOI to readme
parent f3ac4bdb
[![DOI](https://zenodo.org/badge/86720601.svg)](https://zenodo.org/badge/latestdoi/86720601)
# RADSex # RADSex
Current pre-release : 0.2.0
The RADSex pipeline is **currently under development** and has not been officially released yet. The RADSex pipeline is **currently under development** and has not been officially released yet.
Missing features are been implemented, some bugs are to be expected in this current development version, and parameters are subject to change. Missing features are been implemented, some bugs are to be expected in this current development version, and parameters are subject to change.
Please contact me by email or on Github, or open an issue if you encounter bugs or would like to discuss a feature ! Please contact me by email or on Github, or open an issue if you encounter bugs or would like to discuss a feature !
......
#include "depth.h" #include "depth.h"
#include <iostream>
void depth(Parameters& parameters) { void depth(Parameters& parameters) {
...@@ -9,21 +10,35 @@ void depth(Parameters& parameters) { ...@@ -9,21 +10,35 @@ void depth(Parameters& parameters) {
if (input_file.is_open()) { if (input_file.is_open()) {
// First line is the header. The header is parsed to get the sex of each field in the table. std::vector<std::string> line;
std::vector<std::string> line, header;
std::string temp = ""; std::string temp = "";
// First line is a comment with number of markers in the table
std::getline(input_file, temp);
line = split(temp, " : ");
if (line.size() == 2) uint n_markers = static_cast<uint>(std::stoi(line[1]));
// Second line is the header. The header is parsed to get the sex of each field in the table.
std::getline(input_file, temp); std::getline(input_file, temp);
header = split(temp, "\t"); line = split(temp, "\t");
// Map with column number --> index of sex_count (0 = male, 1 = female, 2 = no sex) // Map with column number --> index of sex_count (0 = male, 1 = female, 2 = no sex)
std::unordered_map<uint, uint> sex_columns = get_column_sex(popmap, line); std::unordered_map<uint, uint> sex_columns = get_column_sex(popmap, line);
// Total number of reads in each individual
std::unordered_map<std::string, uint> individual_depths; std::unordered_map<std::string, uint> individual_depths;
// Individual depths for a single marker
std::unordered_map<std::string, ulong> marker_depths;
// Table of depth frequencies
std::unordered_map<std::string, std::vector<ulong>> median_depth;
// Define variables used to read the file // Define variables used to read the file
char buffer[65536]; char buffer[65536];
uint k = 0, field_n = 0; uint k = 0, field_n = 0;
sd_table results; ulong depth = 0;
uint n_individuals = 0;
do { do {
...@@ -37,16 +52,33 @@ void depth(Parameters& parameters) { ...@@ -37,16 +52,33 @@ void depth(Parameters& parameters) {
switch(buffer[i]) { switch(buffer[i]) {
case '\t': // New field case '\t': // New field
if (sex_columns[field_n] != 2) individual_depths[header[field_n]] += std::stoul(temp); // Increment the appropriate counter if (sex_columns[field_n] != 2) {
depth = std::min(static_cast<ulong>(500), std::stoul(temp));
individual_depths[line[field_n]] += depth;
marker_depths[line[field_n]] = depth;
if(depth > 0) ++n_individuals;
}
temp = ""; temp = "";
++field_n; ++field_n;
break; break;
case '\n': // New line (also a new field) case '\n': // New line (also a new field)
if (sex_columns[field_n] != 2) individual_depths[header[field_n]] += std::stoul(temp); // Increment the appropriate counter if (sex_columns[field_n] != 2) {
depth = std::min(static_cast<ulong>(500), std::stoul(temp));
individual_depths[line[field_n]] += depth;
marker_depths[line[field_n]] = depth;
if(depth > 1) ++n_individuals;
}
if (n_individuals == marker_depths.size()) { // Only keep markers present in all individuals to estimate expected depths
for (auto m: marker_depths) {
median_depth[m.first].push_back(m.second);
}
}
// Reset variables // Reset variables
temp = ""; temp = "";
field_n = 0; field_n = 0;
n_individuals = 0;
break; break;
default: default:
...@@ -64,12 +96,36 @@ void depth(Parameters& parameters) { ...@@ -64,12 +96,36 @@ void depth(Parameters& parameters) {
if (output_file.is_open()) { if (output_file.is_open()) {
output_file << "Individual\tSex\tMarkers\n"; output_file << "Individual\tSex\tMarkers\tDepth\n";
for (auto i: median_depth) {
for (auto i: individual_depths) {
output_file << i.first << "\t"; output_file << i.first << "\t";
(popmap[i.first]) ? output_file << "M" : output_file << "F"; (popmap[i.first]) ? output_file << "M" : output_file << "F";
output_file << "\t" << i.second << '\n'; output_file << "\t" << individual_depths[i.first] << "\t";
std::sort(i.second.begin(), i.second.end()); // Sort depth vector for easy median finding
if (i.second.size() % 2 == 0) { // Find median for an even sized vector (average of two possible median points)
const auto median_it1 = i.second.begin() + i.second.size() / 2 - 1;
const auto median_it2 = i.second.begin() + i.second.size() / 2;
std::nth_element(i.second.begin(), median_it1 , i.second.end());
const auto e1 = *median_it1;
std::nth_element(i.second.begin(), median_it2 , i.second.end());
const auto e2 = *median_it2;
output_file << (e1 + e2) / 2 << "\n";
} else { // Find median for odd sized vector
const auto median_it = i.second.begin() + i.second.size() / 2;
std::nth_element(i.second.begin(), median_it , i.second.end());
output_file << *median_it << "\n";
}
} }
} }
} }
......
#pragma once #pragma once
#include <algorithm>
#include <fstream> #include <fstream>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
......
...@@ -19,9 +19,15 @@ void distrib(Parameters& parameters) { ...@@ -19,9 +19,15 @@ void distrib(Parameters& parameters) {
if (input_file) { if (input_file) {
// First line is the header. The header is parsed to get the sex of each field in the table.
std::vector<std::string> line; std::vector<std::string> line;
std::string temp = ""; std::string temp = "";
// First line is a comment with number of markers in the table
std::getline(input_file, temp);
line = split(temp, " : ");
if (line.size() == 2) uint n_markers = static_cast<uint>(std::stoi(line[1]));
// Second line is the header. The header is parsed to get the sex of each field in the table.
std::getline(input_file, temp); std::getline(input_file, temp);
line = split(temp, "\t"); line = split(temp, "\t");
...@@ -59,6 +65,7 @@ void distrib(Parameters& parameters) { ...@@ -59,6 +65,7 @@ void distrib(Parameters& parameters) {
field_n = 0; field_n = 0;
sex_count[0] = 0; sex_count[0] = 0;
sex_count[1] = 0; sex_count[1] = 0;
sex_count[2] = 0;
break; break;
default: default:
......
...@@ -14,9 +14,17 @@ void freq(Parameters &parameters) { ...@@ -14,9 +14,17 @@ void freq(Parameters &parameters) {
std::map<uint, uint> results; // Results --> {Frequency: count} std::map<uint, uint> results; // Results --> {Frequency: count}
// First line of input file is the header, which is not used in this analysis std::vector<std::string> line;
std::string temp = ""; std::string temp = "";
// First line is a comment with number of markers in the table
std::getline(input_file, temp);
line = split(temp, " : ");
if (line.size() == 2) uint n_markers = static_cast<uint>(std::stoi(line[1]));
// Second line is the header, not used in this analysis
std::getline(input_file, temp); std::getline(input_file, temp);
line = split(temp, "\t");
// Define variables used to read the file // Define variables used to read the file
char buffer[65536]; char buffer[65536];
...@@ -66,4 +74,5 @@ void freq(Parameters &parameters) { ...@@ -66,4 +74,5 @@ void freq(Parameters &parameters) {
output_file.close(); output_file.close();
input_file.close(); input_file.close();
} }
} }
#include "loci.h" //#include "loci.h"
void loci(Parameters& parameters) { //void loci(Parameters& parameters) {
std::mutex results_mutex; // std::mutex results_mutex;
std::unordered_map<std::string, std::vector<Locus>> results; // std::unordered_map<std::string, std::vector<Locus>> results;
std::vector<std::string> header; // std::vector<std::string> header;
// std::cout << " - Loading subset of loci :" << std::endl; //// std::cout << " - Loading subset of loci :" << std::endl;
// par = "input_file_path"; //// par = "input_file_path";
// std::string input_file_path = parameters.get_value_from_name<std::string>(par); //// std::string input_file_path = parameters.get_value_from_name<std::string>(par);
// std::vector<Locus> sequences = load_coverage_matrix(input_file_path, 1, false, header); //// std::vector<Locus> sequences = load_coverage_matrix(input_file_path, 1, false, header);
// std::cout << " - Loading coverage table :" << std::endl; //// std::cout << " - Loading coverage table :" << std::endl;
// std::vector<Locus> coverage_matrix = load_coverage_matrix(coverage_matrix_path, min_cov, true, header); //// std::vector<Locus> coverage_matrix = load_coverage_matrix(coverage_matrix_path, min_cov, true, header);
// par = "max_distance"; //// par = "max_distance";
// int max_distance = parameters.get_value_from_name<int>(par) + 1; // +1 to compare with < instead of <= later //// int max_distance = parameters.get_value_from_name<int>(par) + 1; // +1 to compare with < instead of <= later
// std::cout << std::endl << " - Starting alignments" << std::endl; //// std::cout << std::endl << " - Starting alignments" << std::endl;
// auto sequence = sequences.begin(); //// auto sequence = sequences.begin();
// // Create and run all sequence processors //// // Create and run all sequence processors
// par = "n_threads"; //// par = "n_threads";
// std::vector<std::thread> threads; //// std::vector<std::thread> threads;
// for (int i=0; i<parameters.get_value_from_name<int>(par); ++i) { //// for (int i=0; i<parameters.get_value_from_name<int>(par); ++i) {
// threads.push_back(std::thread(sequence_processor, std::ref(sequence), std::ref(sequences), std::ref(coverage_matrix), std::ref(results), std::ref(results_mutex), max_distance)); //// threads.push_back(std::thread(sequence_processor, std::ref(sequence), std::ref(sequences), std::ref(coverage_matrix), std::ref(results), std::ref(results_mutex), max_distance));
// } //// }
// for (auto &t : threads) t.join();
// par = "output_file_path";
// std::string output_file_path = parameters.get_value_from_name<std::string>(par);
// output_group_loci(output_file_path, results, header);
}
//// for (auto &t : threads) t.join();
//// par = "output_file_path";
//// std::string output_file_path = parameters.get_value_from_name<std::string>(par);
//// output_group_loci(output_file_path, results, header);
//}
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print, std::vector<std::string>& header, float freq_het, float freq_hom, float range_het, float range_hom) {
/* Load a coverage matrix in memory
*/
std::ifstream input_file(file_path); //std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print, std::vector<std::string>& header, float freq_het, float freq_hom, float range_het, float range_hom) {
std::vector<Locus> coverage_matrix;
if (input_file) { // /* Load a coverage matrix in memory
// */
// First line is the header. The header is used when printing the output. // std::ifstream input_file(file_path);
std::string temp; // std::vector<Locus> coverage_matrix;
std::getline(input_file, temp);
header = split(temp,"\t");
temp = "";
// Define variables used to read the file // if (input_file) {
char buffer[65536];
uint k = 0, field_n = 0, seq_count = 0;
Locus locus;
bool keep_locus = false;
do { // // First line is the header. The header is used when printing the output.
// std::string temp;
// std::getline(input_file, temp);
// header = split(temp,"\t");
// temp = "";
// Read a chunk of size given by the buffer // // Define variables used to read the file
input_file.read(buffer, sizeof(buffer)); // char buffer[65536];
k = static_cast<uint>(input_file.gcount()); // uint k = 0, field_n = 0, seq_count = 0;
// Locus locus;
// bool keep_locus = false;
for (uint i=0; i<k; ++i) { // do {
// Read the buffer character by character // // Read a chunk of size given by the buffer
switch(buffer[i]) { // input_file.read(buffer, sizeof(buffer));
// k = static_cast<uint>(input_file.gcount());
case '\t': // New field // for (uint i=0; i<k; ++i) {
switch(field_n) { // // Read the buffer character by character
case 0: // switch(buffer[i]) {
locus.id = temp;
break;
case 1:
locus.sequence =temp;
break;
default:
locus.coverage.push_back(temp);
break;
}
temp = ""; // case '\t': // New field
++field_n;
break;
case '\n': // New line (also a new field) // switch(field_n) {
// case 0:
// locus.id = temp;
// break;
// case 1:
// locus.sequence =temp;
// break;
// default:
// locus.coverage.push_back(temp);
// break;
// }
locus.coverage.push_back(temp); // temp = "";
keep_locus = false; // ++field_n;
// break;
for (auto c: locus.coverage) { // case '\n': // New line (also a new field)
if (std::stoi(c) > min_cov) {
keep_locus = true;
break;
}
}
// locus.coverage.push_back(temp);
// keep_locus = false;
// for (auto c: locus.coverage) {
// if (std::stoi(c) > min_cov) {
// keep_locus = true;
// break;
// }
// }
if (keep_locus) coverage_matrix.push_back(locus); // if (keep_locus) coverage_matrix.push_back(locus);
if (seq_count % 1000000 == 0 and seq_count > 0 and print) std::cout << " - Processed " << seq_count / 1000000 << // if (seq_count % 1000000 == 0 and seq_count > 0 and print) std::cout << " - Processed " << seq_count / 1000000 <<
" M. lines and stored " << coverage_matrix.size() << " sequences\n"; // " M. lines and stored " << coverage_matrix.size() << " sequences\n";
// Reset variables // // Reset variables
locus.id = ""; // locus.id = "";
locus.sequence = ""; // locus.sequence = "";
locus.coverage.resize(0); // locus.coverage.resize(0);
temp = ""; // temp = "";
field_n = 0; // field_n = 0;
++seq_count; // ++seq_count;
break; // break;
default: // default:
temp += buffer[i]; // temp += buffer[i];
break; // break;
} // }
} // }
} while (input_file); // } while (input_file);
} // }
return coverage_matrix; // return coverage_matrix;
} //}
void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix, //void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix,
std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) { // std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
while(sequence != sequences.end()) { // while(sequence != sequences.end()) {
results_mutex.lock(); // results_mutex.lock();
Locus locus = *sequence; // Locus locus = *sequence;
++sequence; // ++sequence;
results_mutex.unlock(); // results_mutex.unlock();
process_sequence(locus, coverage_matrix, results, results_mutex, max_distance); // process_sequence(locus, coverage_matrix, results, results_mutex, max_distance);
} // }
} //}
void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) { //void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
uint seq_len = static_cast<uint>(locus.sequence.size()); // uint seq_len = static_cast<uint>(locus.sequence.size());
std::unordered_map<std::string, std::vector<Locus>> temp_results; // std::unordered_map<std::string, std::vector<Locus>> temp_results;
const char* seq = locus.sequence.c_str(); // const char* seq = locus.sequence.c_str();
EdlibAlignResult alignment_result; // EdlibAlignResult alignment_result;
for (auto l: coverage_matrix) { // for (auto l: coverage_matrix) {
alignment_result = edlibAlign(seq, seq_len, l.sequence.c_str(), seq_len, edlibDefaultAlignConfig()); // alignment_result = edlibAlign(seq, seq_len, l.sequence.c_str(), seq_len, edlibDefaultAlignConfig());
if (alignment_result.status == EDLIB_STATUS_OK and alignment_result.editDistance < max_distance) { // if (alignment_result.status == EDLIB_STATUS_OK and alignment_result.editDistance < max_distance) {
temp_results[locus.id].push_back(l); // temp_results[locus.id].push_back(l);
} // }
edlibFreeAlignResult(alignment_result); // edlibFreeAlignResult(alignment_result);
} // }
results_mutex.lock(); // results_mutex.lock();
for (auto l: temp_results) results[l.first] = l.second; // for (auto l: temp_results) results[l.first] = l.second;
results_mutex.unlock(); // results_mutex.unlock();
} //}
void filter(std::unordered_map<std::string, std::vector<Locus>>& results) { //void filter(std::unordered_map<std::string, std::vector<Locus>>& results) {
} //}
#pragma once #pragma once
#include <fstream> #include <fstream>
#include <mutex> #include <mutex>
#include <numeric>
#include <string> #include <string>
#include <vector> #include <vector>
#include <thread> #include <thread>
...@@ -13,13 +14,3 @@ ...@@ -13,13 +14,3 @@
// Main function implementing the analysis // Main function implementing the analysis
void loci(Parameters& parameters); void loci(Parameters& parameters);
// Load the coverage matrix in memory
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_depth, bool print, std::vector<std::string>& header, float freq_het, float freq_hom, float range_het, float range_hom);
// Pick the next sequence from the sequences vector and process it
void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix,
std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance);
// Process a sequence (look for similar sequences)
void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance);
#include "loci.h"
void loci(Parameters& parameters) {
std::ofstream output_file;
output_file.open(parameters.output_file_path);
// Load popmap
std::unordered_map<std::string, bool> popmap = load_popmap(parameters);
uint n_indiv_total[2] {0, 0};
for (auto i: popmap) ++n_indiv_total[i.second];
uint min_indiv[2] {uint(n_indiv_total[0] * parameters.loci_min_individual_frequency), uint(n_indiv_total[1] * parameters.loci_min_individual_frequency)};
// Load individual depths from depth file
std::vector<std::string> header;
std::unordered_map<std::string, float> individual_depths_table;
std::ifstream depths_file;
depths_file.open(parameters.depths_file_path);
std::string temp = "";
std::getline(depths_file, temp); // Header line not used in this analysis
if (depths_file) {
while (std::getline(depths_file, temp)) {
header = split(temp, "\t");
if (header.size() == 4) individual_depths_table[header[0]] = std::stof(header[2]);
}
}
float sum = 0;
for (auto i: individual_depths_table) sum += i.second;