Commit 7729fcb7 authored by Romain Feron's avatar Romain Feron
Browse files

Implemented locus grouping analysis

parent 0c13bdac
......@@ -18,7 +18,8 @@ HEADERS += \
src/subset.h \
src/frequencies.h \
src/demultiplexing.h \
src/barcodes_file.h
src/barcodes_file.h \
src/group_loci.h
SOURCES += \
src/main.cpp \
......@@ -32,4 +33,5 @@ SOURCES += \
src/subset.cpp \
src/frequencies.cpp \
src/demultiplexing.cpp \
src/barcodes_file.cpp
src/barcodes_file.cpp \
src/group_loci.cpp
#include "group_loci.h"
void group_loci(Parameters& parameters) {
std::mutex results_mutex;
std::unordered_map<std::string, std::vector<Locus>> results;
std::string par = "coverage_matrix_path";
std::string coverage_matrix_path = parameters.get_value_from_name<std::string>(par);
par = "min_cov";
int min_cov = parameters.get_value_from_name<int>(par) - 1; // -1 to compare with > instead of >= later
std::cout << " - Loading coverage matrix :\n" << std::endl;
std::vector<Locus> coverage_matrix = load_coverage_matrix(coverage_matrix_path, min_cov, true);
par = "input_file_path";
std::string input_file_path = parameters.get_value_from_name<std::string>(par);
std::vector<Locus> sequences = load_coverage_matrix(input_file_path, 1, false);
par = "max_distance";
int max_distance = parameters.get_value_from_name<int>(par) + 1; // +1 to compare with < instead of <= later
std::cout << std::endl << " - Starting alignments" << std::endl;
auto sequence = sequences.begin();
// Create and run all sequence processors
par = "n_threads";
std::vector<std::thread> threads;
for (int i=0; i<parameters.get_value_from_name<int>(par); ++i) {
threads.push_back(std::thread(sequence_processor, std::ref(sequence), std::ref(sequences), std::ref(coverage_matrix), std::ref(results), std::ref(results_mutex), max_distance));
}
for (auto &t : threads) t.join();
par = "output_file_path";
std::string output_file_path = parameters.get_value_from_name<std::string>(par);
output_group_loci(output_file_path, results);
}
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print) {
/* Load a coverage matrix in memory
*/
std::ifstream input_file(file_path);
std::vector<Locus> coverage_matrix;
if (input_file) {
// First line is the header. The header is not used here.
std::string temp;
std::getline(input_file, temp);
temp = "";
// Define variables used to read the file
char buffer[65536];
uint k = 0, field_n = 0, seq_count = 0;
Locus locus;
bool keep_locus = false;
do {
// Read a chunk of size given by the buffer
input_file.read(buffer, sizeof(buffer));
k = input_file.gcount();
for (uint i=0; i<k; ++i) {
// Read the buffer character by character
switch(buffer[i]) {
case '\r':
break;
case '\t': // New field
switch(field_n) {
case 0:
locus.id = temp;
break;
case 1:
locus.sequence =temp;
break;
default:
locus.coverage.push_back(temp);
break;
}
temp = "";
++field_n;
break;
case '\n': // New line (also a new field)
locus.coverage.push_back(temp);
keep_locus = false;
for (auto c: locus.coverage) {
if (std::stoi(c) > min_cov) {
keep_locus = true;
break;
}
}
if (keep_locus) coverage_matrix.push_back(locus);
if (seq_count % 1000000 == 0 and print) std::cout << "\t- Processed " << seq_count / 1000000 << " M. lines and stored " << coverage_matrix.size() << " sequences\n";
// Reset variables
locus.id = "";
locus.sequence = "";
locus.coverage.resize(0);
temp = "";
field_n = 0;
++seq_count;
break;
default:
temp += buffer[i];
break;
}
}
} while (input_file);
}
return coverage_matrix;
}
void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix,
std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
while(sequence != sequences.end()) {
results_mutex.lock();
Locus locus = *sequence;
++sequence;
results_mutex.unlock();
process_sequence(locus, coverage_matrix, results, results_mutex, max_distance);
}
}
void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
uint seq_len = locus.sequence.size();
std::unordered_map<std::string, std::vector<Locus>> temp_results;
const char* seq = locus.sequence.c_str();
EdlibAlignResult alignment_result;
for (auto l: coverage_matrix) {
alignment_result = edlibAlign(seq, seq_len, l.sequence.c_str(), seq_len, edlibDefaultAlignConfig());
if (alignment_result.status == EDLIB_STATUS_OK and alignment_result.editDistance < max_distance) {
temp_results[locus.id].push_back(l);
}
edlibFreeAlignResult(alignment_result);
}
results_mutex.lock();
for (auto l: temp_results) results[l.first] = l.second;
results_mutex.unlock();
}
#pragma once
#include <string>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <thread>
#include <mutex>
#include "utils.h"
#include "parameters.h"
#include "popmap_file.h"
#include "output.h"
#include "edlib/edlib.h"
// Main function implementing the analysis
void group_loci(Parameters& parameters);
// Load the coverage matrix in memory
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print);
// Pick the next sequence from the sequences vector and process it
void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix,
std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance);
// Process a sequence (look for similar sequences)
void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance);
#include "output.h"
#include <iostream>
void output_process_reads(std::string& output_file_path, std::vector<std::string>& individuals, std::unordered_map<std::string, std::unordered_map<std::string, uint16_t>>& results) {
/* Input:
......@@ -60,3 +61,53 @@ void output_sex_distribution(std::string& output_file_path, std::unordered_map<u
i=0;
}
}
void output_group_loci(std::string& output_file_path, std::unordered_map<std::string, std::vector<Locus>>& results) {
/* Input:
* - Path to an output file
* - A matrix of loci [Sequence ID: [Associated sequences]]
* Output:
* - A table with following columns:
* Locus ID | Sequence ID | Sequence Status | Sequence | Cov Ind. 1 | Cov Ind. 2 ...
*/
std::ofstream output_file;
output_file.open(output_file_path);
std::string seq_id;
uint locus_id = 0;
for (auto sequence: results) {
seq_id = sequence.first;
// First iteration to output the original sequence
for (auto locus: sequence.second) {
if (locus.id == seq_id) {
output_file << locus_id << "\t" << locus.id << "\t" << locus.sequence << "\t";
if (seq_id == locus.id) output_file << "Original" << "\t"; else output_file << "Recovered" << "\t";
for (uint i=0; i<locus.coverage.size(); ++i) {
output_file << locus.coverage[i];
if (i < locus.coverage.size() - 1) output_file << "\t";
}
output_file << "\n";
}
}
// Second iteration to output the other sequences (this is not optimized but this step is not intensive anyway)
for (auto locus: sequence.second) {
if (locus.id != seq_id) {
output_file << locus_id << "\t" << locus.id << "\t" << locus.sequence << "\t";
if (seq_id == locus.id) output_file << "Original" << "\t"; else output_file << "Recovered" << "\t";
for (uint i=0; i<locus.coverage.size(); ++i) {
output_file << locus.coverage[i];
if (i < locus.coverage.size() - 1) output_file << "\t";
}
output_file << "\n";
}
}
++locus_id;
}
}
......@@ -3,10 +3,13 @@
#include <vector>
#include <unordered_map>
#include <fstream>
#include "utils.h"
// Create output file for the process reads analysis
void output_process_reads(std::string& output_file_path, std::vector<std::string>& individuals, std::unordered_map<std::string, std::unordered_map<std::string, uint16_t>>& results);
// Create output file for the sex_distribution analysis
void output_sex_distribution(std::string& output_file_path, std::unordered_map<uint, std::unordered_map<uint, uint>>& results);
// Create output file for the group_loci analysis
void output_group_loci(std::string& output_file_path, std::unordered_map<std::string, std::vector<Locus>>& results);
......@@ -14,13 +14,16 @@ struct Parameters {
// Initialize all possible parameters to be used in the analyses
// Arguments: name, help message, flag, default value, type, internal type, value, required
// Flags : -h, -f, -d, -o, -u, -t, -c, -p, -b, -a, -m, --min-males, --min-females, --max-males, --max-females
std::vector<Parameter> list {Parameter("help", "Prints this message", "-h", "0", "bool", "bool", "", false),
Parameter("input_file_path", "Path to an input file", "-f", "", "string", "ifile", "", true),
Parameter("input_dir_path", "Path to an input directory", "-d", "", "string", "dir", "", true),
Parameter("output_file_path", "Path to an output file", "-o", "", "string", "ofile", "", true),
Parameter("output_dir_path", "Path to an output directory", "-u", "", "string", "dir", "", true),
Parameter("coverage_matrix_path", "Path to an coverage matrix file", "-a", "", "string", "ifile", "", true),
Parameter("n_threads", "Number of threads", "-t", "1", "int", "int", "", false),
Parameter("min_cov", "Minimum coverage to consider a marker", "-c", "1", "int", "int", "", false),
Parameter("max_distance", "Maximum distance between two sequences in a locus", "-m", "1", "int", "int", "", false),
Parameter("popmap_file_path", "Path to a popmap file", "-p", "", "string", "ifile", "", true),
Parameter("barcodes_file_path", "Path to a barcodes file", "-b", "", "string", "ifile", "", true),
Parameter("min_males", "Minimum number of males in the subset", "--min-males", "0", "int", "int", "", false),
......
......@@ -7,6 +7,7 @@
#include "subset.h"
#include "frequencies.h"
#include "demultiplexing.h"
#include "group_loci.h"
class RadSex {
......@@ -38,6 +39,10 @@ class RadSex {
// {"demultiplexing", Analysis("demultiplexing", "Demultiplexes a set of reads files",
// std::vector<std::string> {"input_file_path", "output_dir_path", "barcodes_file_path", "min_cov"},
// demultiplexing)},
{"group_loci", Analysis("group_loci", "Recreate polymorphic loci from a subset of coverage matrix",
std::vector<std::string> {"input_file_path", "coverage_matrix_path", "output_file_path",
"max_distance", "n_threads", "min_cov"},
group_loci)},
};
// In the constructor, the type of analysis is detected and all analysis objects are initialized
......
......@@ -12,6 +12,11 @@
#define DTTMFMT "%Y-%m-%d %H:%M:%S"
#define DTTMSZ 21
struct Locus {
std::string id;
std::string sequence;
std::vector<std::string> coverage;
};
// Output current date and time in format specified with DMTTMFMT and DTTMSZ
char* print_time (char *buff);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment