Commit 6b07685c authored by RomainFeron's avatar RomainFeron
Browse files

Moved distrib output to distrib cpp file, removed output.* files that are now...

Moved distrib output to distrib cpp file, removed output.* files that are now empty. Implemented computing bias in distrib
parent 91c28d79
......@@ -6,7 +6,6 @@
#include <unordered_map>
#include <vector>
#include "depth_table.h"
#include "output.h"
#include "parameters.h"
#include "popmap.h"
#include "utils.h"
......
......@@ -22,46 +22,59 @@ void distrib(Parameters& parameters) {
bool parsing_ended = false;
MarkersQueue markers_queue;
std::mutex queue_mutex;
uint64_t n_markers;
std::thread parsing_thread(table_parser, std::ref(parameters), std::ref(popmap), std::ref(markers_queue), std::ref(queue_mutex), std::ref(parsing_ended), true, false);
std::thread processing_thread(processor, std::ref(markers_queue), std::ref(parameters), std::ref(queue_mutex), std::ref(results), std::ref(parsing_ended), BATCH_SIZE);
std::thread processing_thread(processor, std::ref(markers_queue), std::ref(parameters), std::ref(queue_mutex), std::ref(results), std::ref(n_markers), std::ref(parsing_ended), BATCH_SIZE);
parsing_thread.join();
processing_thread.join();
// Calculate p-values for association with sex for each combination of males and females
double chi_squared = 0;
// Compute p-values
for (uint f=0; f <= popmap.counts[parameters.group1]; ++f) {
for (uint m=0; m <= popmap.counts[parameters.group2]; ++m) {
if (f + m != 0) {
chi_squared = get_chi_squared(f, m, popmap.counts[parameters.group1], popmap.counts[parameters.group2]);
results[f][m].second = std::min(1.0, get_chi_squared_p(chi_squared)); // p-value corrected with Bonferroni, with max of 1
for (uint g = 0; g <= popmap.counts[parameters.group1]; ++g) {
for (uint h = 0; h <= popmap.counts[parameters.group2]; ++h) {
if (g + h != 0) {
chi_squared = get_chi_squared(g, h, popmap.counts[parameters.group1], popmap.counts[parameters.group2]);
results[g][h].second = std::min(1.0, get_chi_squared_p(chi_squared)); // p-value corrected with Bonferroni, with max of 1
}
}
}
// Generate the output file
if (!parameters.output_matrix) {
std::ofstream output_file(parameters.output_file_path);
// Output file header
output_file << parameters.group1 << "\t" << parameters.group2 << "\t" << "Markers" << "\t" << "P" << "\t" << "Signif" << "\t" << "Bias" << "\n";
output_distrib(parameters.output_file_path, results, popmap.counts[parameters.group1], popmap.counts[parameters.group2], parameters.group1, parameters.group2,
parameters.signif_threshold, parameters.disable_correction);
if (not parameters.disable_correction) parameters.signif_threshold /= n_markers; // Bonferroni correction: divide threshold by number of tests
} else {
// Generate output file
for (uint g = 0; g <= popmap.counts[parameters.group1]; ++g) {
output_distrib_matrix(parameters.output_file_path, results, popmap.counts[parameters.group1], popmap.counts[parameters.group2]);
for (uint h = 0; h <= popmap.counts[parameters.group2]; ++h) {
if (g + h != 0) {
output_file << g << "\t" << h << "\t" << results[g][h].first << "\t" << results[g][h].second << "\t"
<< (static_cast<float>(results[g][h].second) < parameters.signif_threshold ? "True" : "False") << "\t"
<< static_cast<float>(g) / static_cast<float>(popmap.counts[parameters.group1]) - (static_cast<float>(h) / static_cast<float>(popmap.counts[parameters.group2])) << "\n";
}
}
}
log("RADSex distrib ended (total runtime: " + get_runtime(t_begin) + ")");
}
void processor(MarkersQueue& markers_queue, Parameters& parameters, std::mutex& queue_mutex, sd_table& results, bool& parsing_ended, ulong batch_size) {
void processor(MarkersQueue& markers_queue, Parameters& parameters, std::mutex& queue_mutex, sd_table& results, uint64_t& n_markers, bool& parsing_ended, ulong batch_size) {
// Give 100ms headstart to table parser thread (to get number of individuals from header)
std::this_thread::sleep_for(std::chrono::milliseconds(100));
......@@ -81,7 +94,13 @@ void processor(MarkersQueue& markers_queue, Parameters& parameters, std::mutex&
for (auto marker: batch) {
++results[marker.groups[parameters.group1]][marker.groups[parameters.group2]].first;
if (marker.n_individuals > 0) {
++results[marker.groups[parameters.group1]][marker.groups[parameters.group2]].first;
++n_markers;
}
log_progress_bar(n_processed_markers, marker_processed_tick);
}
......
......@@ -4,7 +4,7 @@
#include <unordered_map>
#include <vector>
#include "depth_table.h"
#include "output.h"
#include "stats.h"
#include "parameters.h"
#include "popmap.h"
#include "utils.h"
......@@ -12,4 +12,4 @@
// Compute the distribution of markers between males and females
void distrib(Parameters& parameters);
void processor(MarkersQueue& markers_queue, Parameters& parameters, std::mutex& queue_mutex, sd_table& results, bool& parsing_ended, ulong batch_size);
void processor(MarkersQueue& markers_queue, Parameters& parameters, std::mutex& queue_mutex, sd_table& results, uint64_t& n_markers, bool& parsing_ended, ulong batch_size);
......@@ -4,7 +4,6 @@
#include <unordered_map>
#include <vector>
#include "depth_table.h"
#include "output.h"
#include "parameters.h"
#include "popmap.h"
#include "utils.h"
......
......@@ -6,7 +6,6 @@
#include "bwa/bwamem.h"
#include "bwa/kseq.h"
#include "depth_table.h"
#include "output.h"
#include "parameters.h"
#include "popmap.h"
#include "stats.h"
......
#include "output.h"
void output_distrib_matrix(std::string& output_file_path, sd_table& results, uint n_group1, uint n_group2) {
/* Input:
* - Path to an output file
* - A matrix of counts [Males: [Females: Count]]
* Output:
* - A matrix of counts (males in columns and females in rows)
*/
std::ofstream output_file;
output_file.open(output_file_path);
results[0][0].first = 0; // Sequences found in none of the individuals (after filtering for minimum coverage) should not appear in the results
// Generate output
for (uint f=0; f <= n_group1; ++f) {
for (uint m=0; m <= n_group2; ++m) {
output_file << results[f][m].first;
(m < n_group1) ? output_file << "\t" : output_file << "\n";
}
}
}
void output_distrib(std::string& output_file_path, sd_table& results, uint n_group1, uint n_group2, std::string& group1, std::string& group2,
float signif_threshold, bool disable_correction) {
/* Input:
* - Path to an output file
* - A matrix of counts [Males: [Females: Count]]
* Output: a table with the following structure:
* Number of males | Number of females | Number of markers | P-value | Significance
* <int> | <int> | <int> | <float> | <bool>
*/
std::ofstream output_file;
output_file.open(output_file_path);
// Output file header
output_file << group1 << "\t" << group2 << "\t" << "Markers" << "\t" << "P" << "\t" << "Signif" << "\n";
uint n_markers = 0;
// Determine the total number of markers
for (uint f=0; f <= n_group1; ++f) {
for (uint m=0; m <= n_group2; ++m) {
if (f + m != 0) n_markers += results[f][m].first; // Exclude markers present in 0 individuals
}
}
if (not disable_correction) signif_threshold /= n_markers;
// Generate output file
for (uint m=0; m <= n_group1; ++m) {
for (uint f=0; f <= n_group2; ++f) {
if (f + m != 0) {
output_file << m << "\t" << f << "\t" << results[m][f].first << "\t" << results[m][f].second << "\t" <<
(static_cast<float>(results[m][f].second) < signif_threshold ? "True" : "False") << "\n";
}
}
}
}
#pragma once
#include <cstdio>
#include <fstream>
#include <string>
#include <unordered_map>
#include <vector>
#include "stats.h"
#include "utils.h"
// Generate an output file for "distrib" in matrix format
void output_distrib_matrix(std::string& output_file_path, sd_table& results, uint n_group1, uint n_group2);
// Generate an output file for "distrib" in table format
void output_distrib(std::string& output_file_path, sd_table& results, uint n_group1, uint n_group2, std::string& group1, std::string& group2, float signif_threshold, bool disable_correction);
......@@ -7,7 +7,6 @@
#include <unordered_map>
#include <zlib.h>
#include "kseq/kseq.h"
#include "output.h"
#include "parameters.h"
#include "utils.h"
......
......@@ -4,7 +4,6 @@
#include <unordered_map>
#include <vector>
#include "depth_table.h"
#include "output.h"
#include "parameters.h"
#include "popmap.h"
#include "stats.h"
......
......@@ -4,9 +4,9 @@
#include <unordered_map>
#include <vector>
#include "depth_table.h"
#include "output.h"
#include "parameters.h"
#include "popmap.h"
#include "stats.h"
#include "utils.h"
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment