Commit 0c43f907 authored by Romain Feron
Browse files

Implemented new output for sex distribution table

parent b3c016e8
#include "output.h"
#include <iostream>
void output_process_reads(std::string& output_file_path, std::vector<std::string>& individuals, std::unordered_map<std::string, std::unordered_map<std::string, uint16_t>>& results, uint min_cov) {
......@@ -50,7 +48,7 @@ void output_process_reads(std::string& output_file_path, std::vector<std::string
void output_sex_distribution(std::string& output_file_path, std::unordered_map<uint, std::unordered_map<uint, uint64_t>>& results, uint n_males, uint n_females) {
void output_sex_distribution_matrix(std::string& output_file_path, std::unordered_map<uint, std::unordered_map<uint, uint64_t>>& results, uint n_males, uint n_females) {
/* Input:
* - Path to an output file
......@@ -79,6 +77,34 @@ void output_sex_distribution(std::string& output_file_path, std::unordered_map<u
void output_sex_distribution(std::string& output_file_path, std::unordered_map<uint, std::unordered_map<uint, uint64_t>>& results, uint n_males, uint n_females) {

    /* Write the sex distribution table to a tab-separated file.
     *
     * Input:
     * - output_file_path: path to the output file
     * - results: a matrix of counts indexed as results[males][females]; for each
     *   cell this function writes the `.first` member in the "Sequences" column
     *   and the `.second` member in the "P-value" column
     * - n_males, n_females: exclusive upper bounds of the male / female indices
     *   that are written out
     *
     * Output: one header line, then one row per (males, females) combination,
     * skipping the combination 0 males / 0 females:
     * Number of males | Number of females | Number of sequences | P-value
     * <int>           | <int>             | <int>               | <float>
     *
     * NOTE(review): the body reads `.first` / `.second` from each cell, which the
     * declared mapped type (uint64_t) does not provide. The parameter type
     * presumably has to be the pair-based table type used by the caller; confirm
     * against the header before building.
     *
     * NOTE(review): the loops stop at n_females - 1 and n_males - 1. If n_males and
     * n_females are the total numbers of individuals of each sex, sequences found
     * in ALL males or ALL females are never written. The caller's p-value loops
     * use the same bounds, so both places must be changed together; verify.
     */

    std::ofstream output_file;
    output_file.open(output_file_path);

    // Header: exactly one name per column written in the rows below. The header
    // previously also announced a "Significant" column that no row ever filled,
    // which left the table with five header fields and four data fields.
    output_file << "Males" << "\t" << "Females" << "\t" << "Sequences" << "\t" << "P-value" << "\n";

    // One row per (males, females) combination, females in the outer loop.
    for (uint f = 0; f < n_females; ++f) {
        for (uint m = 0; m < n_males; ++m) {
            if (f + m == 0) continue; // No row for sequences found in no individual at all

            // operator[] default-inserts missing cells, so absent combinations are
            // written with default-constructed values (and are added to results).
            output_file << m << "\t" << f << "\t" << results[m][f].first << "\t" << results[m][f].second << "\n";
        }
    }
}
void output_group_loci(std::string& output_file_path, std::unordered_map<std::string, std::vector<Locus>>& results, std::vector<std::string>& header) {
/* Input:
......
......@@ -5,6 +5,7 @@
#include <fstream>
#include <cstdio>
#include "utils.h"
#include "stats.h"
// Create output file for the process reads analysis
// Arguments, as declared (semantics are not visible from this declaration):
// - output_file_path: presumably the path of the output file to create -- confirm against the definition
// - individuals: vector of strings; NOTE(review): presumably the individual names, verify
// - results: nested map string -> (string -> uint16_t); NOTE(review): presumably sequence -> individual -> coverage, TODO confirm
// - min_cov: NOTE(review): presumably a minimum coverage threshold, confirm against the definition
void output_process_reads(std::string& output_file_path, std::vector<std::string>& individuals, std::unordered_map<std::string, std::unordered_map<std::string, uint16_t>>& results, uint min_cov);
......
......@@ -3,9 +3,9 @@
void sex_distribution(Parameters& parameters) {
/* The sex_distribution function parses through a file generated by process_reads and checks for each sequence
* the number of males and females in which the sequence was found. The output is a matrix with number of males
* in columns, number of females in rows (no headers) and the value at column M and row F giving the number of
* sequences found in M males and F females.
* the number of males and females in which the sequence was found. The output is a table with five columns:
* Number of males | Number of females | Number of sequences | P-value | Significant
* <int> | <int> | <int> | <float> | <bool>
*/
std::unordered_map<std::string, bool> popmap = load_popmap(parameters);
......@@ -50,7 +50,7 @@ void sex_distribution(Parameters& parameters) {
// Define variables used to read the file
char buffer[65536];
uint k = 0, field_n = 0;
std::unordered_map<uint, std::unordered_map<uint, uint64_t>> results;
sd_table results;
uint sex_count[3] = {0, 0, 0}; // Index: 0 = male, 1 = female, 2 = no sex
do {
......@@ -73,7 +73,7 @@ void sex_distribution(Parameters& parameters) {
break;
case '\n': // New line (also a new field)
if (sex_columns[field_n] != 2 and std::stoi(temp) > min_cov) ++sex_count[sex_columns[field_n]]; // Increment the appropriate counter
++results[sex_count[0]][sex_count[1]]; // Update the results
++results[sex_count[0]][sex_count[1]].first; // Update the results
// Reset variables
temp = "";
field_n = 0;
......@@ -90,6 +90,27 @@ void sex_distribution(Parameters& parameters) {
input_file.close();
// Calculate p-values for association with sex for each combination of males and females
uint n_sequences = 0;
double chi_squared = 0;
// First pass to determine the total number of sequences (faster than counter when reading the file)
for (uint f=0; f < n_females; ++f) {
for (uint m=0; m < n_males; ++m) {
if (f + m != 0) n_sequences += results[m][f].first;
}
}
// Second pass to compute p-values
for (uint f=0; f < n_females; ++f) {
for (uint m=0; m < n_males; ++m) {
if (f + m != 0) {
chi_squared = get_chi_squared(m, f, n_males, n_females);
results[m][f].second = get_chi_squared_p(chi_squared) * n_sequences;
}
}
}
par = "output_file_path";
std::string output_file_path = parameters.get_value_from_name<std::string>(par);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment