Commit f5609b8e authored by Romain Feron's avatar Romain Feron
Browse files

Updated sex distribution and mapping output to add signif field

Moved mapping output to output module for clarity. Still need some doc
parent ba4c2f01
......@@ -30,17 +30,6 @@ void mapping(Parameters& parameters) {
exit(1);
}
// Output file
par = "output_file_path";
std::ofstream output_file;
output_file.open(parameters.get_value_from_name<std::string>(par));
if (not output_file) {
exit(1);
}
output_file << "Sequence" << "\t" << "Contig" << "\t" << "Position" << "\t" << "SexBias" << "\t" << "P" << "\n";
// Genome file path
par = "genome_file_path";
std::string genome_file_path = parameters.get_value_from_name<std::string>(par);
......@@ -125,8 +114,10 @@ void mapping(Parameters& parameters) {
mem_alnreg_v ar;
mem_aln_t best;
uint j;
double chi_squared, p, sex_bias;
double chi_squared;
int best_alignment[3] {0, -1, 0}; // Index, score, count
MappedSequence seq;
std::vector<MappedSequence> sequences;
std::cout << " - Mapping the sequences ..." << std::endl;
......@@ -163,18 +154,21 @@ void mapping(Parameters& parameters) {
}
best = mem_reg2aln(opt, index->bns, index->pac, sequence_length, sequence.c_str(), &ar.a[best_alignment[0]]); // Get mapping quality
if (best_alignment[2] < 1 and best.mapq > min_quality) { // Keep sequences with unique best alignment and with mapq > minimum quality
sex_bias = float(sex_count[0]) / float(n_males_total) - float(sex_count[1]) / float(n_females_total); // Sex bias. There should never be 0 males or females in the entire population.
seq.sex_bias = float(sex_count[0]) / float(n_males_total) - float(sex_count[1]) / float(n_females_total); // Sex bias. There should never be 0 males or females in the entire population.
chi_squared = get_chi_squared(sex_count[0], sex_count[1], n_males_total, n_females_total);
(chi_squared == chi_squared) ? p = get_chi_squared_p(chi_squared) : p = 1.0; // chi square is NaN --> sequence found in all individuals --> set p to 1
p < 0.0000000000000001 ? p = 0.0000000000000001 : p = p;
output_file << id << "\t" << index->bns->anns[best.rid].name << "\t" << best.pos << "\t" << sex_bias << "\t" << p << "\n";
(chi_squared == chi_squared) ? seq.p = get_chi_squared_p(chi_squared) : seq.p = 1.0; // chi square is NaN --> sequence found in all individuals --> set p to 1
seq.p < 0.0000000000000001 ? seq.p = 0.0000000000000001 : seq.p = seq.p;
seq.id = id;
seq.contig = index->bns->anns[best.rid].name;
seq.position = best.pos;
sequences.push_back(seq);
++retained_sequences;
}
free(best.cigar); // Deallocate cigar string for best hit
free(ar.a); // Deallocate the hit list
}
++total_n_sequences;
if (total_n_sequences % 10000 == 0 and total_n_sequences / 10000 != 0) std::cout << " > Processed " << total_n_sequences / 1000 << " K. sequences and retained "
if (total_n_sequences % 100000 == 0 and total_n_sequences / 100000 != 0) std::cout << " > Processed " << total_n_sequences / 1000 << " K. sequences and retained "
<< retained_sequences / 1000 << " K. sequences." << std::endl;
// Reset variables
best_alignment[0] = 0;
......@@ -200,7 +194,12 @@ void mapping(Parameters& parameters) {
}
} while (input_file);
output_file.close();
par = "output_file_path";
std::string output_file_path = parameters.get_value_from_name<std::string>(par);
// Generate the output file
output_mapping(output_file_path, sequences);
input_file.close();
free(opt);
bwa_idx_destroy(index);
......
......@@ -8,4 +8,5 @@
#include "bwa/bwamem.h"
#include "bwa/kseq.h"
void mapping(Parameters& parameters);
......@@ -60,10 +60,11 @@ void output_sex_distribution_matrix(std::string& output_file_path, sd_table& res
std::ofstream output_file;
output_file.open(output_file_path);
uint i = 0;
results[0][0].first = 0; // Sequences found in none of the individuals (after filtering for minimum coverage) should not appear in the results
uint i = 0;
// Generate output
for (uint f=0; f < n_females; ++f) {
for (uint m=0; m < n_males; ++m) {
output_file << results[m][f].first;
......@@ -91,13 +92,25 @@ void output_sex_distribution(std::string& output_file_path, sd_table& results, u
output_file.open(output_file_path);
// Output file header
output_file << "Males" << "\t" << "Females" << "\t" << "Sequences" << "\t" << "P" << "\n";
output_file << "Males" << "\t" << "Females" << "\t" << "Sequences" << "\t" << "P" << "\t" << "Signif" << "\n";
uint n_sequences = 0;
double threshold = 0.05;
// Determine the total number of sequences
for (uint f=0; f < n_females; ++f) {
for (uint m=0; m < n_males; ++m) {
if (f + m != 0) n_sequences += results[m][f].first; // Exclude sequences present in 0 individuals
}
}
threshold /= n_sequences;
// Generate output file
for (uint m=0; m < n_males; ++m) {
for (uint f=0; f < n_females; ++f) {
if (f + m != 0) {
output_file << m << "\t" << f << "\t" << results[m][f].first << "\t" << results[m][f].second << "\n";
output_file << m << "\t" << f << "\t" << results[m][f].first << "\t" << results[m][f].second << "\t" << (results[m][f].second < threshold ? "True" : "False") << "\n";
}
}
}
......@@ -159,3 +172,22 @@ void output_group_loci(std::string& output_file_path, std::unordered_map<std::st
++locus_id;
}
}
void output_mapping(std::string& output_file_path, std::vector<MappedSequence> sequences) {
std::ofstream output_file;
output_file.open(output_file_path);
output_file << "Sequence" << "\t" << "Contig" << "\t" << "Position" << "\t" << "SexBias" << "\t" << "P" << "\t" << "Signif" << "\n";
double threshold = 0.05 / sequences.size();
for (auto s: sequences) {
output_file << s.id << "\t" << s.contig << "\t" << s.position << "\t" << s.sex_bias << "\t" << s.p << "\t" <<
(s.p < threshold ? "True" : "False") << "\n";
}
output_file.close();
}
......@@ -18,3 +18,6 @@ void output_sex_distribution(std::string& output_file_path, sd_table& results, u
// Create output file for the group_loci analysis
void output_group_loci(std::string& output_file_path, std::unordered_map<std::string, std::vector<Locus>>& results, std::vector<std::string>& header);
// Create output file for the mapping analysis
void output_mapping(std::string& output_file_path, std::vector<MappedSequence> sequences);
......@@ -91,22 +91,14 @@ void sex_distribution(Parameters& parameters) {
input_file.close();
// Calculate p-values for association with sex for each combination of males and females
uint n_sequences = 0;
double chi_squared = 0;
// First pass to determine the total number of sequences (faster than counter when reading the file)
for (uint f=0; f < n_females; ++f) {
for (uint m=0; m < n_males; ++m) {
if (f + m != 0) n_sequences += results[m][f].first;
}
}
// Second pass to compute p-values
// Compute p-values
for (uint f=0; f < n_females; ++f) {
for (uint m=0; m < n_males; ++m) {
if (f + m != 0) {
chi_squared = get_chi_squared(m, f, n_males, n_females);
results[m][f].second = std::min(1.0, get_chi_squared_p(chi_squared) * n_sequences); // p-value corrected with Bonferroni, with max of 1
results[m][f].second = std::min(1.0, get_chi_squared_p(chi_squared)); // p-value corrected with Bonferroni, with max of 1
}
}
}
......
......@@ -13,12 +13,23 @@
#define DTTMFMT "%Y-%m-%d %H:%M:%S"
#define DTTMSZ 21
// Store information about a locus for the loci analysis
struct Locus {
std::string id;
std::string sequence;
std::vector<std::string> coverage;
};
// Store information about a mapped sequence for the mapping analysis
struct MappedSequence {
std::string id;
std::string contig;
int position;
double sex_bias;
double p;
};
// Store sex distribution resutls
typedef std::unordered_map<uint, std::unordered_map<uint, std::pair<uint64_t, double>>> sd_table;
// Output current date and time in format specified with DMTTMFMT and DTTMSZ
......@@ -27,4 +38,5 @@ char* print_time (char *buff);
// Split a std::string into a std::vector of std::strings according to a specified delimiter (default: \t)
std::vector<std::string> split(std::string str, const std::string delimiter);
// Reverse complement of a sequence
void rev_comp(const std::string& sequence, std::string& revcomp_sequence, char nuc);
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment