Commit 78ed159e authored by Romain Feron's avatar Romain Feron
Browse files

Implemented extraction of sequences significantly associated with sex - debugging on genotoul

parent f4b39ca8
#include <math.h>
#include "kfun.h"
/* Log gamma function
* \log{\Gamma(z)}
* AS245, 2nd algorithm, http://lib.stat.cmu.edu/apstat/245
*/
double kf_lgamma(double z)
{
    /* Series coefficients of the approximation; shifted[i] is divided by
     * (z + 7 - i), i.e. the offsets run 7, 6, ..., 1. The offset-0 term and
     * the constant term are added separately after the loop. */
    static const double shifted[7] = {
        0.1659470187408462e-06,
        0.9934937113930748e-05,
        -0.1385710331296526,
        12.50734324009056,
        -176.6150291498386,
        771.3234287757674,
        -1259.139216722289
    };
    double series = 0;
    int i;
    for (i = 0; i < 7; ++i)
        series += shifted[i] / (z + (7 - i));
    series += 676.5203681218835 / z;
    series += 0.9999999999995183;
    /* log of the series plus the closed-form part of the approximation */
    return log(series) - 5.58106146679532777 - z + (z-0.5) * log(z+6.5);
}
/* complementary error function
* \frac{2}{\sqrt{\pi}} \int_x^{\infty} e^{-t^2} dt
* AS66, 2nd algorithm, http://lib.stat.cmu.edu/apstat/66
*/
double kf_erfc(double x)
{
    /* Rational-approximation coefficients, highest power of z first so they
     * can be consumed directly by Horner's scheme. */
    static const double numer[7] = {
        .03526249659989109, .7003830644436881, 6.37396220353165,
        33.912866078383, 112.0792914978709, 221.2135961699311,
        220.2068679123761
    };
    static const double denom[8] = {
        .08838834764831844, 1.755667163182642, 16.06417757920695,
        86.78073220294608, 296.5642487796737, 637.3336333788311,
        793.8265125199484, 440.4137358247522
    };
    double tail; /* half of erfc(|x|) */
    int i;

    const double z = fabs(x) * M_SQRT2;
    /* Beyond this point the result is returned as exactly 0 or 2. */
    if (z > 37.) return x > 0.? 0. : 2.;

    const double expntl = exp(z * z * - .5);
    if (z < 10. / M_SQRT2) {
        /* small z: ratio of two polynomials in z */
        double top = numer[0], bottom = denom[0];
        for (i = 1; i < 7; ++i) top = top * z + numer[i];
        for (i = 1; i < 8; ++i) bottom = bottom * z + denom[i];
        tail = expntl * top / bottom;
    } else {
        /* large z: truncated continued fraction, evaluated inside-out */
        double cf = z + .65;
        for (i = 4; i >= 1; --i) cf = z + i / cf;
        tail = expntl / 2.506628274631001 / cf;
    }

    if (x > 0.) return 2. * tail;
    return 2. * (1. - tail);
}
/* The following computes regularized incomplete gamma functions.
* Formulas are taken from Wiki, with additional input from Numerical
* Recipes in C (for modified Lentz's algorithm) and AS245
* (http://lib.stat.cmu.edu/apstat/245).
*
* A good online calculator is available at:
*
* http://www.danielsoper.com/statcalc/calc23.aspx
*
* It calculates upper incomplete gamma function, which equals
* kf_gammaq(s,z)*tgamma(s).
*/
#define KF_GAMMA_EPS 1e-14
#define KF_TINY 1e-290
// regularized lower incomplete gamma function, by series expansion
static double _kf_gammap(double s, double z)
{
    /* Accumulate the series 1 + z/(s+1) + z^2/((s+1)(s+2)) + ...
     * for at most 99 terms, stopping once the latest term is negligible
     * relative to the running total. */
    double term = 1., total = 1.;
    int k = 1;
    while (k < 100) {
        term *= z / (s + k);
        total += term;
        if (term / total < KF_GAMMA_EPS) break;
        ++k;
    }
    /* Combine the series with its prefactor in log space. */
    return exp(s * log(z) - z - kf_lgamma(s + 1.) + log(total));
}
// regularized upper incomplete gamma function, by continued fraction
static double _kf_gammaq(double s, double z)
{
    /* Continued fraction evaluated with the modified Lentz's algorithm
     * (Numerical Recipes in C, 2nd edition, section 5.2). `frac` is the
     * running value of the fraction; `ratio_c` and `ratio_d` are the two
     * auxiliary sequences of the algorithm. */
    double frac = 1. + z - s;
    double ratio_c = frac;
    double ratio_d = 0.;
    int j = 1;
    while (j < 100) {
        const double a = j * (s - j);
        const double b = (j<<1) + 1 + z - s;
        ratio_d = b + a * ratio_d;
        if (ratio_d < KF_TINY) ratio_d = KF_TINY; /* keep the reciprocal below finite */
        ratio_c = b + a / ratio_c;
        if (ratio_c < KF_TINY) ratio_c = KF_TINY;
        ratio_d = 1. / ratio_d;
        const double delta = ratio_c * ratio_d;
        frac *= delta;
        /* converged once the multiplicative update is indistinguishable from 1 */
        if (fabs(delta - 1.) < KF_GAMMA_EPS) break;
        ++j;
    }
    return exp(s * log(z) - z - kf_lgamma(s) - log(frac));
}
double kf_gammap(double s, double z)
{
    /* Regularized lower incomplete gamma function: use the series expansion
     * when z <= 1 or z < s, otherwise take the complement of the
     * continued-fraction evaluation of the upper function. */
    if (z <= 1. || z < s)
        return _kf_gammap(s, z);
    return 1. - _kf_gammaq(s, z);
}
double kf_gammaq(double s, double z)
{
    /* Regularized upper incomplete gamma function: take the complement of
     * the series expansion when z <= 1 or z < s, otherwise evaluate the
     * continued fraction directly. */
    if (z <= 1. || z < s)
        return 1. - _kf_gammap(s, z);
    return _kf_gammaq(s, z);
}
/* Regularized incomplete beta function. The method is taken from
* Numerical Recipe in C, 2nd edition, section 6.4. The following web
* page calculates the incomplete beta function, which equals
* kf_betai(a,b,x) * gamma(a) * gamma(b) / gamma(a+b):
*
* http://www.danielsoper.com/statcalc/calc36.aspx
*/
static double kf_betai_aux(double a, double b, double x)
{
    double frac = 1.;      /* running value of the continued fraction */
    double ratio_c = frac; /* auxiliary sequences of the modified Lentz's algorithm */
    double ratio_d = 0.;
    int j = 1;

    /* The end points are returned exactly without evaluating the fraction. */
    if (x == 0.) return 0.;
    if (x == 1.) return 1.;

    /* Modified Lentz's algorithm for computing the continued fraction */
    while (j < 200) {
        const int m = j>>1;
        double coef;
        /* odd and even steps use different partial numerators */
        if (j&1)
            coef = -(a + m) * (a + b + m) * x / ((a + 2*m) * (a + 2*m + 1));
        else
            coef = m * (b - m) * x / ((a + 2*m - 1) * (a + 2*m));
        ratio_d = 1. + coef * ratio_d;
        if (ratio_d < KF_TINY) ratio_d = KF_TINY;
        ratio_c = 1. + coef / ratio_c;
        if (ratio_c < KF_TINY) ratio_c = KF_TINY;
        ratio_d = 1. / ratio_d;
        const double delta = ratio_c * ratio_d;
        frac *= delta;
        if (fabs(delta - 1.) < KF_GAMMA_EPS) break;
        ++j;
    }
    /* prefactor computed in log space, then divided by a and the fraction */
    return exp(kf_lgamma(a+b) - kf_lgamma(a) - kf_lgamma(b) + a * log(x) + b * log(1.-x)) / a / frac;
}
double kf_betai(double a, double b, double x)
{
    /* Regularized incomplete beta function. Below the switch point
     * (a + 1) / (a + b + 2) the continued fraction is evaluated directly;
     * otherwise it is evaluated with a and b swapped at 1 - x and the
     * complement is returned. */
    const double switch_point = (a + 1.) / (a + b + 2.);
    if (x < switch_point)
        return kf_betai_aux(a, b, x);
    return 1. - kf_betai_aux(b, a, 1. - x);
}
#pragma once
#include <math.h>
/* Log gamma function, log(Gamma(z)). */
double kf_lgamma(double z);
/* Complementary error function of x. */
double kf_erfc(double x);
/* Convergence tolerance used by the iterative series / continued-fraction loops. */
#define KF_GAMMA_EPS 1e-14
/* Floor applied to the intermediate terms of the modified Lentz's algorithm. */
#define KF_TINY 1e-290
/* Regularized lower incomplete gamma function. */
double kf_gammap(double s, double z);
/* Regularized upper incomplete gamma function. */
double kf_gammaq(double s, double z);
/* Regularized incomplete beta function. */
double kf_betai(double a, double b, double x);
......@@ -19,7 +19,8 @@ HEADERS += \
src/frequencies.h \
src/demultiplexing.h \
src/barcodes_file.h \
src/group_loci.h
src/group_loci.h \
src/significant_sequences.h
SOURCES += \
src/main.cpp \
......@@ -34,4 +35,5 @@ SOURCES += \
src/frequencies.cpp \
src/demultiplexing.cpp \
src/barcodes_file.cpp \
src/group_loci.cpp
src/group_loci.cpp \
src/significant_sequences.cpp
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE QtCreatorProject>
<!-- Written by QtCreator 4.2.1, 2018-02-16T16:20:42. -->
<!-- Written by QtCreator 4.2.1, 2018-03-05T17:37:14. -->
<qtcreator>
<data>
<variable>EnvironmentId</variable>
......
......@@ -8,6 +8,7 @@
#include "frequencies.h"
#include "demultiplexing.h"
#include "group_loci.h"
#include "significant_sequences.h"
class RadSex {
......@@ -43,6 +44,9 @@ class RadSex {
std::vector<std::string> {"input_file_path", "coverage_matrix_path", "output_file_path",
"max_distance", "n_threads", "min_cov"},
group_loci)},
{"significant_sequences", Analysis("significant_sequences", "Extract sequences significantly associated with sex from the coverage matrix.",
std::vector<std::string> {"input_file_path", "output_file_path", "popmap_file_path", "min_cov"},
significant_sequences)},
};
// In the constructor, the type of analysis is detected and all analysis objects are initialized
......
#include "significant_sequences.h"
void significant_sequences(Parameters& parameters) {

    /* The significant_sequences function parses a coverage table (tab-separated, one header line followed by one
     * line per sequence) and outputs the sequences significantly associated with sex.
     *
     * For each sequence, the number of males and females in which its coverage is >= min_cov is counted.
     * Association with sex is tested with a Chi-squared test (get_chi_squared / get_chi_squared_p), and the
     * 0.05 significance threshold is corrected with Bonferroni correction, using the number of sequences present in
     * at least one sexed individual as the number of tests.
     *
     * Parameters read: "input_file_path", "output_file_path", "min_cov" (the popmap is loaded by load_popmap).
     * Output: the header line followed by every significant line of the input table.
     * Fields beyond the columns declared in the header and empty fields are ignored when counting individuals.
     */

    std::unordered_map<std::string, bool> popmap = load_popmap(parameters);

    // In the popmap, true = male and false = female
    uint total_males = 0, total_females = 0;
    for (const auto& individual: popmap) {
        if (individual.second) ++total_males; else ++total_females;
    }

    std::string par = "input_file_path";
    std::ifstream input_file;
    input_file.open(parameters.get_value_from_name<std::string>(par));

    par = "min_cov";
    int min_cov = parameters.get_value_from_name<int>(par) - 1; // -1 allows comparison with > instead of >=

    if (!input_file) {
        std::cerr << "Error: could not open input file for analysis \"significant_sequences\"" << "\n";
        return;
    }

    par = "output_file_path";
    std::ofstream output_file;
    output_file.open(parameters.get_value_from_name<std::string>(par));

    // First line is the header. The header is parsed to get the sex of each field in the table.
    std::string temp = "";
    std::getline(input_file, temp);
    // Drop the carriage return of CRLF files so that the last column name can still be matched against the popmap
    if (!temp.empty() and temp.back() == '\r') temp.pop_back();
    output_file << temp << "\n"; // Copy the header line to the output file
    std::vector<std::string> line = split(temp, "\t");

    // Column number --> index in sex_count (0 = male, 1 = female, 2 = no sex)
    // Detection of individuals is based on the popmap, so individuals without sex should still be in the popmap
    std::vector<uint> sex_columns(line.size(), 2); // Columns absent from the popmap (e.g. id and sequence) are counted as no sex
    for (uint i=0; i<line.size(); ++i) {
        auto individual = popmap.find(line[i]);
        if (individual != popmap.end()) sex_columns[i] = individual->second ? 0 : 1;
    }

    // Define variables used to read the file
    char buffer[65536];
    std::string temp_line;
    uint field_n = 0, seq_count = 0;
    int sex_count[3] = {0, 0, 0}; // Index: 0 = male, 1 = female, 2 = no sex information
    double chi_squared = 0, p = 0;
    std::map<std::string, double> candidate_sequences;

    temp = ""; // The header must not leak into the first field of the first data line

    // Called at the end of each field: increment the appropriate counter if the individual has the sequence
    auto end_field = [&]() {
        if (field_n < sex_columns.size() and sex_columns[field_n] != 2 and !temp.empty() and std::stoi(temp) > min_cov) {
            ++sex_count[sex_columns[field_n]];
        }
    };

    // Called at the end of each line: test the sequence and reset the per-line state
    auto end_line = [&]() {
        end_field();
        if (sex_count[0] + sex_count[1] > 0) {
            ++seq_count;
            chi_squared = get_chi_squared(sex_count[0], sex_count[1], total_males, total_females);
            p = get_chi_squared_p(chi_squared);
            if (p < 0.05) { // First pass: we filter sequences with at least one male or one female and non-corrected p < 0.05
                candidate_sequences[temp_line] = p;
            }
        }
        temp = "";
        temp_line = "";
        field_n = 0;
        sex_count[0] = 0;
        sex_count[1] = 0;
    };

    do {

        // Read a chunk of size given by the buffer
        input_file.read(buffer, sizeof(buffer));
        const std::streamsize n_read = input_file.gcount();

        for (std::streamsize i=0; i<n_read; ++i) {

            // Read the buffer character by character
            switch(buffer[i]) {

            case '\r':
                break;

            case '\t':  // New field
                end_field();
                temp = "";
                temp_line += buffer[i];
                ++field_n;
                break;

            case '\n':  // New line (also a new field)
                end_line();
                break;

            default:
                temp += buffer[i];
                temp_line += buffer[i];
                break;
            }
        }

    } while(input_file);

    // The last line is still pending if the file does not end with a newline
    if (field_n > 0 or !temp.empty()) end_line();

    double significance_threshold = 0.05 / seq_count; // Bonferroni correction: divide threshold by number of tests

    // Second pass: filter with bonferroni
    for (const auto& sequence: candidate_sequences) {
        if (sequence.second < significance_threshold) {
            std::cout << sequence.second << "\t" << significance_threshold << "\n"; // Diagnostic output: p-value and threshold
            output_file << sequence.first << "\n";
        }
    }

    output_file.close();
    input_file.close();
}
double get_chi_squared_p(double chi_squared) {
/* p is given by 1 - P(chi_squared, df) where P is the Cumulative Distribution Function of the Chi-squared distribution.
* P is also the regularized gamma function. Here we use samtool's implementation of the regularized gamma function by Hen Li.
* Source: https://en.wikipedia.org/wiki/Chi-squared_distribution#Cumulative_distribution_function
* DF is always 1 in our case
*/
return 1 - kf_gammap(0.5, chi_squared/2);
}
double get_chi_squared(uint n_males, uint n_females, uint total_males, uint total_females) {

    /* Chi squared is computed from the number of males and females with the sequence, as well as total number of males and females in the population.
     * Yates correction is applied and the shortcut formula for 2x2 table is used:
     *     chi2 = N * max(0, |ad - bc| - N/2)^2 / (Ns * Nf * Na * Nb)
     * Source: https://en.wikipedia.org/wiki/Yates%27s_correction_for_continuity
     *
     * All quantities are handled as doubles so that N/2 is not truncated for odd N and |ad - bc| cannot wrap around.
     * Returns 0 (no evidence of association) when a marginal total is zero, as the statistic is undefined in this case.
     */

    const double N = double(total_males) + double(total_females);
    const double Ns = total_males, Nf = total_females;          // Marginal totals for sex
    const double Na = double(n_males) + double(n_females);      // Individuals with the sequence
    const double Nb = N - Na;                                   // Individuals without the sequence

    if (Ns <= 0 || Nf <= 0 || Na <= 0 || Nb <= 0) return 0.0;

    // |ad - bc| simplifies to |n_males * total_females - n_females * total_males|
    const double cross = std::fabs(double(n_males) * total_females - double(n_females) * total_males);

    double corrected = std::max(0.0, cross - N / 2);
    corrected *= corrected;

    return N * corrected / Ns / Nf / Na / Nb;
}
#pragma once
#include <string>
#include <vector>
#include <fstream>
#include <unordered_map>
#include "utils.h"
#include "parameters.h"
#include "popmap_file.h"
#include "output.h"
#include "kfun/kfun.h"
// Main function implementing the analysis: reads the input table given in the parameters and writes
// the sequences significantly associated with sex to the output file
void significant_sequences(Parameters& parameters);
// Compute Chi-square from the number of males and females with a sequence as well as total males and total females
// (2x2 table with Yates correction)
double get_chi_squared(uint n_males, uint n_females, uint total_males, uint total_females);
// Compute the p-value for a Chi-square (one degree of freedom)
double get_chi_squared_p(double chi_squared);
......@@ -8,6 +8,7 @@
#include <cmath>
#include <algorithm>
#include <unordered_map>
#include <map>
#define DTTMFMT "%Y-%m-%d %H:%M:%S"
#define DTTMSZ 21
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment