Commit e3b49cdf authored by Romain Feron's avatar Romain Feron
Browse files

Implemented fasta output for signif and subset

parent 995c5bfc
......@@ -175,6 +175,7 @@ void output_group_loci(std::string& output_file_path, std::unordered_map<std::st
// Create output file for the mapping analysis
void output_mapping(std::string& output_file_path, std::vector<MappedSequence> sequences) {
std::ofstream output_file;
......
......@@ -64,7 +64,10 @@ struct Parameter {
} else if (this->internal_type == "dir") {
DIR* dir = opendir(this->value.c_str());
return (dir != NULL);
}else {
} else if (this->name == "output_format"){
if (this->value == "fasta" or this->value == "table") return true;
return false;
} else {
return true;
}
}
......
......@@ -16,18 +16,18 @@ struct Parameters {
// Arguments: name, help message, flag, default value, type, internal type, value, required
// Flags : -h, -f, -d, -o, -u, -t, -c, -p, -b, -a, -m, -g, --min-males, --min-females, --max-males, --max-females, --min-individuals, --output-matrix, -q,
// Parameter constructor: Parameter(name, description, flag, default_value, type, internal_type, value, required)
std::vector<Parameter> list {Parameter("help", "Prints this message", "-h", "0", "bool", "bool", "", false),
Parameter("input_file_path", "Path to an input file", "-f", "", "string", "ifile", "", true),
Parameter("input_dir_path", "Path to an input directory", "-d", "", "string", "dir", "", true),
Parameter("output_file_path", "Path to an output file", "-o", "", "string", "ofile", "", true),
Parameter("output_dir_path", "Path to an output directory", "-u", "", "string", "dir", "", true),
Parameter("coverage_matrix_path", "Path to an coverage matrix file", "-a", "", "string", "ifile", "", true),
Parameter("n_threads", "Number of threads", "-t", "1", "int", "int", "", false),
Parameter("min_cov", "Minimum coverage to consider a marker", "-c", "1", "int", "int", "", false),
Parameter("max_distance", "Maximum Levenstein distance between two sequences in a locus", "-m", "1", "int", "int", "", false),
Parameter("popmap_file_path", "Path to a popmap file", "-p", "", "string", "ifile", "", true),
Parameter("barcodes_file_path", "Path to a barcodes file", "-b", "", "string", "ifile", "", true),
Parameter("genome_file_path", "Path to a genome file in fasta format", "-g", "", "string", "ifile", "", true),
std::vector<Parameter> list {Parameter("help", "Prints this message", "--help", "0", "bool", "bool", "", false),
Parameter("input_file_path", "Path to an input file", "--input-file", "", "string", "ifile", "", true),
Parameter("input_dir_path", "Path to an input directory", "--input-dir", "", "string", "dir", "", true),
Parameter("output_file_path", "Path to an output file", "--output-file", "", "string", "ofile", "", true),
Parameter("output_dir_path", "Path to an output directory", "--output-dir", "", "string", "dir", "", true),
Parameter("coverage_matrix_path", "Path to an coverage table file", "--coverage-table", "", "string", "ifile", "", true),
Parameter("n_threads", "Number of threads", "--threads", "1", "int", "int", "", false),
Parameter("min_cov", "Minimum coverage to consider a marker", "--min-cov", "1", "int", "int", "", false),
Parameter("max_distance", "Maximum Levenstein distance between two sequences in a locus", "--max-distance", "1", "int", "int", "", false),
Parameter("popmap_file_path", "Path to a popmap file", "--popmap-file", "", "string", "ifile", "", true),
Parameter("barcodes_file_path", "Path to a barcodes file", "--barcodes-file", "", "string", "ifile", "", true),
Parameter("genome_file_path", "Path to a genome file in fasta format", "--genome-file", "", "string", "ifile", "", true),
Parameter("min_males", "Minimum number of males in the subset", "--min-males", "0", "int", "int", "", false),
Parameter("min_females", "Minimum number of females in the subset", "--min-females", "0", "int", "int", "", false),
Parameter("max_males", "Maximum number of males in the subset", "--max-males", "n.males", "int", "int", "", false),
......@@ -35,8 +35,9 @@ struct Parameters {
Parameter("min_individuals", "Minimum number of individuals in the subset (overrides sex parameters)", "--min-individuals", "0", "int", "int", "", false),
Parameter("max_individuals", "Maxmimum number of individuals in the subset (overrides sex parameters)", "--max-individuals", "n.individual", "int", "int", "", false),
Parameter("output_matrix", "Output the sex distribution table as a matrix", "--output-matrix", "0", "bool", "bool", "", false),
Parameter("min_quality", "Minimum mapping quality to keep a mapped read", "-q", "20", "int", "int", "", false),
Parameter("min_frequency", "Minimum frequency of a sequence in at least one sex.", "--min-frequency", "0.25", "float", "float", "", false)
Parameter("min_quality", "Minimum mapping quality to keep a mapped read", "--min-quality", "20", "int", "int", "", false),
Parameter("min_frequency", "Minimum frequency of a sequence in at least one sex", "--min-frequency", "0.25", "float", "float", "", false),
Parameter("output_format", "Output format, either \"table\" or \"fasta\"", "--output-format", "table", "string", "string", "", false)
};
......
......@@ -55,7 +55,8 @@ class RadSex {
"max_males",
"max_females",
"min_individuals",
"max_individuals"},
"max_individuals",
"output_format"},
subset)},
{"freq",
Analysis("freq",
......@@ -80,7 +81,8 @@ class RadSex {
std::vector<std::string> {"input_file_path",
"output_file_path",
"popmap_file_path",
"min_cov"},
"min_cov",
"output_format"},
significant_sequences)},
{"map",
......
......@@ -20,6 +20,9 @@ void significant_sequences(Parameters& parameters) {
par = "min_cov";
int min_cov = parameters.get_value_from_name<int>(par) - 1; // -1 allows comparison with > instead of >=
par = "output_format";
bool fasta = parameters.get_value_from_name<std::string>(par) == "fasta";
if (input_file) {
par = "output_file_path";
......@@ -30,7 +33,7 @@ void significant_sequences(Parameters& parameters) {
std::vector<std::string> line;
std::string temp = "";
std::getline(input_file, temp);
output_file << temp << "\n"; // Copy the header line to the subset output file
if (not fasta) output_file << temp << "\n"; // Copy the header line to the subset output file
line = split(temp, "\t");
// Map with column number --> index of sex_count (0 = male, 1 = female, 2 = no sex)
......@@ -56,7 +59,7 @@ void significant_sequences(Parameters& parameters) {
int sex_count[3] = {0, 0, 0}; // Index: 0 = male, 1 = female, 2 = no sex information
double chi_squared = 0, p = 0;
std::map<std::string, double> candidate_sequences;
std::map<std::string, double[3]> candidate_sequences;
do {
......@@ -84,7 +87,9 @@ void significant_sequences(Parameters& parameters) {
chi_squared = get_chi_squared(sex_count[0], sex_count[1], total_males, total_females);
p = get_chi_squared_p(chi_squared);
if (p < 0.05) { // First pass: we filter sequences with at least one male or one female and non-corrected p < 0.05
candidate_sequences[temp_line] = p;
candidate_sequences[temp_line][0] = p;
candidate_sequences[temp_line][1] = sex_count[0];
candidate_sequences[temp_line][2] = sex_count[1];
}
}
// Reset variables
......@@ -107,8 +112,13 @@ void significant_sequences(Parameters& parameters) {
// Second pass: filter with bonferroni
for (auto sequence: candidate_sequences) {
if (sequence.second < significance_threshold) {
output_file << sequence.first << "\n";
if (sequence.second[0] < significance_threshold) {
if (fasta) {
line = split(sequence.first, "\t");
output_file << ">" << line[0] << "_" << int(sequence.second[1]) << "M_" << int(sequence.second[2]) << "F_cov:" << min_cov + 1 << "_p:" << sequence.second[0] << "\n" << line[1] << "\n";
} else {
output_file << sequence.first << "\n";
}
}
}
......
......@@ -35,6 +35,9 @@ void subset(Parameters& parameters) {
par = "max_individuals";
int max_individuals = parameters.get_value_from_name<int>(par) + 1; // +1 allows comparison with < instead of <=
par = "output_format";
bool fasta = parameters.get_value_from_name<std::string>(par) == "fasta";
if (input_file) {
par = "output_file_path";
......@@ -45,7 +48,7 @@ void subset(Parameters& parameters) {
std::vector<std::string> line;
std::string temp = "";
std::getline(input_file, temp);
output_file << temp << "\n"; // Copy the header line to the subset output file
if (not fasta) output_file << temp << "\n"; // Copy the header line to the subset output file
line = split(temp, "\t");
// Map with column number --> index of sex_count (0 = male, 1 = female, 2 = no sex)
......@@ -93,9 +96,13 @@ void subset(Parameters& parameters) {
case '\n': // New line (also a new field)
if (sex_columns[field_n] != 2 and std::stoi(temp) > min_cov) ++sex_count[sex_columns[field_n]]; // Increment the appropriate counter
n_individuals = sex_count[0] + sex_count[1] + sex_count[2];
if (sex_count[0] > min_males and sex_count[0] < max_males and sex_count[1] > min_females and sex_count[1] < max_females and
n_individuals > min_individuals and n_individuals < max_individuals) {
output_file << temp_line << "\n";
if (sex_count[0] > min_males and sex_count[0] < max_males and sex_count[1] > min_females and sex_count[1] < max_females and n_individuals > min_individuals and n_individuals < max_individuals) {
if (fasta) {
line = split(temp_line, "\t");
output_file << ">" << line[0] << "_" << sex_count[0] << "M_" << sex_count[1] << "F_cov:" << min_cov + 1 << "\n" << line[1] << "\n";
} else {
output_file << temp_line << "\n";
}
}
// Reset variables
temp = "";
......
......@@ -11,6 +11,8 @@ uint8_t revcomp_table[] = {
'p', 'q', 'y', 's', 'a', 'a', 'b', 'w', 'x', 'r', 'z', 123, 124, 125, 126, 127
};
// Output current date and time in format specified in utils.h
char* print_time (char *buff) {
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment