Commit 3f53a6e9 authored by Romain Feron's avatar Romain Feron
Browse files

Removed unused code files, cleaned up file and function names, merged some...

Removed unused code files, cleaned up file and function names, merged some files for better organization
parent 3796d418
TEMPLATE = app
CONFIG += console c++11
CONFIG -= app_bundle
CONFIG -= qt
SOURCES += main.cpp \
include/edlib/edlib.cpp \
include/kfun/kfun.cpp \
src/barcodes_file.cpp \
src/demultiplexing.cpp \
src/frequencies.cpp \
src/group_loci.cpp \
src/main.cpp \
src/mapping.cpp \
src/output.cpp \
src/parameters.cpp \
src/popmap_file.cpp \
src/process_reads.cpp \
src/radsex.cpp \
src/scaffold_lengths.cpp \
src/sequence_file.cpp \
src/sex_distribution.cpp \
src/significant_sequences.cpp \
src/stats.cpp \
src/subset.cpp \
src/utils.cpp \
include/bwa/bntseq.c \
include/bwa/bwa.c \
include/bwa/bwamem.c \
include/bwa/bwamem_extra.c \
include/bwa/bwamem_pair.c \
include/bwa/bwt.c \
include/bwa/bwt_gen.c \
include/bwa/bwtindex.c \
include/bwa/is.c \
include/bwa/kstring.c \
include/bwa/ksw.c \
include/bwa/kthread.c \
include/bwa/malloc_wrap.c \
include/bwa/QSufSort.c \
include/bwa/rle.c \
include/bwa/rope.c \
include/bwa/utils.c \
src/depth.cpp
HEADERS += \
include/bwa/bntseq.h \
include/bwa/bwa.h \
include/bwa/bwamem.h \
include/bwa/bwt.h \
include/bwa/kbtree.h \
include/bwa/khash.h \
include/bwa/kseq.h \
include/bwa/ksort.h \
include/bwa/kstring.h \
include/bwa/ksw.h \
include/bwa/kvec.h \
include/bwa/malloc_wrap.h \
include/bwa/QSufSort.h \
include/bwa/rle.h \
include/bwa/rope.h \
include/bwa/utils.h \
include/edlib/edlib.h \
include/kfun/kfun.h \
include/kseq/kseq.h \
src/analysis.h \
src/barcodes_file.h \
src/demultiplexing.h \
src/frequencies.h \
src/group_loci.h \
src/mapping.h \
src/output.h \
src/parameter.h \
src/parameters.h \
src/popmap_file.h \
src/process_reads.h \
src/radsex.h \
src/scaffold_lengths.h \
src/sequence_file.h \
src/sex_distribution.h \
src/significant_sequences.h \
src/stats.h \
src/subset.h \
src/utils.h \
src/depth.h
......@@ -3,7 +3,6 @@
#include "parameters.h"
struct Analysis {
/* The Analysis structure stores information about an analysis:
......
#include "barcodes_file.h"
std::unordered_map<std::string, std::string> load_barcodes(Parameters& parameters) {
/* Load a barcodes file.
* Input: a tabulated file with columns Barcode Name
* Data is stored in a map of structure Barcode (string) Name (string)
* TODO (maybe): check that barcodes are actually DNA
*/
std::ifstream file;
std::string par = "barcodes_file_path";
file.open(parameters.get_value_from_name<std::string>(par));
std::unordered_map<std::string, std::string> results;
if (file) {
std::string line;
std::vector<std::string> fields;
while(std::getline(file, line)) {
fields = split(line, "\t");
if (fields.size() == 2) { // Only 2 columns as it should be
results[fields[0]] = fields[1];
}
}
} else {
std::cout << "**Error: cannot open barcodes file \"" << parameters.get_value_from_name<std::string>(par) << "\"";
exit(0);
}
return results;
}
#pragma once
#include <unordered_map>
#include "parameters.h"
#include "utils.h"
// Load a barcodes file
std::unordered_map<std::string, std::string> load_barcodes(Parameters& parameters);
......@@ -5,7 +5,7 @@
#include <unordered_map>
#include "utils.h"
#include "parameters.h"
#include "popmap_file.h"
#include "popmap.h"
#include "output.h"
......
#include "sex_distribution.h"
#include "distrib.h"
void sex_distribution(Parameters& parameters) {
void distrib(Parameters& parameters) {
/* The sex_distribution function parses through a file generated by process_reads and checks for each sequence
* the number of males and females in which the sequence was found. The output is a table with five columns:
......
......@@ -5,9 +5,9 @@
#include <unordered_map>
#include "utils.h"
#include "parameters.h"
#include "popmap_file.h"
#include "popmap.h"
#include "output.h"
// Main function implementing the analysis
void sex_distribution(Parameters& parameters);
void distrib(Parameters& parameters);
#include "frequencies.h"
#include "freq.h"
void frequencies(Parameters &parameters) {
void freq(Parameters &parameters) {
/* The frequencies function parses through a file generated by process_reads to compute the distribution of markers
* between individuals. The output file has the following structure:
......
......@@ -5,9 +5,9 @@
#include <map>
#include "utils.h"
#include "parameters.h"
#include "popmap_file.h"
#include "popmap.h"
#include "output.h"
// Main function implementing the analysis
void frequencies(Parameters& parameters);
void freq(Parameters& parameters);
#include "group_loci.h"
#include "loci.h"
void group_loci(Parameters& parameters) {
void loci(Parameters& parameters) {
std::mutex results_mutex;
......@@ -12,38 +12,50 @@ void group_loci(Parameters& parameters) {
par = "min_cov";
int min_cov = parameters.get_value_from_name<int>(par) - 1; // -1 to compare with > instead of >= later
std::cout << " - Loading coverage matrix :\n" << std::endl;
par = "freq_het";
float freq_het = parameters.get_value_from_name<float>(par);
par = "freq_hom";
float freq_hom = parameters.get_value_from_name<float>(par);
par = "range_het";
float range_het = parameters.get_value_from_name<float>(par);
par = "range_hom";
float range_hom = parameters.get_value_from_name<float>(par);
std::vector<std::string> header;
std::vector<Locus> coverage_matrix = load_coverage_matrix(coverage_matrix_path, min_cov, true, header);
par = "input_file_path";
std::string input_file_path = parameters.get_value_from_name<std::string>(par);
std::vector<Locus> sequences = load_coverage_matrix(input_file_path, 1, false, header);
// std::cout << " - Loading subset of loci :" << std::endl;
// par = "input_file_path";
// std::string input_file_path = parameters.get_value_from_name<std::string>(par);
// std::vector<Locus> sequences = load_coverage_matrix(input_file_path, 1, false, header);
par = "max_distance";
int max_distance = parameters.get_value_from_name<int>(par) + 1; // +1 to compare with < instead of <= later
// std::cout << " - Loading coverage table :" << std::endl;
std::cout << std::endl << " - Starting alignments" << std::endl;
// std::vector<Locus> coverage_matrix = load_coverage_matrix(coverage_matrix_path, min_cov, true, header);
auto sequence = sequences.begin();
// par = "max_distance";
// int max_distance = parameters.get_value_from_name<int>(par) + 1; // +1 to compare with < instead of <= later
// Create and run all sequence processors
par = "n_threads";
std::vector<std::thread> threads;
for (int i=0; i<parameters.get_value_from_name<int>(par); ++i) {
threads.push_back(std::thread(sequence_processor, std::ref(sequence), std::ref(sequences), std::ref(coverage_matrix), std::ref(results), std::ref(results_mutex), max_distance));
}
// std::cout << std::endl << " - Starting alignments" << std::endl;
// auto sequence = sequences.begin();
for (auto &t : threads) t.join();
// // Create and run all sequence processors
// par = "n_threads";
// std::vector<std::thread> threads;
// for (int i=0; i<parameters.get_value_from_name<int>(par); ++i) {
// threads.push_back(std::thread(sequence_processor, std::ref(sequence), std::ref(sequences), std::ref(coverage_matrix), std::ref(results), std::ref(results_mutex), max_distance));
// }
par = "output_file_path";
std::string output_file_path = parameters.get_value_from_name<std::string>(par);
output_group_loci(output_file_path, results, header);
// for (auto &t : threads) t.join();
// par = "output_file_path";
// std::string output_file_path = parameters.get_value_from_name<std::string>(par);
// output_group_loci(output_file_path, results, header);
}
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print, std::vector<std::string>& header) {
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print, std::vector<std::string>& header, float freq_het, float freq_hom, float range_het, float range_hom) {
/* Load a coverage matrix in memory
*/
......@@ -56,7 +68,7 @@ std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, boo
// First line is the header. The header is used when printing the output.
std::string temp;
std::getline(input_file, temp);
if (print) header = split(temp,"\t");
header = split(temp,"\t");
temp = "";
// Define variables used to read the file
......@@ -69,52 +81,60 @@ std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, boo
// Read a chunk of size given by the buffer
input_file.read(buffer, sizeof(buffer));
k = input_file.gcount();
k = static_cast<uint>(input_file.gcount());
for (uint i=0; i<k; ++i) {
// Read the buffer character by character
switch(buffer[i]) {
case '\r':
break;
case '\t': // New field
switch(field_n) {
case 0:
locus.id = temp;
case '\t': // New field
switch(field_n) {
case 0:
locus.id = temp;
break;
case 1:
locus.sequence =temp;
break;
default:
locus.coverage.push_back(temp);
break;
}
temp = "";
++field_n;
break;
case 1:
locus.sequence =temp;
case '\n': // New line (also a new field)
locus.coverage.push_back(temp);
keep_locus = false;
for (auto c: locus.coverage) {
if (std::stoi(c) > min_cov) {
keep_locus = true;
break;
}
}
if (keep_locus) coverage_matrix.push_back(locus);
if (seq_count % 1000000 == 0 and seq_count > 0 and print) std::cout << " - Processed " << seq_count / 1000000 <<
" M. lines and stored " << coverage_matrix.size() << " sequences\n";
// Reset variables
locus.id = "";
locus.sequence = "";
locus.coverage.resize(0);
temp = "";
field_n = 0;
++seq_count;
break;
default:
locus.coverage.push_back(temp);
temp += buffer[i];
break;
}
temp = "";
++field_n;
break;
case '\n': // New line (also a new field)
locus.coverage.push_back(temp);
keep_locus = false;
for (auto c: locus.coverage) {
if (std::stoi(c) > min_cov) {
keep_locus = true;
break;
}
}
if (keep_locus) coverage_matrix.push_back(locus);
if (seq_count % 1000000 == 0 and print) std::cout << "\t- Processed " << seq_count / 1000000 << " M. lines and stored " << coverage_matrix.size() << " sequences\n";
// Reset variables
locus.id = "";
locus.sequence = "";
locus.coverage.resize(0);
temp = "";
field_n = 0;
++seq_count;
break;
default:
temp += buffer[i];
break;
}
}
} while (input_file);
......@@ -143,7 +163,7 @@ void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locu
void process_sequence(Locus& locus, std::vector<Locus>& coverage_matrix, std::unordered_map<std::string, std::vector<Locus>>& results, std::mutex& results_mutex, int max_distance) {
uint seq_len = locus.sequence.size();
uint seq_len = static_cast<uint>(locus.sequence.size());
std::unordered_map<std::string, std::vector<Locus>> temp_results;
const char* seq = locus.sequence.c_str();
......
......@@ -7,16 +7,16 @@
#include <mutex>
#include "utils.h"
#include "parameters.h"
#include "popmap_file.h"
#include "popmap.h"
#include "output.h"
#include "edlib/edlib.h"
// Main function implementing the analysis
void group_loci(Parameters& parameters);
void loci(Parameters& parameters);
// Load the coverage matrix in memory
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print, std::vector<std::string>& header);
std::vector<Locus> load_coverage_matrix(std::string& file_path, int min_cov, bool print, std::vector<std::string>& header, float freq_het, float freq_hom, float range_het, float range_hom);
// Pick the next sequence from the sequences vector and process it
void sequence_processor(std::vector<Locus>::iterator& sequence, std::vector<Locus>& sequences, std::vector<Locus>& coverage_matrix,
......
#include "radsex.h"
#include "mapping.h"
#include "map.h"
int main(int argc, char* argv[]) {
......
#include "mapping.h"
#include "map.h"
void mapping(Parameters& parameters) {
void map(Parameters& parameters) {
/* The mapping function maps all sequences from a coverage table (obtained from process, subset, or signif functions) to a reference genome and
* generates an output file with the following format:
......@@ -218,3 +218,33 @@ void mapping(Parameters& parameters) {
free(opt);
bwa_idx_destroy(index);
}
void scaffold_lengths(const std::string& genome_file_path) {
std::ifstream genome_file(genome_file_path);
std::ofstream lengths_file(genome_file_path + ".lengths");
std::string line, scaffold_name;
uint scaffold_length = 0;
bool start = true;
while(std::getline(genome_file, line)) {
if (line[0] == '>') {
if (not start) {
lengths_file << scaffold_name << "\t" << scaffold_length << "\n";
} else {
start = false;
}
scaffold_name = split(line, " ")[0];
scaffold_name = scaffold_name.substr(1, scaffold_name.size());
scaffold_length = 0;
} else {
scaffold_length += line.size();
}
}
lengths_file << scaffold_name << "\t" << scaffold_length << "\n";
genome_file.close();
lengths_file.close();
}
#pragma once
#include "utils.h"
#include "parameters.h"
#include "popmap_file.h"
#include "popmap.h"
#include "output.h"
#include "stats.h"
#include "scaffold_lengths.h"
#include "bwa/bwamem.h"
#include "bwa/kseq.h"
void mapping(Parameters& parameters);
void map(Parameters& parameters);
void scaffold_lengths(const std::string& genome_file_path);
......@@ -4,7 +4,6 @@
#include <dirent.h>
struct Parameter {
/* The Parameter structure stores the information necessary to use and process the parameters
......
......@@ -37,7 +37,11 @@ struct Parameters {
Parameter("output_matrix", "Output the sex distribution table as a matrix", "--output-matrix", "0", "bool", "bool", "", false),
Parameter("min_quality", "Minimum mapping quality to keep a mapped read", "--min-quality", "20", "int", "int", "", false),
Parameter("min_frequency", "Minimum frequency of a sequence in at least one sex", "--min-frequency", "0.25", "float", "float", "", false),
Parameter("output_format", "Output format, either \"table\" or \"fasta\"", "--output-format", "table", "string", "string", "", false)
Parameter("output_format", "Output format, either \"table\" or \"fasta\"", "--output-format", "table", "string", "string", "", false),
Parameter("freq_het", "Frequency of a heretozygous allele", "--freq-het", "0.5", "float", "float", "", false),
Parameter("freq_hom", "Frequency of a homozygous allele", "--freq-hom", "1", "float", "float", "", false),
Parameter("range_het", "Range for the frequency of a heretozygous allele", "--range-het", "0.1", "float", "float", "", false),
Parameter("range_hom", "Range for the frequency of a homozygous allele", "--range-hom", "0.1", "float", "float", "", false)
};
......
#include "popmap_file.h"
#include "popmap.h"
......
#include "process_reads.h"
#include "process.h"
KSEQ_INIT(gzFile, gzread) // Kseq init macro
// List of currently supported file types with extensions
std::vector<std::string> extensions {".fq.gz", ".fq", ".fastq.gz", ".fastq", ".fasta.gz", ".fasta", ".fa.gz", ".fa", ".fna.gz", ".fna"};
void process_reads(Parameters& parameters) {
std::vector<InputFile> get_input_files(const std::string& input_dir_path) {
/* Gets all relevant files (from supported formats) from the input directory.
* Files are stored as InputFile object which contains all relevant information about the file.
* The function returns a vector of InputFiles.
*/
DIR* dir = opendir(input_dir_path.c_str());
if(!dir) {
std::cout << std::endl << "** Error: could not read the directory \"" << input_dir_path << "\"" << std::endl;
exit(0);
}
struct dirent* dir_content;
std::vector<InputFile> files;
std::string current_file, extension, individual_name;
std::vector<std::string> split_name;
InputFile temp;
while ((dir_content = readdir(dir))) {
current_file = dir_content->d_name;
split_name = split(current_file, ".");
size_t s = split_name.size();
extension = "";
individual_name = split_name[0];
// Get file name and extension (even when there is "." in the file name)
if (s > 1) {
if (split_name[s - 1] == "gz" and s > 2) {
extension = "." + split_name[s - 2] + "." + split_name[s - 1];
for (uint i=1; i<split_name.size() - 2; ++i) individual_name += "." + split_name[i];
} else {
extension = "." + split_name[s - 1];
for (uint i=1; i<split_name.size() - 1; ++i) individual_name += "." + split_name[i];
}
}
if(std::find(extensions.begin(), extensions.end(), extension) != extensions.end()) {
temp.individual_name = individual_name;
temp.path = input_dir_path + current_file;
temp.extension = extension;
temp.processed = false;
files.push_back(temp);
}
}
if (files.size() == 0) {
std::cout << " ** Error: no valid input file found in input directory \"" << input_dir_path <<"\"." << std::endl;
exit(1);
}
return files;
}
void process(Parameters& parameters) {
/* The process_reads function does the following:
* - retrieve all sequence files from the input directory
......
#pragma once
#include <iostream>
#include <mutex>
#include <string>
#include <unordered_map>
#include <thread>
#include "utils.h"
#include "parameters.h"
#include "sequence_file.h"
#include "output.h"
#include <zlib.h>
#include "kseq/kseq.h"
#include "output.h"
#include "parameters.h"
#include "utils.h"
struct InputFile {
/* The InputFile structure stores information about an input file:
* - Path to the file