subset.cpp 5.08 KB
Newer Older
1
2
3
4
5
#include "subset.h"


void subset(Parameters& parameters) {

Romain Feron's avatar
Romain Feron committed
6
7
8
    /* The subset function parses through a file generated by process_reads and outputs sequences with the following criteria:
     * - Found in M males with min_males <= M <= max_males
     * - Found in F females with min_females <= F <= max_females
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
     */

    std::unordered_map<std::string, bool> popmap = load_popmap(parameters);

    std::string par = "input_file_path";
    std::ifstream input_file;
    input_file.open(parameters.get_value_from_name<std::string>(par));

    par = "min_cov";
    int min_cov = parameters.get_value_from_name<int>(par) - 1; // -1 allows comparison with > instead of >=

    par = "min_males";
    int min_males = parameters.get_value_from_name<int>(par) - 1; // -1 allows comparison with > instead of >=

    par = "min_females";
    int min_females = parameters.get_value_from_name<int>(par) - 1; // -1 allows comparison with > instead of >=

    par = "max_males";
    int max_males = parameters.get_value_from_name<int>(par) + 1; // +1 allows comparison with < instead of <=

    par = "max_females";
    int max_females = parameters.get_value_from_name<int>(par) + 1; // +1 allows comparison with < instead of <=

32
33
34
35
36
37
    par = "min_individuals";
    int min_individuals = parameters.get_value_from_name<int>(par) - 1; // -1 allows comparison with > instead of >=

    par = "max_individuals";
    int max_individuals = parameters.get_value_from_name<int>(par) + 1; // +1 allows comparison with < instead of <=

38
39
40
    par = "output_format";
    bool fasta = parameters.get_value_from_name<std::string>(par) == "fasta";

41
42
43
44
45
46
47
48
49
50
    if (input_file) {

        par = "output_file_path";
        std::ofstream output_file;
        output_file.open(parameters.get_value_from_name<std::string>(par));

        // First line is the header. The header is parsed to get the sex of each field in the table.
        std::vector<std::string> line;
        std::string temp = "";
        std::getline(input_file, temp);
51
        if (not fasta) output_file << temp << "\n"; // Copy the header line to the subset output file
52
53
54
55
56
        line = split(temp, "\t");

        // Map with column number --> index of sex_count (0 = male, 1 = female, 2 = no sex)
        std::unordered_map<uint, uint> sex_columns;

57
        // Detection of individuals is based on the popmap, so individuals without sex should still be in the popmap
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
        for (uint i=0; i<line.size(); ++i) {
            if (popmap.find(line[i]) != popmap.end()) {
                if (popmap[line[i]]) {
                    sex_columns[i] = 0; // Male --> column 0
                } else {
                    sex_columns[i] = 1; // Female --> column 1
                }
            } else {
                sex_columns[i] = 2; // First and second columns (id and sequence) are counted as no sex
            }
        }

        // Define variables used to read the file
        char buffer[65536];
        std::string temp_line;
        uint k = 0, field_n = 0;
        int sex_count[3] = {0, 0, 0}; // Index: 0 = male, 1 = female, 2 = no sex
75
        int n_individuals = 0;
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97

        do {

            // Read a chunk of size given by the buffer
            input_file.read(buffer, sizeof(buffer));
            k = input_file.gcount();

            for (uint i=0; i<k; ++i) {

                // Read the buffer character by character
                switch(buffer[i]) {

                case '\r':
                    break;
                case '\t':  // New field
                    if (sex_columns[field_n] != 2 and std::stoi(temp) > min_cov) ++sex_count[sex_columns[field_n]];  // Increment the appropriate counter
                    temp = "";
                    temp_line += buffer[i];
                    ++field_n;
                    break;
                case '\n':  // New line (also a new field)
                    if (sex_columns[field_n] != 2 and std::stoi(temp) > min_cov) ++sex_count[sex_columns[field_n]];  // Increment the appropriate counter
98
                    n_individuals = sex_count[0] + sex_count[1] + sex_count[2];
99
100
101
102
103
104
105
                    if (sex_count[0] > min_males and sex_count[0] < max_males and sex_count[1] > min_females and sex_count[1] < max_females and n_individuals > min_individuals and n_individuals < max_individuals) {
                        if (fasta) {
                            line = split(temp_line, "\t");
                            output_file << ">" << line[0] << "_" << sex_count[0] << "M_" << sex_count[1] << "F_cov:" << min_cov + 1 << "\n" << line[1] << "\n";
                        } else {
                            output_file << temp_line << "\n";
                        }
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
                    }
                    // Reset variables
                    temp = "";
                    temp_line = "";
                    field_n = 0;
                    sex_count[0] = 0;
                    sex_count[1] = 0;
                    break;
                default:
                    temp += buffer[i];
                    temp_line += buffer[i];
                    break;
                }
            }

        } while (input_file);

        output_file.close();
        input_file.close();
    }
}