Commit 5b71a432 authored by Romain Feron
Browse files

Optimized multithreading

parent 071a21c0
# Filter a tab-separated coverage table down to loci seen in both sexes.
#
# Inputs (paths are relative to the working directory):
#   ./test.tsv    - header row of column names, then one row per locus;
#                   column 0 is written out as the locus, column 1 as the
#                   sequence, and columns whose header matches a popmap
#                   name are parsed as integer coverage values.
#   ./popmap.tsv  - one "<name>\t<sex>" pair per line, sex being 'M' or 'F'.
# Output:
#   ./haplotypes.tsv - Locus / Males / Females / Sequence for every locus
#                      that passes the thresholds below.

min_males = 1
min_females = 1
min_cov = 0

with open('./test.tsv') as input_file:
    # rstrip('\n') instead of [:-1]: a final line without a trailing newline
    # must not lose its last real character.
    header = input_file.readline().rstrip('\n').split('\t')

    with open('./popmap.tsv') as popmap_file:
        popmap = {}
        for raw_line in popmap_file:
            entry = raw_line.rstrip('\n')
            if not entry:
                # Skip blank lines.
                continue
            fields = entry.split('\t')
            popmap[fields[0]] = fields[1]

    sexes = list(popmap.values())
    n_males = sexes.count('M')
    n_females = sexes.count('F')

    # Sets give constant-time membership tests in the per-row loop below.
    # Strings are compared with ==, not `is`: identity of equal strings is an
    # interpreter implementation detail.
    males_col = {i for i, h in enumerate(header) if popmap.get(h) == 'M'}
    females_col = {i for i, h in enumerate(header) if popmap.get(h) == 'F'}
    print(n_females, n_males)

    with open('./haplotypes.tsv', 'w') as output_file:
        output_file.write('\t'.join(('Locus', 'Males', 'Females', 'Sequence')) + '\n')
        for i, line in enumerate(input_file):
            # Progress indicator for very large inputs.
            if i % 1000000 == 0:
                print(str(i))
            temp = line.rstrip('\n').split('\t')
            # Count individuals of each sex whose coverage exceeds min_cov.
            # int() is only applied to columns that belong to a known
            # individual, so non-numeric columns (locus, sequence) are safe.
            females = sum(1 for j, t in enumerate(temp)
                          if j in females_col and int(t) > min_cov)
            males = sum(1 for j, t in enumerate(temp)
                        if j in males_col and int(t) > min_cov)
            # NOTE(review): the comparison is strict, so a locus needs MORE
            # than min_males / min_females individuals - confirm intended.
            if males > min_males and females > min_females:
                output_file.write('\t'.join((str(temp[0]),
                                             str(males),
                                             str(females),
                                             temp[1])) + '\n')
......@@ -33,6 +33,7 @@ void file_processor(std::vector<InputFile>& input_files, std::unordered_map<std:
if (not it->processed) {
it->processed = true;
remaining_files = true;
files_mutex.unlock();
process_file(*it, results, results_mutex);
break;
} else {
......
......@@ -3,9 +3,9 @@
void process_file(InputFile file, std::unordered_map<std::string, std::unordered_map<std::string, uint16_t>>& results, std::mutex& results_mutex) {
std::unordered_map<std::string, uint16_t> temp_results;
igzstream input_file(file.path.c_str());
std::string line;
bool write_next_line = false;
while(std::getline(input_file, line)) {
switch (line[0]) {
......@@ -17,13 +17,19 @@ void process_file(InputFile file, std::unordered_map<std::string, std::unordered
break;
default:
if (write_next_line) {
results_mutex.lock();
++results[line][file.individual_name];
results_mutex.unlock();
++temp_results[line];
}
break;
}
}
results_mutex.lock();
for (auto sequence : temp_results) {
results[sequence.first][file.individual_name] += sequence.second;
}
results_mutex.unlock();
std::cout << " - Finished processing individual : " + file.individual_name + "." << std::endl;
return;
}
......@@ -2,6 +2,7 @@
#include <mutex>
#include <unordered_map>
#include "gzstream.h"
#include "zlib.h"
#include "input_file.h"
#include "utils.h"
......
......@@ -7,7 +7,7 @@ SOURCES += \
src/main.cpp \
src/arg_parser.cpp \
src/utils.cpp \
analysis.cpp \
src/analysis.cpp \
src/analysis.cpp \
src/input_dir.cpp \
src/process_file.cpp \
......@@ -18,7 +18,7 @@ HEADERS += \
src/arg_parser.h \
src/parameters.h \
src/utils.h \
analysis.h \
src/analysis.h \
src/analysis.h \
src/input_dir.h \
src/process_file.h \
......
# Timing harness: runs the same input twice, differing only in the -t value,
# and prints wall-clock timestamps around each run.
# NOTE(review): -t is presumably the thread count - confirm against arg_parser.

# Run 1: -t 1
date
time ./bin/stacks_replacement -i ./test/samples -o test/test.tsv -t 1
date
# Run 2: -t 3 (writes to the same output file as run 1)
date
time ./bin/stacks_replacement -i ./test/samples -o test/test.tsv -t 3
date
## Current implementation
Threads: 1
Start: 08:36:36
End: 08:40:36
--> 240s
Threads: 3
Start: 08:40:36
End: 08:44:36
--> 240s
## Just reading the files
Threads: 1
Start: 08:45:28
End: 08:47:16
--> 108s
Threads: 3
Start: 08:47:16
End: 08:48:00
--> 44s
## Copy map at the end
Threads: 1
Start: 09:27:20
End: 09:31:15
--> 235s
Threads: 3
Start: 09:31:15
End: 09:34:07
--> 172s
## Buffer 2048 + Copy map at the end
Threads: 1
Start: 10:11:01
End: 10:15:23
--> 262s
Threads: 3
Start: 10:15:23
End: 10:18:32
--> 189s
## Buffer 65536 + Copy map at the end
Threads: 1
Start: 10:20:32
End: 10:24:49
--> 257s
Threads: 3
Start: 10:24:49
End: 10:27:41
--> 172s
## Buffer 1048576 + Copy map at the end
Threads: 1
Start: 10:35:58
End: 10:40:17
--> 259s
Threads: 3
Start: 10:40:17
End: 10:43:15
--> 178s
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment