Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
khalid
radsex
Commits
7729fcb7
Commit
7729fcb7
authored
Feb 07, 2018
by
Romain Feron
Browse files
Implemented locus grouping analysis
parent
0c13bdac
Changes
8
Hide whitespace changes
Inline
Side-by-side
radsex.pro
View file @
7729fcb7
...
...
@@ -18,7 +18,8 @@ HEADERS += \
src
/
subset
.
h
\
src
/
frequencies
.
h
\
src
/
demultiplexing
.
h
\
src
/
barcodes_file
.
h
src
/
barcodes_file
.
h
\
src
/
group_loci
.
h
SOURCES
+=
\
src
/
main
.
cpp
\
...
...
@@ -32,4 +33,5 @@ SOURCES += \
src
/
subset
.
cpp
\
src
/
frequencies
.
cpp
\
src
/
demultiplexing
.
cpp
\
src
/
barcodes_file
.
cpp
src
/
barcodes_file
.
cpp
\
src
/
group_loci
.
cpp
src/group_loci.cpp
0 → 100644
View file @
7729fcb7
#include "group_loci.h"
void
group_loci
(
Parameters
&
parameters
)
{
std
::
mutex
results_mutex
;
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
Locus
>>
results
;
std
::
string
par
=
"coverage_matrix_path"
;
std
::
string
coverage_matrix_path
=
parameters
.
get_value_from_name
<
std
::
string
>
(
par
);
par
=
"min_cov"
;
int
min_cov
=
parameters
.
get_value_from_name
<
int
>
(
par
)
-
1
;
// -1 to compare with > instead of >= later
std
::
cout
<<
" - Loading coverage matrix :
\n
"
<<
std
::
endl
;
std
::
vector
<
Locus
>
coverage_matrix
=
load_coverage_matrix
(
coverage_matrix_path
,
min_cov
,
true
);
par
=
"input_file_path"
;
std
::
string
input_file_path
=
parameters
.
get_value_from_name
<
std
::
string
>
(
par
);
std
::
vector
<
Locus
>
sequences
=
load_coverage_matrix
(
input_file_path
,
1
,
false
);
par
=
"max_distance"
;
int
max_distance
=
parameters
.
get_value_from_name
<
int
>
(
par
)
+
1
;
// +1 to compare with < instead of <= later
std
::
cout
<<
std
::
endl
<<
" - Starting alignments"
<<
std
::
endl
;
auto
sequence
=
sequences
.
begin
();
// Create and run all sequence processors
par
=
"n_threads"
;
std
::
vector
<
std
::
thread
>
threads
;
for
(
int
i
=
0
;
i
<
parameters
.
get_value_from_name
<
int
>
(
par
);
++
i
)
{
threads
.
push_back
(
std
::
thread
(
sequence_processor
,
std
::
ref
(
sequence
),
std
::
ref
(
sequences
),
std
::
ref
(
coverage_matrix
),
std
::
ref
(
results
),
std
::
ref
(
results_mutex
),
max_distance
));
}
for
(
auto
&
t
:
threads
)
t
.
join
();
par
=
"output_file_path"
;
std
::
string
output_file_path
=
parameters
.
get_value_from_name
<
std
::
string
>
(
par
);
output_group_loci
(
output_file_path
,
results
);
}
std
::
vector
<
Locus
>
load_coverage_matrix
(
std
::
string
&
file_path
,
int
min_cov
,
bool
print
)
{
/* Load a coverage matrix in memory
*/
std
::
ifstream
input_file
(
file_path
);
std
::
vector
<
Locus
>
coverage_matrix
;
if
(
input_file
)
{
// First line is the header. The header is not used here.
std
::
string
temp
;
std
::
getline
(
input_file
,
temp
);
temp
=
""
;
// Define variables used to read the file
char
buffer
[
65536
];
uint
k
=
0
,
field_n
=
0
,
seq_count
=
0
;
Locus
locus
;
bool
keep_locus
=
false
;
do
{
// Read a chunk of size given by the buffer
input_file
.
read
(
buffer
,
sizeof
(
buffer
));
k
=
input_file
.
gcount
();
for
(
uint
i
=
0
;
i
<
k
;
++
i
)
{
// Read the buffer character by character
switch
(
buffer
[
i
])
{
case
'\r'
:
break
;
case
'\t'
:
// New field
switch
(
field_n
)
{
case
0
:
locus
.
id
=
temp
;
break
;
case
1
:
locus
.
sequence
=
temp
;
break
;
default:
locus
.
coverage
.
push_back
(
temp
);
break
;
}
temp
=
""
;
++
field_n
;
break
;
case
'\n'
:
// New line (also a new field)
locus
.
coverage
.
push_back
(
temp
);
keep_locus
=
false
;
for
(
auto
c
:
locus
.
coverage
)
{
if
(
std
::
stoi
(
c
)
>
min_cov
)
{
keep_locus
=
true
;
break
;
}
}
if
(
keep_locus
)
coverage_matrix
.
push_back
(
locus
);
if
(
seq_count
%
1000000
==
0
and
print
)
std
::
cout
<<
"
\t
- Processed "
<<
seq_count
/
1000000
<<
" M. lines and stored "
<<
coverage_matrix
.
size
()
<<
" sequences
\n
"
;
// Reset variables
locus
.
id
=
""
;
locus
.
sequence
=
""
;
locus
.
coverage
.
resize
(
0
);
temp
=
""
;
field_n
=
0
;
++
seq_count
;
break
;
default:
temp
+=
buffer
[
i
];
break
;
}
}
}
while
(
input_file
);
}
return
coverage_matrix
;
}
void
sequence_processor
(
std
::
vector
<
Locus
>::
iterator
&
sequence
,
std
::
vector
<
Locus
>&
sequences
,
std
::
vector
<
Locus
>&
coverage_matrix
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
Locus
>>&
results
,
std
::
mutex
&
results_mutex
,
int
max_distance
)
{
while
(
sequence
!=
sequences
.
end
())
{
results_mutex
.
lock
();
Locus
locus
=
*
sequence
;
++
sequence
;
results_mutex
.
unlock
();
process_sequence
(
locus
,
coverage_matrix
,
results
,
results_mutex
,
max_distance
);
}
}
void
process_sequence
(
Locus
&
locus
,
std
::
vector
<
Locus
>&
coverage_matrix
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
Locus
>>&
results
,
std
::
mutex
&
results_mutex
,
int
max_distance
)
{
uint
seq_len
=
locus
.
sequence
.
size
();
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
Locus
>>
temp_results
;
const
char
*
seq
=
locus
.
sequence
.
c_str
();
EdlibAlignResult
alignment_result
;
for
(
auto
l
:
coverage_matrix
)
{
alignment_result
=
edlibAlign
(
seq
,
seq_len
,
l
.
sequence
.
c_str
(),
seq_len
,
edlibDefaultAlignConfig
());
if
(
alignment_result
.
status
==
EDLIB_STATUS_OK
and
alignment_result
.
editDistance
<
max_distance
)
{
temp_results
[
locus
.
id
].
push_back
(
l
);
}
edlibFreeAlignResult
(
alignment_result
);
}
results_mutex
.
lock
();
for
(
auto
l
:
temp_results
)
results
[
l
.
first
]
=
l
.
second
;
results_mutex
.
unlock
();
}
src/group_loci.h
0 → 100644
View file @
7729fcb7
#pragma once
#include <string>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <thread>
#include <mutex>
#include "utils.h"
#include "parameters.h"
#include "popmap_file.h"
#include "output.h"
#include "edlib/edlib.h"
// Main function implementing the analysis
void
group_loci
(
Parameters
&
parameters
);
// Load the coverage matrix in memory
std
::
vector
<
Locus
>
load_coverage_matrix
(
std
::
string
&
file_path
,
int
min_cov
,
bool
print
);
// Pick the next sequence from the sequences vector and process it
void
sequence_processor
(
std
::
vector
<
Locus
>::
iterator
&
sequence
,
std
::
vector
<
Locus
>&
sequences
,
std
::
vector
<
Locus
>&
coverage_matrix
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
Locus
>>&
results
,
std
::
mutex
&
results_mutex
,
int
max_distance
);
// Process a sequence (look for similar sequences)
void
process_sequence
(
Locus
&
locus
,
std
::
vector
<
Locus
>&
coverage_matrix
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
Locus
>>&
results
,
std
::
mutex
&
results_mutex
,
int
max_distance
);
src/output.cpp
View file @
7729fcb7
#include "output.h"
#include <iostream>
void
output_process_reads
(
std
::
string
&
output_file_path
,
std
::
vector
<
std
::
string
>&
individuals
,
std
::
unordered_map
<
std
::
string
,
std
::
unordered_map
<
std
::
string
,
uint16_t
>>&
results
)
{
/* Input:
...
...
@@ -60,3 +61,53 @@ void output_sex_distribution(std::string& output_file_path, std::unordered_map<u
i
=
0
;
}
}
void
output_group_loci
(
std
::
string
&
output_file_path
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
Locus
>>&
results
)
{
/* Input:
* - Path to an output file
* - A matrix of loci [Sequence ID: [Associated sequences]]
* Output:
* - A table with following columns:
* Locus ID | Sequence ID | Sequence Status | Sequence | Cov Ind. 1 | Cov Ind. 2 ...
*/
std
::
ofstream
output_file
;
output_file
.
open
(
output_file_path
);
std
::
string
seq_id
;
uint
locus_id
=
0
;
for
(
auto
sequence
:
results
)
{
seq_id
=
sequence
.
first
;
// First iteration to output the original sequence
for
(
auto
locus
:
sequence
.
second
)
{
if
(
locus
.
id
==
seq_id
)
{
output_file
<<
locus_id
<<
"
\t
"
<<
locus
.
id
<<
"
\t
"
<<
locus
.
sequence
<<
"
\t
"
;
if
(
seq_id
==
locus
.
id
)
output_file
<<
"Original"
<<
"
\t
"
;
else
output_file
<<
"Recovered"
<<
"
\t
"
;
for
(
uint
i
=
0
;
i
<
locus
.
coverage
.
size
();
++
i
)
{
output_file
<<
locus
.
coverage
[
i
];
if
(
i
<
locus
.
coverage
.
size
()
-
1
)
output_file
<<
"
\t
"
;
}
output_file
<<
"
\n
"
;
}
}
// Second iteration to output the other sequences (this is not optimized but this step is not intensive anyway)
for
(
auto
locus
:
sequence
.
second
)
{
if
(
locus
.
id
!=
seq_id
)
{
output_file
<<
locus_id
<<
"
\t
"
<<
locus
.
id
<<
"
\t
"
<<
locus
.
sequence
<<
"
\t
"
;
if
(
seq_id
==
locus
.
id
)
output_file
<<
"Original"
<<
"
\t
"
;
else
output_file
<<
"Recovered"
<<
"
\t
"
;
for
(
uint
i
=
0
;
i
<
locus
.
coverage
.
size
();
++
i
)
{
output_file
<<
locus
.
coverage
[
i
];
if
(
i
<
locus
.
coverage
.
size
()
-
1
)
output_file
<<
"
\t
"
;
}
output_file
<<
"
\n
"
;
}
}
++
locus_id
;
}
}
src/output.h
View file @
7729fcb7
...
...
@@ -3,10 +3,13 @@
#include <vector>
#include <unordered_map>
#include <fstream>
#include "utils.h"
// Create output file for the process reads analysis
void
output_process_reads
(
std
::
string
&
output_file_path
,
std
::
vector
<
std
::
string
>&
individuals
,
std
::
unordered_map
<
std
::
string
,
std
::
unordered_map
<
std
::
string
,
uint16_t
>>&
results
);
// Create output file for the sex_distribution analysis
void
output_sex_distribution
(
std
::
string
&
output_file_path
,
std
::
unordered_map
<
uint
,
std
::
unordered_map
<
uint
,
uint
>>&
results
);
// Create output file for the group_loci analysis
void
output_group_loci
(
std
::
string
&
output_file_path
,
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
Locus
>>&
results
);
src/parameters.h
View file @
7729fcb7
...
...
@@ -14,13 +14,16 @@ struct Parameters {
// Initialize all possible parameters to be used in the analyses
// Arguments: name, help message, flag, default value, type, internal type, value, required
// Flags : -h, -f, -d, -o, -u, -t, -c, -p, -b, -a, -m, --min-males, --min-females, --max-males, --max-females
std
::
vector
<
Parameter
>
list
{
Parameter
(
"help"
,
"Prints this message"
,
"-h"
,
"0"
,
"bool"
,
"bool"
,
""
,
false
),
Parameter
(
"input_file_path"
,
"Path to an input file"
,
"-f"
,
""
,
"string"
,
"ifile"
,
""
,
true
),
Parameter
(
"input_dir_path"
,
"Path to an input directory"
,
"-d"
,
""
,
"string"
,
"dir"
,
""
,
true
),
Parameter
(
"output_file_path"
,
"Path to an output file"
,
"-o"
,
""
,
"string"
,
"ofile"
,
""
,
true
),
Parameter
(
"output_dir_path"
,
"Path to an output directory"
,
"-u"
,
""
,
"string"
,
"dir"
,
""
,
true
),
Parameter
(
"coverage_matrix_path"
,
"Path to an coverage matrix file"
,
"-a"
,
""
,
"string"
,
"ifile"
,
""
,
true
),
Parameter
(
"n_threads"
,
"Number of threads"
,
"-t"
,
"1"
,
"int"
,
"int"
,
""
,
false
),
Parameter
(
"min_cov"
,
"Minimum coverage to consider a marker"
,
"-c"
,
"1"
,
"int"
,
"int"
,
""
,
false
),
Parameter
(
"max_distance"
,
"Maximum distance between two sequences in a locus"
,
"-m"
,
"1"
,
"int"
,
"int"
,
""
,
false
),
Parameter
(
"popmap_file_path"
,
"Path to a popmap file"
,
"-p"
,
""
,
"string"
,
"ifile"
,
""
,
true
),
Parameter
(
"barcodes_file_path"
,
"Path to a barcodes file"
,
"-b"
,
""
,
"string"
,
"ifile"
,
""
,
true
),
Parameter
(
"min_males"
,
"Minimum number of males in the subset"
,
"--min-males"
,
"0"
,
"int"
,
"int"
,
""
,
false
),
...
...
src/radsex.h
View file @
7729fcb7
...
...
@@ -7,6 +7,7 @@
#include "subset.h"
#include "frequencies.h"
#include "demultiplexing.h"
#include "group_loci.h"
class
RadSex
{
...
...
@@ -38,6 +39,10 @@ class RadSex {
// {"demultiplexing", Analysis("demultiplexing", "Demultiplexes a set of reads files",
// std::vector<std::string> {"input_file_path", "output_dir_path", "barcodes_file_path", "min_cov"},
// demultiplexing)},
{
"group_loci"
,
Analysis
(
"group_loci"
,
"Recreate polymorphic loci from a subset of coverage matrix"
,
std
::
vector
<
std
::
string
>
{
"input_file_path"
,
"coverage_matrix_path"
,
"output_file_path"
,
"max_distance"
,
"n_threads"
,
"min_cov"
},
group_loci
)},
};
// In the constructor, the type of analysis is detected and all analysis objects are initialized
...
...
src/utils.h
View file @
7729fcb7
...
...
@@ -12,6 +12,11 @@
#define DTTMFMT "%Y-%m-%d %H:%M:%S"
#define DTTMSZ 21
struct
Locus
{
std
::
string
id
;
std
::
string
sequence
;
std
::
vector
<
std
::
string
>
coverage
;
};
// Output current date and time in format specified with DMTTMFMT and DTTMSZ
char
*
print_time
(
char
*
buff
);
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment