Commit ab86697a authored by Romain Feron's avatar Romain Feron
Browse files

Switched to the new RADSex pipeline implemented in C++.

This pipeline doesn't use Stacks anymore; a replacement for Stacks was implemented (process_radtags still needs to be implemented).
The python pipeline to be used with Stacks was moved to a legacy branch
parent 5b71a432
#### Buffer implementation (can't handle FASTQ correctly)
./bin/radsex process_reads -d ./test/fastq/ -o ./test.tsv 25,72s user 1,32s system 92% cpu 29,297 total
2628535 ./test.tsv
./bin/radsex process_reads -d ./test/fasta/ -o ./test.tsv 23,75s user 2,10s system 77% cpu 33,437 total
2628991 ./test.tsv
./bin/radsex process_reads -d ./test/fastqgz/ -o ./test.tsv 44,65s user 1,52s system 84% cpu 54,520 total
2628535 ./test.tsv
./bin/radsex process_reads -d ./test/fastagz/ -o ./test.tsv 33,05s user 1,04s system 83% cpu 40,755 total
2628991 ./test.tsv
#### Using KSEQ
./bin/radsex process_reads -d ./test/fastq/ -o ./test.tsv 39,05s user 2,14s system 71% cpu 57,846 total
2631957 ./test.tsv
./bin/radsex process_reads -d ./test/fasta/ -o ./test.tsv 27,41s user 1,27s system 87% cpu 32,833 total
2631957 ./test.tsv
./bin/radsex process_reads -d ./test/fastqgz/ -o ./test.tsv 49,03s user 1,07s system 95% cpu 52,449 total
./bin/radsex process_reads -d ./test/fastagz/ -o ./test.tsv 36,66s user 0,86s system 94% cpu 39,526 total
File added
/* The MIT License
Copyright (c) 2008 Genome Research Ltd (GRL).
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/* Contact: Heng Li <lh3@sanger.ac.uk> */
/* Last Modified: 12APR2009 */
#ifndef AC_KSEQ_H
#define AC_KSEQ_H
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define KS_SEP_TAB 1 // isspace() && !' '
#define KS_SEP_MAX 1
/* Declares kstream_t: a buffered reader over an underlying stream handle of
 * type `type_t` (e.g. gzFile). `buf` holds the read buffer; `begin`/`end`
 * delimit the unread window inside `buf`; `is_eof` is set once the
 * underlying read function returns less than a full buffer. */
#define __KS_TYPE(type_t) \
	typedef struct __kstream_t { \
		char *buf; \
		int begin, end, is_eof; \
		type_t f; \
	} kstream_t;
/* True when the underlying stream hit EOF and the buffer is fully drained. */
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
/* Reset the buffer state; does NOT rewind the underlying stream handle. */
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
/* Generates ks_init()/ks_destroy(): allocate a kstream_t bound to stream
 * handle `f` with a __bufsize-byte internal buffer, and free it again.
 * NOTE(review): calloc/malloc results are not checked - OOM will crash. */
#define __KS_BASIC(type_t, __bufsize) \
	static inline kstream_t *ks_init(type_t f) \
	{ \
		kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
		ks->f = f; \
		ks->buf = (char*)malloc(__bufsize); \
		return ks; \
	} \
	static inline void ks_destroy(kstream_t *ks) \
	{ \
		if (ks) { \
			free(ks->buf); \
			free(ks); \
		} \
	}
/* Generates ks_getc(): return the next byte of the stream as an int,
 * refilling the buffer via __read when it is drained; returns -1 at end of
 * stream (a short read marks is_eof). */
#define __KS_GETC(__read, __bufsize) \
	static inline int ks_getc(kstream_t *ks) \
	{ \
		if (ks->is_eof && ks->begin >= ks->end) return -1; \
		if (ks->begin >= ks->end) { \
			ks->begin = 0; \
			ks->end = __read(ks->f, ks->buf, __bufsize); \
			if (ks->end < __bufsize) ks->is_eof = 1; \
			if (ks->end == 0) return -1; \
		} \
		return (int)ks->buf[ks->begin++]; \
	}
#ifndef KSTRING_T
#define KSTRING_T kstring_t
/* Growable string: l = used length, m = allocated capacity, s = data
 * (kept NUL-terminated by the reader functions below). */
typedef struct __kstring_t {
	size_t l, m;
	char *s;
} kstring_t;
#endif
#ifndef kroundup32
/* Round a 32-bit value up to the next power of two, in place. */
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
#endif
/* Generates ks_getuntil(): read bytes into `str` until `delimiter` is seen.
 *   delimiter > KS_SEP_MAX -> stop at that exact character;
 *   KS_SEP_SPACE           -> stop at any isspace() character;
 *   KS_SEP_TAB             -> stop at any isspace() character except ' '.
 * On return, *dret (if non-NULL) holds the delimiter actually consumed
 * (0 if the stream ended first). Returns the token length, or -1 when the
 * stream is already exhausted. `str` is always left NUL-terminated. */
#define __KS_GETUNTIL(__read, __bufsize) \
	static int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
	{ \
		if (dret) *dret = 0; \
		str->l = 0; \
		if (ks->begin >= ks->end && ks->is_eof) return -1; \
		for (;;) { \
			int i; \
			if (ks->begin >= ks->end) { \
				if (!ks->is_eof) { \
					ks->begin = 0; \
					ks->end = __read(ks->f, ks->buf, __bufsize); \
					if (ks->end < __bufsize) ks->is_eof = 1; \
					if (ks->end == 0) break; \
				} else break; \
			} \
			if (delimiter > KS_SEP_MAX) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (ks->buf[i] == delimiter) break; \
			} else if (delimiter == KS_SEP_SPACE) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i])) break; \
			} else if (delimiter == KS_SEP_TAB) { \
				for (i = ks->begin; i < ks->end; ++i) \
					if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
			} else i = 0; /* never come to here! */ \
			/* Grow str so the chunk plus a trailing NUL fits. */ \
			/* BUG FIX: was `uint(...)`, a C++-only function-style cast of a */ \
			/* non-standard type; use a portable C cast to size_t instead. */ \
			if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
				str->m = str->l + (i - ks->begin) + 1; \
				kroundup32(str->m); \
				str->s = (char*)realloc(str->s, str->m); \
			} \
			memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
			str->l = str->l + (i - ks->begin); \
			ks->begin = i + 1; \
			if (i < ks->end) { \
				if (dret) *dret = ks->buf[i]; \
				break; \
			} \
		} \
		if (str->l == 0) { /* empty token: allocate a 1-byte buffer for the NUL */ \
			str->m = 1; \
			str->s = (char*)calloc(1, 1); \
		} \
		str->s[str->l] = '\0'; \
		return str->l; \
	}
/* Instantiate the whole kstream layer for a given stream handle type and
 * read function, e.g. KSTREAM_INIT(gzFile, gzread, 4096). */
#define KSTREAM_INIT(type_t, __read, __bufsize) \
	__KS_TYPE(type_t) \
	__KS_BASIC(type_t, __bufsize) \
	__KS_GETC(__read, __bufsize) \
	__KS_GETUNTIL(__read, __bufsize)
/* Generates the kseq_t lifecycle helpers:
 *   kseq_init()    - allocate a kseq_t wrapping a freshly ks_init()'d stream;
 *   kseq_rewind()  - reset parser + buffer state (underlying handle is NOT
 *                    repositioned - the caller must do that);
 *   kseq_destroy() - free all owned strings, the stream, and the kseq_t. */
#define __KSEQ_BASIC(type_t) \
	static inline kseq_t *kseq_init(type_t fd) \
	{ \
		kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
		s->f = ks_init(fd); \
		return s; \
	} \
	static inline void kseq_rewind(kseq_t *ks) \
	{ \
		ks->last_char = 0; \
		ks->f->is_eof = ks->f->begin = ks->f->end = 0; \
	} \
	static inline void kseq_destroy(kseq_t *ks) \
	{ \
		if (!ks) return; \
		free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
		ks_destroy(ks->f); \
		free(ks); \
	}
/* Return value:
   >=0 length of the sequence (normal)
   -1 end-of-file
   -2 truncated quality string
*/
/* Generates kseq_read(): parse the next FASTA/FASTQ record from seq->f into
 * seq->name / seq->comment / seq->seq / seq->qual (FASTA records leave qual
 * empty). Buffers are reused across calls; the caller must not free them. */
#define __KSEQ_READ \
	static int kseq_read(kseq_t *seq) \
	{ \
		int c; \
		kstream_t *ks = seq->f; \
		if (seq->last_char == 0) { /* then jump to the next header line */ \
			while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \
			if (c == -1) return -1; /* end of file */ \
			seq->last_char = c; \
		} /* the first header char has been read */ \
		seq->comment.l = seq->seq.l = seq->qual.l = 0; \
		if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; \
		if (c != '\n') ks_getuntil(ks, '\n', &seq->comment, 0); \
		while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \
			if (isgraph(c)) { /* printable non-space character */ \
				if (seq->seq.l + 1 >= seq->seq.m) { /* double the memory */ \
					seq->seq.m = seq->seq.l + 2; \
					kroundup32(seq->seq.m); /* rounded to next closest 2^k */ \
					seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
				} \
				seq->seq.s[seq->seq.l++] = (char)c; \
			} \
		} \
		if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
		if (seq->seq.s == 0) { /* BUG FIX: empty sequence record - seq.s was NULL and the NUL write below dereferenced it */ \
			seq->seq.m = 4; \
			seq->seq.s = (char*)calloc(1, 4); \
		} \
		seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
		if (c != '+') return seq->seq.l; /* FASTA */ \
		if (seq->qual.m < seq->seq.m) { /* allocate enough memory */ \
			seq->qual.m = seq->seq.m; \
			seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
		} \
		while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \
		if (c == -1) return -2; /* we should not stop here */ \
		while ((c = ks_getc(ks)) != -1 && seq->qual.l < seq->seq.l) \
			if (c >= 33 && c <= 127) seq->qual.s[seq->qual.l++] = (unsigned char)c; \
		seq->qual.s[seq->qual.l] = 0; /* null terminated string */ \
		seq->last_char = 0; /* we have not come to the next header line */ \
		if (seq->seq.l != seq->qual.l) return -2; /* qual string is shorter than seq string */ \
		return seq->seq.l; \
	}
/* Declares kseq_t: one parsed record (name/comment/seq/qual) plus parser
 * state: last_char is the lookahead header character ('>' or '@', 0 if
 * none pending) and f is the underlying buffered stream. */
#define __KSEQ_TYPE(type_t) \
	typedef struct { \
		kstring_t name, comment, seq, qual; \
		int last_char; \
		kstream_t *f; \
	} kseq_t;
/* One-shot instantiation of the whole parser for a stream type, e.g.
 * KSEQ_INIT(gzFile, gzread). Uses a 4096-byte read buffer. */
#define KSEQ_INIT(type_t, __read) \
	KSTREAM_INIT(type_t, __read, 4096) \
	__KSEQ_TYPE(type_t) \
	__KSEQ_BASIC(type_t) \
	__KSEQ_READ
#endif
......@@ -10,17 +10,17 @@ BIN = $(BASEDIR)/bin
BUILD = $(BASEDIR)/build
INCLUDE = $(BASEDIR)/include
SRC = $(BASEDIR)/src
CPP = $(wildcard $(SRC)/*.cpp)
CPP = $(wildcard $(SRC)/*.cpp) $(wildcard $(SRC)/*/*.cpp)
# Target
TARGET = stacks_replacement
TARGET = radsex
# Variables
OBJS = $(addprefix $(BUILD)/, $(notdir $(CPP:.cpp=.o)))
# Rules
all: init print-OBJS $(TARGET)
all: init print-OBJS print-CPP $(TARGET)
print-% : ; @echo $* = $($*)
......@@ -31,11 +31,11 @@ $(BUILD)/%.o: $(SRC)/%.cpp
$(CC) $(CFLAGS) -I $(INCLUDE) -c -o $@ $^
clean:
rm -rf $(BUILD)/*.o
rm -rf $(BIN)/$(TARGET)
@rm -rf $(BUILD)/*.o
@rm -rf $(BIN)/$(TARGET)
init:
mkdir -p $(BUILD) $(BUILD)
mkdir -p $(BIN) $(BIN)
@mkdir -p $(BUILD) $(BUILD)
@mkdir -p $(BIN) $(BIN)
rebuild: clean $(TARGET)
# Command-line entry point of the (legacy) Python pipeline: constructing
# Parser parses the command-line arguments and dispatches the requested analysis.
from radseq_analysis.parser import Parser
Parser()
from radseq_analysis.analyse_directory import analyse_directory
import os
from .shared import Parameters
from .modules import sex_linked_haplotypes
from .file_handler import load_popmap
def analyse_directory(files_dir=None, analyses_dir=None, positions_list=None,
                      popmap_path=None, visualize=False):
    """Run the sex-linked haplotypes analysis on a Stacks output directory.

    Creates `analyses_dir` if needed, builds the run Parameters, loads the
    population map, and hands the standard Stacks batch-0 files to the
    sex_linked_haplotypes module. (`visualize` is currently unused.)
    """
    if not os.path.isdir(analyses_dir):
        os.mkdir(analyses_dir)
    parameters = Parameters(files_dir=files_dir, results_dir=analyses_dir,
                            positions_list=positions_list)
    load_popmap(popmap_path, parameters)
    haplotypes_path = os.path.join(files_dir, 'batch_0.haplotypes.tsv')
    catalog_path = os.path.join(files_dir, 'batch_0.catalog.tags.tsv.gz')
    sex_linked_haplotypes(haplotypes_path, catalog_path, parameters)
from radseq_analysis.file_handler.catalog import get_info_from_catalog
from radseq_analysis.file_handler.catalog import get_haplotypes
from radseq_analysis.file_handler.individual_files import get_individual_sequences
from radseq_analysis.file_handler.popmap import load_popmap
from radseq_analysis.file_handler.positions import load_positions_list
from radseq_analysis.file_handler.sequences import get_sequences
from radseq_analysis.file_handler.markers import get_markers
from radseq_analysis.file_handler.coverage import get_coverage
from radseq_analysis.file_handler.logs import get_individuals_order
from collections import defaultdict
from radseq_analysis.file_handler.file_open import open_all
def get_info_from_catalog(catalog_path,
                          loci_list=None,
                          consensus=True,
                          correspondance=False,
                          frequencies=False):
    '''
    Extract consensus sequences, per-individual locus ID correspondance,
    and/or haplotype frequencies from the catalog file.
    Input:
        - path to a catalog file (batch_X.catalog.tsv)
    Output (any requested combination, always in this order):
        1) Consensus sequences:
           { Catalog locus ID: consensus sequence }
        2) ID correspondance for each locus and each individual:
           { Catalog locus ID: { Individual name: individual locus ID } }
        3) Haplotypes frequencies table:
           { Number of individuals with haplotype: count }
    '''
    consensus_data = dict()
    correspondance_data = defaultdict(lambda: defaultdict())
    frequencies_data = defaultdict(int)
    catalog = open_all(catalog_path)
    catalog.readline()  # skip the header line
    for line in catalog:
        fields = line.split('\t')
        locus_id = fields[2]
        if frequencies:
            # Count each individual only once - sometimes they appear more than once ...
            individuals = {entry.split('_')[0] for entry in fields[8].split(',')}
            frequencies_data[len(individuals)] += 1
        # An empty/None loci_list means "keep every locus".
        wanted = not loci_list or locus_id in loci_list
        if wanted and consensus:
            consensus_data[locus_id] = fields[9]
        if wanted and correspondance:
            for entry in fields[8].split(','):
                parts = entry.split('_')
                correspondance_data[parts[0]][parts[1]] = locus_id
    catalog.close()
    results = []
    if consensus:
        results.append(consensus_data)
    if correspondance:
        results.append(correspondance_data)
    if frequencies:
        results.append(frequencies_data)
    if len(results) > 1:
        return tuple(results)
    if results:
        return results[0]
def get_haplotypes(parameters, sequence=False, correspondance=False, numbers=True):
    '''
    Extract per-locus haplotype information from a catalog file.
    Input:
        - path to a catalog file (batch_X.catalog.tsv)
    Output:
        - for each locus, haplotype for each individual
          { Locus ID: { sequence: sequence, individuals: { individual_id: individual_locus_id } }}
    '''
    haplotypes_data = {}
    catalog = open_all(parameters.catalog_file_path)
    catalog.readline()  # skip the header line
    for line in catalog:
        fields = line.split('\t')
        locus_id = fields[2]
        # Each entry of column 9 is "<individual_id>_<individual_locus_id>".
        pairs = [entry.split('_') for entry in fields[8].split(',')]
        record = {'n_males': 0, 'n_females': 0}
        if correspondance:
            record['individuals'] = {p[0]: p[1] for p in pairs}
        else:
            record['individuals'] = [p[0] for p in pairs]
        if numbers:
            sexes = [parameters.popmap[parameters.order[p[0]]] for p in pairs]
            record['n_males'] = sexes.count('M')
            record['n_females'] = sexes.count('F')
        if sequence:
            record['sequence'] = fields[9]
        haplotypes_data[locus_id] = record
    catalog.close()
    return haplotypes_data
import statistics
from radseq_analysis.file_handler.file_open import open_all
def get_coverage(coverage_file_path):
    '''
    Extract coverage information from a coverage file.
    Coverage file has the following structure (tab-separated):
        Barcode | Individual | Total reads | Retained reads
    Output structure:
        { Individual : Correction }
    where Correction = retained reads / mean retained reads over all individuals.
    '''
    retained_reads = {}
    coverage_file = open_all(coverage_file_path)
    try:
        for line in coverage_file:
            # BUG FIX: was line[:-1], which silently drops the last digit of
            # the retained-reads count when the final line has no newline.
            fields = line.rstrip('\n').split('\t')
            retained_reads[fields[1]] = int(fields[3])
    finally:
        coverage_file.close()  # the original never closed the file
    if not retained_reads:
        return {}  # avoid statistics.StatisticsError on an empty file
    mean_coverage = statistics.mean(retained_reads.values())
    return {individual: reads / mean_coverage
            for individual, reads in retained_reads.items()}
import gzip
def open_all(file_path, mode='rt'):
    '''
    Open a file transparently whether or not it is gzip-compressed,
    based on the '.gz' extension.

    BUG FIX: callers (e.g. get_individual_sequences) invoke
    open_all(path, 'rt'), but the original signature accepted a single
    argument and raised TypeError. `mode` now defaults to 'rt', which
    matches the previous behavior for both plain and gzipped files.

    :param file_path: path to the file to open
    :param mode: file mode, defaults to text reading ('rt')
    :return: an open file object (caller is responsible for closing it)
    '''
    if file_path.endswith('.gz'):
        return gzip.open(file_path, mode)
    return open(file_path, mode)
from collections import defaultdict
from radseq_analysis.shared.commons import *
from radseq_analysis.file_handler.file_open import open_all
def get_haplotypes(parameters, haplotypes=True, numbers=True):
    '''
    Extract haplotypes information, sorted by sex, from a haplotypes file
    Input:
        - path to a haplotypes file (batch_X.haplotypes.tsv)
    Output:
        - for each locus, haplotype for each individual
          { Locus ID: { Individual : Haplotype }}
        - for each locus, male and female count of each haplotype:
          { Locus ID: { Haplotype: { Males: N, Females: M } } }
    '''
    haplotypes_file = open_all(parameters.haplotypes_file_path)
    header = haplotypes_file.readline()
    names = header[:-1].split('\t')[2:]  # individual names start at column 3
    haplotypes_data = dict()
    haplotypes_numbers = dict()
    # Sort individual haplotypes by sex for each catalog haplotype
    for line in haplotypes_file:
        tabs = line[:-1].split('\t')
        locus_id = tabs[0]
        temp = {names[i]: seq for i, seq in enumerate(tabs[2:])}
        if haplotypes:
            haplotypes_data[locus_id] = temp
        if numbers:
            tags = defaultdict(lambda: {MALES: 0, FEMALES: 0})
            for individual, haplotype in temp.items():
                if individual in parameters.popmap.keys():
                    # BUG FIX: the original compared with `is 'M'` / `is 'F'`
                    # (identity, not equality) - it only worked by accident of
                    # CPython string interning. Use == instead.
                    if parameters.popmap[individual] == 'M':
                        tags[haplotype][MALES] += 1
                    elif parameters.popmap[individual] == 'F':
                        tags[haplotype][FEMALES] += 1
            haplotypes_numbers[locus_id] = tags
    haplotypes_file.close()  # the original never closed the file
    if haplotypes and numbers:
        return haplotypes_data, haplotypes_numbers
    if haplotypes:
        return haplotypes_data
    if numbers:
        return haplotypes_numbers
from collections import defaultdict
from radseq_analysis.file_handler.file_open import open_all
def get_individual_sequences(individual_file_path, loci_to_extract=None):
    '''
    Count, per catalog locus, the sequences of one individual's tags file.

    :param individual_file_path: path to an individual's Stacks tags file
    :param loci_to_extract: optional { individual number: { individual locus
        ID: catalog locus ID } } correspondance table; when None, every locus
        is counted under its own individual locus ID.
    :return: defaultdict { catalog locus ID: number of sequences }
    '''
    # Read the individual's sample number from the second line, then restart
    # from the top by reopening (the file may be gzipped, so seek() on the
    # decompressed stream is unreliable).
    # BUG FIX: open_all takes a single argument - the original passed a
    # second 'rt' argument, which raised TypeError.
    individual_file = open_all(individual_file_path)
    individual_file.readline()
    individual_number = individual_file.readline().split('\t')[1]
    individual_file.close()
    individual_file = open_all(individual_file_path)
    individual_file.readline()  # skip the header line
    individual_data = defaultdict(int)
    for line in individual_file:
        tabs = line.split('\t')
        individual_locus_id = tabs[2]
        if loci_to_extract and individual_locus_id not in loci_to_extract[individual_number].keys():
            continue
        # BUG FIX: when loci_to_extract is None the original still subscripted
        # it (TypeError); fall back to the individual locus ID as the key.
        if loci_to_extract:
            catalog_id = loci_to_extract[individual_number][individual_locus_id]
        else:
            catalog_id = individual_locus_id
        sequence_name = tabs[6]
        if tabs[7] != '':
            sequence_name += '_' + tabs[7]
        # 'model' and 'consensus' rows are metadata, not real sequences.
        if sequence_name not in ('model', 'consensus'):
            individual_data[catalog_id] += 1
    individual_file.close()  # the original never closed the file
    return individual_data
from radseq_analysis.file_handler.file_open import open_all
def get_individuals_order(log_path):
    '''
    Parse the denovo_map.log to find the order in which individuals were processed,
    which corresponds to the order of the individuals in the catalog.

    :param log_path: path to a denovo_map.log file
    :return: { str(order, 1-based) : individual name }
    '''
    logs = open_all(log_path)
    individuals_order = {}
    section_header = 'Depths of Coverage for Processed Samples:\n'
    line = logs.readline()
    # Skip ahead to the coverage section; readline() returns '' (falsy) at EOF.
    while line != section_header and line:
        line = logs.readline()
    if line != section_header:
        print('Error: could not detect individuals order in the denovo_map log file.\n')
        print('Possible explanations: \n')
        print('\t- you are using an old version of Stacks\n')
        print('\t- Stacks did not run properly and denovo_map.log is incomplete\n')
        print('\t- a newer version of Stacks has changed the log file structure\n')
    order = 1
    while True:
        line = logs.readline()
        # BUG FIX: also stop at EOF ('') - the original only checked for a
        # blank line and looped forever on a truncated log file.
        if line == '\n' or not line:
            break
        individuals_order[str(order)] = line.split(':')[0]
        order += 1
    logs.close()  # the original never closed the file
    return individuals_order
from radseq_analysis.file_handler.file_open import open_all
def get_markers(markers_file_path):
'''