Commit 207017a8 authored by peguerin's avatar peguerin
Browse files

cleaning

parent 25961a2b
from Bio import SeqIO
import sys
import numpy
input = open(sys.argv[1], 'rU')
seqlength=[]
for record in SeqIO.parse(input, "fasta"):
bp = len(record.seq)
seqlength.append(bp)
seqlength=sorted(seqlength)
#print "max sequence length = {0}".format(seqlength[-1])
unique =[]
for entry in seqlength:
if not entry in unique:
unique.append(entry)
n50_u=[]
for entry in unique:
multiplier = seqlength.count(entry) * entry
for i in range(multiplier):
n50_u.append(entry)
n_50=[]
for entry in n50_u:
if n50_u > 150:
index = len(n50)/2
avg = []
if index % 2 == 0:
first = n50[index -1]
second = n50[index]
avg.append(first)
avg.append(second)
n50= numpy.mean(avg)
print "N50 = {0}".format(n50)
else:
print "N50 = {0}".format(n50[index-1])
input.close()
from Bio import SeqIO
import sys
import numpy
input = open(sys.argv[1], 'rU')
seqlength=[]
for record in SeqIO.parse(input, "fasta"):
bp = len(record.seq)
seqlength.append(bp)
seqlength=sorted(seqlength)
#print "max sequence length = {0}".format(seqlength[-1])
unique =[]
for entry in seqlength:
if not entry in unique:
unique.append(entry)
n50=[]
for entry in unique:
multiplier = seqlength.count(entry) * entry
for i in range(multiplier):
n50.append(entry)
index = len(n50)/2
avg = []
if index % 2 == 0:
first = n50[index -1]
second = n50[index]
avg.append(first)
avg.append(second)
n50= numpy.mean(avg)
print "N50 = {0}".format(n50)
else:
print "N50 = {0}".format(n50[index-1])
input.close()
#basic stats on genome assembly fasta file
INPUT=$1
#calculer le N50 et le plus long contig
python calculateN50.py $INPUT
#avoir la liste des tailles des contigs
python getseqlength.py $INPUT > monfichierdesseqslength
#nombre de scaffolds
echo "nombre de scaffolds"
grep -c '>' $INPUT
#taille totale en nucleotide
echo "taille totale en nucleotide"
grep -v '>' | wc -c
genome assembly Diplodus sargus
====================================
CONTIGS
n50=1101
max_length = 41733
min_length = 127
total number of nucleotides = 1228461534
total number of contigs = 2408078
SCAFFOLDS
n50=2559112
max_length = 22745800
min_length = 100
total number of nucleotides = 862322655
total number of scaffolds = 221585
tout les scaffolds dont la taille est égale ou infèrieure à 127 sont éliminés :
SCAFFOLDS (L > 127)
====================================
CONTIGS
N50= 1101
number of upper N50 contigs = 228679
number of below N50 contigs = 2179399
minimum sequence length = 127
maximum sequence length = 41733
total number of nucleotides = 1305742195
total number of contigs = 2408078
====================================
SCAFFOLDS
N50=2559112
number of upper N50 contigs = 70
number of below N50 contigs = 221515
minimum sequence length = 100
maximum sequence length = 22692753
total number of nucleotides = 876400605
total number of scaffolds = 221585
====================================
SCAFFOLDS_GAPCLOSE
N50 = 2492796
number of upper N50 scaffolds = 71
number of below N50 scaffolds = 221514
minimum sequence length = 100
maximum sequence length = 22692753
total number of nucleotides = 876400605
total number of scaffolds = 221585
N50 = 2492796
minimum sequence length = 100
maximum sequence length = 22692753
total number of scaffolds = 221585
number of upper N50 scaffolds = 71
number of below N50 scaffolds = 221514
library(ggplot2)
t=read.table("seqlength.txt")
st = sort(t$V1)
#st=sort(t$V1)[which(t$V1 > 127)]
#calcul du n50
st_cumsum = cumsum(st)
half_total_size = max(st_cumsum)/2
id_n50 = min(which(st_cumsum > half_total_size))
n50=st[id_n50]
#N50
n50
#max_length
max(st)st50 = st[which(st > n50 )]
rst50 = round(st50/1000000,1)
gg = data.frame(pos=1:length(st50),taille=rst50)
p <- ggplot(gg, aes(x=pos, y=taille))+
geom_bar(stat="identity", position=position_dodge())+
coord_flip()+
xlab("Scaffold")+
ylab("Scaffold Size (Mpb)")+
ggtitle("upper N50 contigs (filtered length > 10Kpb")+
theme_minimal()
p
#min_length
min(st)
#total number of nucleotides
max(st_cumsum)
#total number of contigs/scaffolds
length(st)
#distribution des scaffolds au dessus du n50
st50 = st[which(st > n50 )]
rst50 = round(st50/1000000,1)
gg = data.frame(pos=1:length(st50),taille=rst50)
p <- ggplot(gg, aes(x=pos, y=taille))+
geom_bar(stat="identity", position=position_dodge())+
coord_flip()+
xlab("Scaffold")+
ylab("Scaffold Size (Mpb)")+
ggtitle("upper N50 contigs (filtered length > 10Kpb)")+
theme_minimal()
p
ggsave("upperN50contigs_filtered10k.pdf",plot=last_plot())
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment