Commit 9f4dda23 authored by eboulanger's avatar eboulanger
Browse files

revision select CDS cnon-synonymous SNP

parent 1ed5b6cc
## Diplodus
#1 vcf input
VCF='diplodus_CDS.vcf'
#2 genome fasta
GENOME_FAS='genomes/diplodus_genome_lgt6000.fasta'
#3 genome gff3 (annotation)
GFF3='annotation/sar_annotation.gff3'
#4 prefix
PREFIX='diplodus_CDS'
## Mullus
#1 vcf input
VCF='mullus/mullus_CDS.vcf'
#2 genome fasta
GENOME_FAS='genomes/mullus_genome_lgt6000.fasta'
#3 genome gff3 (annotation)
GFF3='annotation/mullus_annotation.gff3'
#4 prefix
PREFIX='mullus/mullus_CDS'
## run anvage with file specs above
anvage synonymous --vcf ${VCF} \
--genome ${GENOME_FAS} \
--annotation ${GFF3} \
--output_prefix ${PREFIX}
\ No newline at end of file
......@@ -16,4 +16,18 @@ grep -v "^#" $4"_uniprot_nonsynonymous.recode.vcf" | awk '{ print $1"\t"$2}' | w
}
nonsynonymous_chrompos diplodus_CDS.vcf genomes/diplodus_genome_lgt6000.fasta annotation/sar_annotation.gff3 diplodus_CDS diplodus_CDS_uniprot_blast_filtered.tsv
nonsynonymous_chrompos mullus_CDS.vcf genomes/mullus_genome_lgt6000.fasta annotation/mullus_annotation.gff3 mullus_CDS mullus_CDS_uniprot_blast_filtered.tsv
\ No newline at end of file
nonsynonymous_chrompos mullus_CDS.vcf genomes/mullus_genome_lgt6000.fasta annotation/mullus_annotation.gff3 mullus_CDS mullus_CDS_uniprot_blast_filtered.tsv
# to compare lists of SNP
## Diplodus
# get CDS positions
grep -v "^##" "diplodus/diplodus_CDS.vcf" | cut -f1-2 | sed '1d' > "diplodus/diplodus_CDS_pos.txt"
# get pcadapt outlier positions
grep -v "^##" "../../seaConnect--dataPrep/04-finalPrep/01-Diplodus/dip_adaptive_413.vcf" | cut -f1-2 | sed '1d' > "diplodus/diplodus_pcadapt_pos.txt"
## Mullus
# get CDS positions
grep -v "^##" "mullus/mullus_CDS.vcf" | cut -f1-2 | sed '1d' > "mullus/mullus_CDS_pos.txt"
# get pcadapt outlier positions
grep -v "^##" "../../seaConnect--dataPrep/04-finalPrep/02-Mullus/mul_adaptive_291.vcf" | cut -f1-2 | sed '1d' > "mullus/mullus_pcadapt_pos.txt"
SINGULARITY_SIMG=snpsdata_analysis.simg
singularity shell $SINGULARITY_SIMG
## select SNPs on an exon|CDS|mRNA|gene
intersect_vcf_annotation () {
#1 output prefix
#2 vcf
#3 gff3
#4 exon|CDS|mRNA|gene
#1 output prefix
#2 intput vcf
#3 gff3 genome annotation
#4 exon|CDS|mRNA|gene
bedtools intersect -wb -a $2 -b $3 | grep $4 | rev | cut -f10- | rev | uniq > $1_$4.bed
wait;
cat <(grep "^#" $2) $1_$4.bed > $1_$4.vcf
}
# diplodus sargus
# diplodus sargus (only on outlier SNPs)
for annot in `echo "exon CDS mRNA gene"`; do intersect_vcf_annotation "diplodus" "raw/dip_adaptive_413.vcf" "annotation/sar_annotation.gff3" ${annot} ; done
# mullus surmuletus
# mullus surmuletus (only on outlier SNPs)
for annot in `echo "exon CDS mRNA gene"`; do intersect_vcf_annotation "mullus" "raw/mul_adaptive_291.vcf" "annotation/mullus_annotation.gff3" ${annot} ; done
## Diplodus
# full: separate all filtered SNPs into gene, mRNA, CDS and exon
for annot in `echo "exon CDS mRNA gene"`; do intersect_vcf_annotation "diplodus/diplodus" "filtered/dip_all_filtered.vcf" "annotation/sar_annotation.gff3" ${annot} ; done
# extract non-gene SNP
# first get list postitions gene SNP
grep -v "^##" "diplodus_gene.vcf" | cut -f1-2 | sed '1d' > dip_gene_pos.txt
## remove gene SNP from total (filtered) vcf
vcftools --vcf "filtered/dip_all_filtered.vcf" --exclude-positions "dip_gene_pos.txt" --recode --recode-INFO-all --out "dip_nongene"
## Mullus
# full: separate all filtered SNPs into gene, mRNA, CDS and exon
for annot in `echo "exon CDS mRNA gene"`; do intersect_vcf_annotation "mullus/mullus" "filtered/mul_all_filtered.vcf" "annotation/mullus_annotation.gff3" ${annot} ; done
# extract non-gene SNP
# first get list postitions gene SNP
grep -v "^##" "mullus/mullus_gene.vcf" | cut -f1-2 | sed '1d' > "mullus/mullus_gene_pos.txt"
## remove gene SNP from total (filtered) vcf
vcftools --vcf "filtered/mul_all_filtered.vcf" --exclude-positions "mullus/mullus_gene_pos.txt" --recode --recode-INFO-all --out "mullus/mullus_nongene"
mv "mullus/mullus_nongene.recode.vcf" "mullus/mullus_nongene.vcf"
\ No newline at end of file
## subset snps by type (nongene, gene, mRNA, CDS...)
bash locate_coding_snps.sh
## on CDS (in this instance) subset nonynonymous/synonymous
bash generate_non_synonymous.sh
# count the # of SNPs each generated file
awk '!/#/' mullus/mullus_CDS_nonsynonymous.vcf | wc -l
awk '!/#/' diplodus/diplodus_CDS_nonsynonymous.vcf | wc -l
## seek known uniprot reference on the genome flanking sequence of non synonymous SNPs
#### get flanking region of non synonymous SNPs
i="diplodus"
anvage flank -f ${i}_CDS_nonsynonymous.vcf -g genomes/${i}_genome_lgt6000.fasta -w 200 -o ${i}_CDS_nonsynonymous;
#### blast flanking sequences against uniprot
blastx -db swissprot_nucl -query "diplodus_CDS_nonsynonymous_flanking.fasta" -task blastx -outfmt "7 delim=, qacc sacc evalue bitscore qcovus pident" -max_target_seqs 5 > diplodus_CDS_nonsynonymous_uniprot.blastout
\ No newline at end of file
......@@ -12,7 +12,8 @@ colonne_rank_query () {
done
}
## blast flanking sequences against uniprot references
## convert blast output into a nice table
for orgn in `echo diplodus mullus`;
do for typeanot in `echo CDS` ;
do blastx -db swissprot_nucl -query ${orgn}_${typeanot}_flanking.fasta -task blastx -outfmt "7 delim=, qacc sacc evalue bitscore qcovus pident" -max_target_seqs 5 > ${orgn}_${typeanot}_uniprot_blastx.out;
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment