Author: Stephen Doyle, stephen.doyle[at]sanger.ac.uk
export PATH="/nfs/users/nfs_s/sd21/lustre118_link/software/anaconda2/pkgs/augustus-3.1-0/bin:$PATH"
export PATH="/nfs/users/nfs_s/sd21/lustre118_link/software/anaconda2/pkgs/augustus-3.1-0/scripts:$PATH"
export AUGUSTUS_CONFIG_PATH="/nfs/users/nfs_s/sd21/lustre118_link/software/anaconda2/pkgs/augustus-3.1-0/config"
bsub.py --threads 20 --queue long 20 busco_tt_new "busco -i trichuris_trichiura.fa -l /nfs/users/nfs_s/sd21/lustre118_link/databases/busco/metazoa_odb9 -o new_assembly_metazoa --mode genome --long -sp caenorhabditis -f --cpu 20"
bsub.py --threads 20 --queue long 20 busco_tt_old "busco -i trichuris_trichiura.PRJEB535.WBPS12.genomic.fa -l /nfs/users/nfs_s/sd21/lustre118_link/databases/busco/metazoa_odb9 -o old_assembly_metazoa --mode genome --long -sp caenorhabditis -f --cpu 20"
conda activate py37
pip3 install liftoff --user
conda install -c bioconda minimap2
wget ftp://ftp.ebi.ac.uk/pub/databases/wormbase/parasite/releases/WBPS15/species/trichuris_trichiura/PRJEB535/trichuris_trichiura.PRJEB535.WBPS15.annotations.gff3.gz
wget ftp://ftp.ebi.ac.uk/pub/databases/wormbase/parasite/releases/WBPS15/species/trichuris_trichiura/PRJEB535/trichuris_trichiura.PRJEB535.WBPS15.genomic.fa.gz
liftoff -g trichuris_trichiura.PRJEB535.WBPS15.annotations.gff3.gz trichuris_trichiura.fa trichuris_trichiura.PRJEB535.WBPS15.genomic.fa -o newversion.gff -u unmapped_features.txt
wc -l unmapped_features.txt
#> 1199 unmapped_features.txt
awk $3="gene" '{print}' newversion.gff
#> 8451 genes
# to be cross-compatible with Apollo, which has the same genome version but with updated scaffold names (changed by Faye after I had started the analysis), had to run the following on the genome and annotation
perl rename_scaffolds.pl trichuris_trichiura.fa > trichuris_trichiura.renamed.fa
perl rename_scaffolds.pl newversion.gff > newversion.renamed.gff
# where "rename_scaffolds.pl" is:
#!/usr/bin/env perl
use strict;
use warnings;
while (<>){
s/Trichuris_trichiura/TTRE/;
s/_00_/_unplaced_/;
s/_1_/_chr1_/;
s/_2_001/_chr2/;
s/_3_/_chr3_/;
s/_[0]+/_scaffold/;
print $_;
}
# found that, becasue the original version has been annotated interpro descriptions etc, the liftover has misformatted some of the lines (in the same way that apollo dumps of the Haemonchus annotations were misformatted). To fix, ran the following:
cat newversion.renamed.gff | while read LINE; do
if [[ $LINE =~ (^TTRE*|\#) ]]; then
echo -ne "\n${LINE} ";
else
echo -n "${LINE} ";
fi;
done > newversion.renamed.v2.gff3
cat newversion.gff | while read LINE; do
if [[ $LINE =~ (^Trichuris*|\#) ]]; then
echo -ne "\n${LINE} ";
else
echo -n "${LINE} ";
fi;
done > newversion.v2.gff3
echo -e "##gff-version 3" > UPDATED_annotation.gff3
cat newversion.renamed.v2.gff3 | sort -k1,1 -k4,4n | sed '/^$/d' >> UPDATED_annotation.gff3
8451 genes
# extract protein sequences
gffread <(zcat trichuris_trichiura.PRJEB535.WBPS15.annotations.gff3.gz) -g trichuris_trichiura.PRJEB535.WBPS15.genomic.fa -y old_proteins.fa
gffread <(zcat trichuris_trichiura.PRJEB535.WBPS15.annotations.gff3.gz) -g trichuris_trichiura.PRJEB535.WBPS15.genomic.fa -x old_cds.fa
# fix line lengths
fastaq to_fasta -l0 old_proteins.fa tmp; mv tmp old_proteins.fa
fastaq to_fasta -l0 old_cds.fa tmp; mv tmp old_cds.fa
# extract protein sequences for the unmapped genes
cat unmapped_features.txt | sed 's/'gene:'//g' | while read -r GENE; do
grep -A1 ${GENE} old_proteins.fa >> unmapped_proteins.fa;
done
cat unmapped_features.txt | sed 's/'gene:'//g' | while read -r GENE; do
grep -A1 ${GENE} old_cds.fa >> unmapped_cds.fa;
done
~sd21/bash_scripts/run_exonerate_splitter trichuris_trichiura.fa unmapped_proteins.fa
cat split_exonerate*out | Exonerate_to_evm_gff3.pl - > merged_exonerate.output
rm split_exonerate*out
rm x*
rm run_split*
# load diamond
module load diamond/0.9.24--ha888412_1
diamond blastp \
--db /nfs/users/nfs_s/sd21/lustre118_link/databases/diamond/uniprot_ref_proteomes.dmnd \
--query unmapped_proteins.fa \
--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids \
--taxonnodes /nfs/users/nfs_s/sd21/lustre118_link/databases/diamond/nodes.dmp \
--tmpdir /dev/shm \
--taxonmap /nfs/users/nfs_s/sd21/lustre118_link/databases/diamond/prot.accession2taxid \
--max-target-seqs 1 > unmapped_proteins.diamond.out
diamond blastx \
--db /nfs/users/nfs_s/sd21/lustre118_link/databases/diamond/uniprot_ref_proteomes.dmnd \
--query unmapped_cds.fa \
--outfmt 6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids \
--taxonnodes /nfs/users/nfs_s/sd21/lustre118_link/databases/diamond/nodes.dmp \
--tmpdir /dev/shm \
--taxonmap /nfs/users/nfs_s/sd21/lustre118_link/databases/diamond/prot.accession2taxid \
--max-target-seqs 1 > unmapped_cds.diamond.out
cut -f4 unmapped_proteins.diamond.out | sort | uniq -c | sort -k1nr
204 not in uniprot refseq database
cut -f13 unmapped_cds.diamond.out | sort | uniq -c | sort -k1nr
# make a proteins file from annotation
gffread -y PROTEINS.fa -g trichuris_trichiura.fa liftover_annotation.gff3
# remove stop codons from protein sequences - IPS doesnt like it
sed -e 's/\.//g' PROTEINS.fa > tmp; mv tmp PROTEINS.fa
# load module
module load interproscan/5.39-77.0-W01
# run IPS
farm_interproscan -a PROTEINS.fa -o IPS.output.gff
# lift over GO terms from interproscan to GFF
extract_interproscan_go_terms -i IPS.output.gff -e liftover_annotation.gff3
# extract GO terms from mRNAs in GFF
awk '$3=="mRNA" {print $0}' OFS="\t" liftover_annotation.gff3.go.gff | grep "Ontology_term" | cut -f 9 | cut -f3,4 -d ";" | sed -e 's/;/\t/g' -e 's/Name=//g' -e 's/"//g' -e 's/Ontology_term=//g' -e 's/-.*\t/\t/g' -e 's/,/\t/g' -e 's/\..*\t/\t/g' > annotation_GO_per_gene.txt
# work through columns to split multiple go terms per gene into one term per gene, repeating the gene name if multiple GO terms present
awk '{for(i=2; i<=NF; i++) {print $1,$i}}' OFS="\t" annotation_GO_per_gene.txt > annotation_GO_per_gene_split.txt