Author: Stephen Doyle, stephen.doyle[at]sanger.ac.uk
cd /nfs/users/nfs_s/sd21/lustre118_link/trichuris_trichiura/03_MAPPING/COV_STATS
# run coverage stats
bsub.py --queue long 5 cov "~sd21/bash_scripts/run_cov_stats 100000"
where “run_cov_stats” is:
##########################################################################################
# run_cov_stats
##########################################################################################
# Usage: ~sd21/bash_scripts/run_cov_stats < window size >
module load bamtools/2.5.1--he860b03_5
WINDOW=$1
for i in *.bam
do
bamtools header -in ${i} | grep "^@SQ" | awk -F'[:\t]' '{printf $3"\t"1"\t"$5"\n"}' OFS="\t" > ${i%.bam}.chr.bed
bamtools header -in ${i} | grep "^@SQ" | awk -F'[:\t]' '{printf $3"\t"$5"\n"}' OFS="\t" > ${i%.bam}.chr.genome
bedtools makewindows -g ${i%.bam}.chr.genome -w ${WINDOW} > ${i%.bam}.${WINDOW}_window.bed
samtools bedcov -Q 20 ${i%.bam}.chr.bed ${i} | awk -F'\t' '{printf $1"\t"$2"\t"$3"\t"$4"\t"$4/($3-$2)"\n"}' OFS="\t" > ${i%.bam}.chr.cov
samtools bedcov -Q 20 ${i%.bam}.${WINDOW}_window.bed ${i} | awk -F'\t' '{printf $1"\t"$2"\t"$3"\t"$4"\t"$4/($3-$2)"\n"}' OFS="\t" > ${i%.bam}.${WINDOW}_window.cov
rm ${i%.bam}.chr.bed ${i%.bam}.${WINDOW}_window.bed ${i%.bam}.chr.genome
done
for i in *.chr.cov; do
printf "${i}\n" > ${i}.tmp | awk '{print $5}' OFS="\t" ${i} >> ${i}.tmp;
done
paste *.tmp > coverage_stats.summary
rm *.tmp
# extract mtDNA and nuclear (mean & stddev) data
for i in *trimmed.chr.cov; do
name=${i%.trimmed.chr.cov};
nuc=$(grep -v "MITO" ${i%.trimmed.chr.cov}.100000_window.cov | datamash mean 5 sstdev 5 );
mtDNA=$(grep "MITO" ${i} | cut -f5 );
echo -e "${name}\t${nuc}\t${mtDNA}";
done > nuc_mtDNA_coverage.stat
# working dir: /nfs/users/nfs_s/sd21/lustre118_link/trichuris_trichiura/03_MAPPING/COV_STATS
# load libraries
library(tidyverse)
# function to plot coverage per individual sample
plot_cov <- function(data, title){
data <- read.table(data,header=F)
plot <- ggplot(data,aes(1:nrow(data),V5,col=V1)) +
geom_point() +
labs(title=title, x="Genomic position", y="Mean coverage per 100kb window") +
theme_bw() + theme(legend.position = "none") +
ylim(0,2*mean(data$V5))
print(plot)
}
# run function, reading in a specific sample, to plot coverage
plot_cov("MN_UGA_KAB_HS_003.100000_window.cov","MN_UGA_KAB_HS_003.100000_window.cov")
ggsave("MN_UGA_KAB_HS_003.100000_window.cov.png")
# nuclear to mitochondrial DNA coverage ratio
nucmito <- read.table("nuc_mtDNA_coverage.stats",header=F)
ggplot(nucmito,aes(V1,V4)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
labs(title = "Nuclear to mitochondrial genome coverage ratio", y = "Coverage Ratio")
ggsave("nuc_mito_cov_ratio.png")
### Make some genome wide coverage plots
# load libraries
library(tidyverse)
library(ggsci)
library(stringr)
# list file names
file_names <- list.files(path = "./",pattern = ".trimmed.100000_window.cov")
# load data using file names, and make a formatted data frame
data <- purrr::map_df(file_names, function(x) {
data <- read.delim(x, header = F, sep="\t")
data <- tibble::rowid_to_column(data, "NUM")
cbind(sample_name = gsub(".trimmed.100000_window.cov","",x), data)
})
colnames(data) <- c("sample_name", "NUM", "CHROM", "START", "END", "RAW_COVERAGE", "PROPORTION_COVERAGE")
# remove mitochondrial genome and unplaced scaffolds
data <- data[data$CHROM != "Trichuris_trichiura_MITO",]
data <- data %>% filter(!str_detect(CHROM, "Trichuris_trichiura_00_"))
# annotate sex linked scaffolds
data$SEX <- str_detect(data$CHROM,"Trichuris_trichiura_1_")
# plot boxplots and distributions of pairwise Fst analyses
ggplot(data, aes(NUM, PROPORTION_COVERAGE/(median(PROPORTION_COVERAGE)), col=SEX, group = sample_name)) +
geom_point(size=0.2) +
labs( x = "Genome position" , y = "Relative coverage per 100kb window") +
theme_bw() + theme(legend.position = "none", strip.text.x = element_text(size = 6)) +
facet_wrap(~sample_name, scales = "free_y")+scale_color_manual(values=rep(c("orange","blue"),2))+ theme_classic()
ggsave("plot_relative_genomewide_coverage_allsamples.png")
ggsave("plot_relative_genomewide_coverage_allsamples.pdf",height=10, width=20, useDingbats=FALSE)
# extract mtDNA and nuclear (mean & stddev) data
for i in *trimmed.chr.cov; do
name=${i%.trimmed.chr.cov};
autosome=$(grep "Trichuris_trichiura_2\|Trichuris_trichiura_3]" ${i%.trimmed.chr.cov}.100000_window.cov | datamash mean 5 sstdev 5 );
xchromosome=$(grep "Trichuris_trichiura_1" ${i%.trimmed.chr.cov}.100000_window.cov | datamash mean 5 sstdev 5 );
echo -e "${name}\t${autosome}\t${xchromosome}";
done > autosome_to_Xchromsome_cov.stat
library(tidyverse)
cov_data <- read.table("autosome_to_Xchromsome_cov.stat")
sample_type <- read.table("single_v_pooled_sampletype.txt")
data <- left_join(cov_data,sample_type,by="V1")
colnames(data) <- c("sample_name", "autosome_cov_mean", "autosome_cov_sd", "x_cov_mean", "x_cov_sd", "sample_type" )
ggplot(data, aes(sample_name, x_cov_mean/autosome_cov_mean, col=sample_type)) +
geom_point() +
labs(title="X-to-autosomal coverage ratio", x="Sample name", y="X-to-autosomal coverage ratio") +
theme_bw() + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ylim(0.35,1.2) +
coord_flip()
ggsave("plot_x-to-autosome_ratio_sexdet.png")
ggsave("plot_x-to-autosome_ratio_sexdet.pdf",height=10, width=7, useDingbats=FALSE)