Author: Stephen Doyle, stephen.doyle[at]sanger.ac.uk
# working directory
/nfs/users/nfs_s/sd21/lustre118_link/trichuris_trichiura/05_ANALYSIS/PCA
# load libraries
library(tidyverse)
library(gdsfmt)
library(SNPRelate)
library(ggsci)
snpgdsClose(genofile)
vcf.in <- "mito_samples3x_missing0.8.recode.vcf.gz"
gds <- snpgdsVCF2GDS(vcf.in, "mtDNA_1.gds", method="biallelic.only")
genofile <- snpgdsOpen(gds)
pca <- snpgdsPCA(genofile, num.thread=2, autosome.only = F)
samples <- as.data.frame(pca$sample.id)
colnames(samples) <- "name"
metadata <- samples %>% separate(name,c("time", "country","population","host","sampleID"))
metadata <- metadata %>%
mutate(SAMPLE_AGE = ifelse(time == "AN", "Ancient",
ifelse(host == "HS", "Modern Human", "Modern Animal")))
data <-
data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1],
EV2 = pca$eigenvect[,2],
EV3 = pca$eigenvect[,3],
EV4 = pca$eigenvect[,4],
EV5 = pca$eigenvect[,5],
EV6 = pca$eigenvect[,6],
TIME = metadata$SAMPLE_AGE,
COUNTRY = metadata$country,
POPULATION = metadata$population,
HOST = metadata$host,
stringsAsFactors = FALSE)
country_colours <-
c("CHN" = "#00A087",
"CMR" = "#902F21",
"DNK" = "#3C5488",
"ESP" = "#E7EAF0",
"HND" = "#4DBBD5",
"NLD" = "#9DAAC4",
"UGA" = "#E64B35",
"LTU" = "#0F1522",
"TZA" = "#F2A59A")
plot_pca <-
ggplot(data, aes(EV1, EV2, col = COUNTRY, shape = TIME, label = COUNTRY)) +
geom_text(size=4) +
theme_bw() +
labs(title="mito_samples3x_missing0.8",
x = paste0("PC1 variance: ",round(pca$varprop[1]*100,digits=2),"%"),
y = paste0("PC2 variance: ",round(pca$varprop[2]*100,digits=2),"%"))+
scale_colour_manual(values = country_colours)
plot_pca
ggsave("plot_PCA_mito_samples3x_missing0.8.png")
# use this to label with sample names
#ggplot(tab,aes(EV1,EV2, col = COUNTRY, shape = TIME, label = COUNTRY)) + geom_point(size=4)
vcftools \
--gzvcf Trichuris_trichiura.cohort.mito_variants.final.recode.vcf.gz \
--keep mtDNA_3x_noLF.list \
--max-missing 0.8 \
--recode --recode-INFO-all \
--out mito_samples3x_missing0.8_noLF
gzip -f mito_samples3x_missing0.8_noLF*
#> After filtering, kept 48 out of 61 Individuals
#> After filtering, kept 1291 out of a possible 1888 Sites
# load libraries
library(tidyverse)
library(gdsfmt)
library(SNPRelate)
library(ggsci)
library(patchwork)
snpgdsClose(genofile)
vcf.in <- "mito_samples3x_missing0.8_noLF.recode.vcf.gz"
gds <- snpgdsVCF2GDS(vcf.in, "mtDNA.gds", method="biallelic.only")
genofile <- snpgdsOpen(gds)
pca <- snpgdsPCA(genofile, num.thread=2, autosome.only = F)
# Working space: 56 samples, 802 SNPs
samples <- as.data.frame(pca$sample.id)
colnames(samples) <- "name"
metadata <- samples %>% separate(name,c("time", "country","population","host","sampleID"))
metadata <- metadata %>%
mutate(SAMPLE_AGE = ifelse(time == "AN", "Ancient",
ifelse(host == "HS", "Modern Human", "Modern Animal")))
data <-
data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1],
EV2 = pca$eigenvect[,2],
EV3 = pca$eigenvect[,3],
EV4 = pca$eigenvect[,4],
EV5 = pca$eigenvect[,5],
EV6 = pca$eigenvect[,6],
TIME = metadata$SAMPLE_AGE,
COUNTRY = metadata$country,
POPULATION = metadata$population,
HOST = metadata$host,
stringsAsFactors = FALSE)
country_colours <-
c("CHN" = "#00A087",
"CMR" = "#902F21",
"DNK" = "#3C5488",
"ESP" = "#E7EAF0",
"HND" = "#4DBBD5",
"NLD" = "#9DAAC4",
"UGA" = "#E64B35",
"LTU" = "#0F1522",
"TZA" = "#F2A59A")
plot_pca_mito <-
ggplot(data, aes(EV1, EV2, col = COUNTRY, shape = TIME, label = COUNTRY),alpha=1) +
geom_rect(aes(xmin=-0.125, ymin=-0.01, xmax=-0.05, ymax=0.025), fill=NA, col="black", linetype="dotted", size=0.3) +
geom_point(size=4, alpha=1) +
theme_bw() +
labs(x = paste0("PC1 variance: ",round(pca$varprop[1]*100,digits=2),"%"),
y = paste0("PC2 variance: ",round(pca$varprop[2]*100,digits=2),"%")) +
scale_colour_manual(values = country_colours)
plot_pca_mito_zoom <-
ggplot(data, aes(EV1,EV2, col = COUNTRY, shape = TIME, label = paste0(TIME,"_",COUNTRY,"_",POPULATION,"_",HOST))) +
geom_point(size=4, alpha=1) +
theme_bw() +
labs(x = "PC1",
y = "PC2") +
xlim(-0.125,-0.05) + ylim(-0.01, 0.025) +
scale_colour_manual(values = country_colours)
# note: geom_rect in first plot and zoom coordinates were adjusted manually to show the cluster clearly.
plot_pca_mito + plot_pca_mito_zoom + plot_layout(ncol=2, guides = "collect")
ggsave("plot_PCA_mito_samples3x_missing0.8_noLF.png")
ggsave("plot_PCA_mito_samples3x_missing0.8_noLF.pdf", height = 5, width = 11, useDingbats=FALSE)
Figure: plot_PCA_mito_samples3x_missing0.8_noLF
# extract nuclear variants
# vcftools \
# --gzvcf Trichuris_trichiura.cohort.nuclear_variants.final.recode.vcf.gz \
# --keep nuclear_3x_animal.list \
# --max-missing 0.8 \
# --recode --recode-INFO-all \
# --out nuclear_samples3x_missing0.8
# post peer review - restricted to autosomes only, excluding sex chromosomes
vcftools \
--gzvcf Trichuris_trichiura.cohort.nuclear_variants.final.recode.vcf.gz \
--keep nuclear_3x_animal.list \
--max-missing 0.8 \
--recode --recode-INFO-all \
--bed trichuris_trichiura.autosomeLG.bed \
--out nuclear_samples3x_missing0.8
gzip -f nuclear_samples3x_missing0.8.recode.vcf
#> After filtering, kept 36 out of 61 Individuals
#> After filtering, kept 1850621 out of a possible 6933531 Sites
# load libraries
library(tidyverse)
library(gdsfmt)
library(SNPRelate)
library(patchwork)
snpgdsClose(genofile)
vcf.in <- "nuclear_samples3x_missing0.8.recode.vcf.gz"
gds<-snpgdsVCF2GDS(vcf.in, "nuclear_1.gds", method="biallelic.only")
genofile <- snpgdsOpen(gds)
pca <- snpgdsPCA(genofile, num.thread=2, autosome.only = F)
# Working space: 36 samples, 1,675,392 SNPs
samples <- as.data.frame(pca$sample.id)
colnames(samples) <- "name"
metadata <- samples %>% separate(name,c("time", "country","population","host","sampleID"))
metadata <- metadata %>%
mutate(SAMPLE_AGE = ifelse(time == "AN", "Ancient",
ifelse(host == "HS", "Modern Human", "Modern Animal")))
data <-
data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1],
EV2 = pca$eigenvect[,2],
EV3 = pca$eigenvect[,3],
EV4 = pca$eigenvect[,4],
EV5 = pca$eigenvect[,5],
EV6 = pca$eigenvect[,6],
TIME = metadata$SAMPLE_AGE,
COUNTRY = metadata$country,
POPULATION = metadata$population,
HOST = metadata$host,
stringsAsFactors = FALSE)
country_colours <-
c("CHN" = "#00A087",
"CMR" = "#902F21",
"DNK" = "#3C5488",
"ESP" = "#E7EAF0",
"HND" = "#4DBBD5",
"NLD" = "#9DAAC4",
"UGA" = "#E64B35",
"LTU" = "#0F1522",
"TZA" = "#F2A59A")
plot_pca_nuc <-
ggplot(data,aes(EV1, EV2, col = COUNTRY, shape = TIME, label = HOST)) +
geom_point(size=4) +
geom_text_repel(size=4) +
theme_bw() +
labs(title="nuclear_samples3x_missing0.8",
x = paste0("PC1 variance: ",round(pca$varprop[1]*100,digits=2),"%"),
y = paste0("PC2 variance: ",round(pca$varprop[2]*100,digits=2),"%"))+
scale_colour_manual(values = country_colours)
plot_pca_nuc
ggsave("plot_PCA_nuclear_samples3x_missing0.8.png")
ggsave("plot_PCA_nuclear_samples3x_missing0.8.pdf", height=5, width=5)
# vcftools \
# --gzvcf Trichuris_trichiura.cohort.nuclear_variants.final.recode.vcf.gz \
# --keep nuclear_3x_animalPhonly.list \
# --max-missing 0.8 \
# --recode --recode-INFO-all \
# --out nuclear_samples3x_missing0.8_animalPhonly
# post peer review - restricted to autosomes only, excluding sex chromosomes
vcftools \
--gzvcf Trichuris_trichiura.cohort.nuclear_variants.final.recode.vcf.gz \
--keep nuclear_3x_animalPhonly.list \
--max-missing 0.8 \
--recode --recode-INFO-all \
--bed trichuris_trichiura.autosomeLG.bed \
--out nuclear_samples3x_missing0.8_animalPhonly
gzip -f nuclear_samples3x_missing0.8_animalPhonly.recode.vcf
#> After filtering, kept 31 out of 61 Individuals
#> After filtering, kept 1963868 out of a possible 6933531 Sites
# load libraries
library(tidyverse)
library(gdsfmt)
library(SNPRelate)
library(ggsci)
snpgdsClose(genofile)
vcf.in <- "nuclear_samples3x_missing0.8_animalPhonly.recode.vcf.gz"
gds <- snpgdsVCF2GDS(vcf.in, "nuclear.gds", method="biallelic.only")
genofile <- snpgdsOpen(gds)
pca <- snpgdsPCA(genofile, num.thread=2, autosome.only = F)
# Working space: 31 samples, 2,544,110 SNPs
samples <- as.data.frame(pca$sample.id)
colnames(samples) <- "name"
metadata <- samples %>% separate(name,c("time", "country","population","host","sampleID"))
# fix the metadata so can get the right shapes to match the main sampling map
metadata <- metadata %>%
mutate(SAMPLE_AGE = ifelse(time == "AN", "Ancient",
ifelse(host == "HS", "Modern Human", "Modern Animal")))
data <-
data.frame(sample.id = pca$sample.id,
EV1 = pca$eigenvect[,1],
EV2 = pca$eigenvect[,2],
EV3 = pca$eigenvect[,3],
EV4 = pca$eigenvect[,4],
EV5 = pca$eigenvect[,5],
EV6 = pca$eigenvect[,6],
TIME = metadata$SAMPLE_AGE,
COUNTRY = metadata$country,
POPULATION = metadata$population,
HOST = metadata$host,
stringsAsFactors = FALSE)
country_colours <-
c("CHN" = "#00A087",
"CMR" = "#902F21",
"DNK" = "#3C5488",
"ESP" = "#E7EAF0",
"HND" = "#4DBBD5",
"NLD" = "#9DAAC4",
"UGA" = "#E64B35",
"LTU" = "#0F1522",
"TZA" = "#F2A59A")
plot_pca_nuc <-
ggplot(data,aes(EV1, EV2, col = COUNTRY, shape = TIME)) +
geom_point(size=4, alpha=1) +
theme_bw() +
labs(x = paste0("PC1 variance: ",round(pca$varprop[1]*100,digits=2),"%"),
y = paste0("PC2 variance: ",round(pca$varprop[2]*100,digits=2),"%")) +
scale_colour_manual(values = country_colours)
plot_pca_nuc_labels <-
ggplot(data,aes(EV1, EV2, col = COUNTRY, shape = TIME, label = POPULATION)) +
geom_point(size=4, alpha=1) +
geom_text_repel() +
theme_bw() +
labs(x = paste0("PC1 variance: ",round(pca$varprop[1]*100,digits=2),"%"),
y = paste0("PC2 variance: ",round(pca$varprop[2]*100,digits=2),"%")) +
scale_colour_manual(values = country_colours)
plot_pca_nuc
# plot
ggsave("plot_PCA_nuclear_samples3x_missing0.8_animalPhonly.pdf", height = 5, width = 5, useDingbats=FALSE)
ggsave("plot_PCA_nuclear_samples3x_missing0.8_animalPhonly.png")
plot_pca_mito + plot_pca_mito_zoom + plot_pca_nuc + plot_layout(ncol=3, guides = "collect")
ggsave("plot_PCA_mito_mitozoom_nuc.pdf", height = 4, width = 14, useDingbats=FALSE)
ggsave("plot_PCA_plot_PCA_mito_mitozoom_nuc.png")
Figure: plot_PCA_nuclear_samples3x_missing0.8_animalPhonly