Learning Objectives

  • Define population structure
  • Go through examples of manifestation of population strucutre
  • Learn methods to correct for population structure
    • genomic control
    • principal component analysis

Material for the class

  • Download notes on population structure from here

Prerequisites for running the analysis below

  • plink installed in ~/bin/
  • Rstudio is installed
  • tidyverse package installed
  • devtools package installed

Test Hardy Weinberg equilibrium with population structure

What’s the population composition?

popinfo = read_tsv(paste0(work.dir,"relationships_w_pops_051208.txt"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   FID = col_character(),
##   IID = col_character(),
##   dad = col_character(),
##   mom = col_character(),
##   sex = col_double(),
##   pheno = col_double(),
##   population = col_character()
## )
popinfo %>% count(population)
## # A tibble: 11 x 2
##    population     n
##  * <chr>      <int>
##  1 ASW           90
##  2 CEU          180
##  3 CHB           90
##  4 CHD          100
##  5 GIH          100
##  6 JPT           91
##  7 LWK          100
##  8 MEX           90
##  9 MKK          180
## 10 TSI          100
## 11 YRI          180
samdata = read_tsv(paste0(work.dir,"phase3_corrected.psam"),guess_max = 2500) 
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   `#IID` = col_character(),
##   PAT = col_character(),
##   MAT = col_character(),
##   SEX = col_double(),
##   SuperPop = col_character(),
##   Population = col_character()
## )
superpop = samdata %>% select(SuperPop,Population) %>% unique()
superpop = rbind(superpop, data.frame(SuperPop=c("EAS","HIS","AFR"),Population=c("CHD","MEX","MKK")))

Effect of population structure in Hardy Weinberg equilibrium

## what happens if we calculate HWE with this mixed population?
if(!file.exists(glue::glue("{work.dir}output/allhwe.hwe")))
system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --hardy --out {work.dir}output/allhwe"))
allhwe = read.table(glue::glue("{work.dir}output/allhwe.hwe"),header=TRUE,as.is=TRUE)
hist(allhwe$P)

qqunif(allhwe$P,main='HWE HapMap3 All Pop')
## Warning in qqunif(allhwe$P, main = "HWE HapMap3 All Pop"): thresholding p to
## 1e-30

What if we calculate with single population?

pop = "CHB"
pop = "CEU"
pop = "YRI"
for(pop in c("CHB","CEU","YRI"))
{
  ## what if we calculate with single population?
  popinfo %>% filter(population==pop) %>%
    write_tsv(path=glue::glue("{work.dir}{pop}.fam") )
  if(!file.exists(glue::glue("{work.dir}output/hwe-{pop}.hwe")))
  system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --hardy --keep {work.dir}{pop}.fam --out {work.dir}output/hwe-{pop}"))
  pophwe = read.table(glue::glue("{work.dir}output/hwe-{pop}.hwe"),header=TRUE,as.is=TRUE)
  hist(pophwe$P,main=glue::glue("HWE {pop} and founders only"))
  qqunif(pophwe$P,main=glue::glue("HWE {pop} and founders only"))
}
## Warning: The `path` argument of `write_tsv()` is deprecated as of readr 1.4.0.
## Please use the `file` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

Effect of population stratification on GWAS

GWAS on a growth phenotype in HapMap samples

## read igrowth
igrowth = read_tsv("https://raw.githubusercontent.com/hakyimlab/igrowth/master/rawgrowth.txt")
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   IID = col_character(),
##   sex = col_double(),
##   pop = col_character(),
##   experim = col_double(),
##   meas.by = col_double(),
##   serum = col_character(),
##   growth = col_double()
## )
## add FID to igrowth file
igrowth = popinfo %>% select(-pheno) %>% inner_join(igrowth %>% select(IID,growth), by=c("IID"="IID"))
write_tsv(igrowth,path=glue::glue("{work.dir}igrowth.pheno"))
igrowth %>% ggplot(aes(population,growth)) + geom_violin(aes(fill=population)) + geom_boxplot(width=0.2,col='black',fill='gray',alpha=.8) + theme_bw(base_size = 15)
## Warning: Removed 130 rows containing non-finite values (stat_ydensity).
## Warning: Removed 130 rows containing non-finite values (stat_boxplot).

summary( lm(growth~population,data=igrowth) )
## 
## Call:
## lm(formula = growth ~ population, data = igrowth)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -58821 -18093  -2242  15896  98760 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    73080.8      938.2  77.894  < 2e-16 ***
## populationCEU  -2190.1     1175.4  -1.863   0.0625 .  
## populationCHB   9053.1     2043.9   4.429 9.73e-06 ***
## populationJPT   3476.8     2034.8   1.709   0.0876 .  
## populationYRI  -7985.2     1137.2  -7.022 2.61e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24160 on 3591 degrees of freedom
##   (130 observations deleted due to missingness)
## Multiple R-squared:  0.0345, Adjusted R-squared:  0.03342 
## F-statistic: 32.08 on 4 and 3591 DF,  p-value: < 2.2e-16
## run plink linear regression only if it hasn't been run already
if(!file.exists(glue::glue("{work.dir}output/igrowth.assoc.linear")))
system(glue::glue("~/bin/plink --bfile {work.dir}hapmapch22 --linear --pheno {work.dir}igrowth.pheno --pheno-name growth --maf 0.05 --out {work.dir}output/igrowth"))
igrowth.assoc = read.table(glue::glue("{work.dir}output/igrowth.assoc.linear"),header=T,as.is=T)
hist(igrowth.assoc$P)

qqunif(igrowth.assoc$P)

## install.packages("qqman")
library(qqman)
## 
## For example usage please run: vignette('qqman')
## 
## Citation appreciated but not required:
## Turner, S.D. qqman: an R package for visualizing GWAS results using Q-Q and manhattan plots. biorXiv DOI: 10.1101/005165 (2014).
## 
manhattan(igrowth.assoc, chr="CHR", bp="BP", snp="SNP", p="P" )

Reuse

Text and figures are licensed under Creative Commons Attribution CC BY 4.0. The source code is licensed under MIT.

Suggest changes

If you find any mistakes (including typos) or want to suggest changes, please feel free to edit the source file of this page on Github and create a pull request.

Citation

For attribution, please cite this work as

Haky Im (2021). Lecture 6 - Population Structure. HGEN 471 Class Notes. /post/2021/01/31/lecture-6-population-structure/

BibTeX citation

@misc{
  title = "Lecture 6 - Population Structure",
  author = "Haky Im",
  year = "2021",
  journal = "HGEN 471 Class Notes",
  note = "/post/2021/01/31/lecture-6-population-structure/"
}