# System libraries and tools used by the rest of this walkthrough (PLINK 1.9, unzip, ...).
# The development package is named "zlib1g-dev"; the previous spelling "zlib1g.dev"
# only worked through apt-get's regular-expression fallback for unknown names.
sudo apt-get install zlib1g zlib1g-dev libblas3 libgfortran5 liblapack3 libquadmath0 plink1.9 unzip
# Helper packages for managing APT repositories and keys, then base R itself.
sudo apt install dirmngr gnupg apt-transport-https ca-certificates software-properties-common
sudo apt install r-base
1. 獲取或者生成基礎數據(base data)
Polygenic Risk Score (PRS) 分析的第一步是獲得基礎數據(即GWAS統計分析結果),其中應該包含與性狀相關的所有等位基因信息及其對應的效應貢獻。
CHR: The chromosome in which the SNP resides---位于第幾條染色體
BP: Chromosomal co-ordinate of the SNP---位于染色體上的物理位置,可能出現的形式:POSITION
SNP: SNP ID, usually in the form of rs-ID---SNP編號,通常是rsID
A1: The effect allele of the SNP---效應等位基因,可能出現的形式:REF
A2: The non-effect allele of the SNP---非效應等位基因,可能出現的形式:ALT
N: Number of samples used to obtain the effect size estimate---用于評估效應量的樣本數量
SE: The standard error (SE) of the effect size estimate---所評估效應量的標準誤差
P: The P-value of association between the SNP genotypes and the base phenotype---所評估表型和基因型的相關性p值,可能出現的形式:p_value
OR: The effect size estimate of the SNP, if the outcome is binary/case-control. If the outcome is continuous or treated as continuous then this will usually be BETA---所評基因型的效應量,Odds Ratio 或 Effect Size (BETA)
INFO: The imputation information score---插補信息得分,可能出現的形式:INFO.plink
MAF: The minor allele frequency (MAF) of the SNP---次等位基因頻率,可能出現的形式:ALT_FREQ、FRQ
我是從網上下載了別人的GWAS結果,是個TXT文件,目的是要將列的順序從下面第一行所示換成第二行所示:
Chromosome Position RSID REF ALT ALT_FREQ ALT_FREQ_1KGASN RSQ INFO HWE_P Pvalue Qvalue N NullLogLike AltLogLike SNPWeight SNPWeightSE OddsRatio WaldStat NullLogDelta NullGeneticVar NullResidualVar NullBias
CHR BP RSID A1 A2 N SE P OR info MAF
1.1 數據排序
首先把TXT轉換成CSV,再用PYTHON提取并重新排序,最后再把CSV轉換成TXT
TXT轉CSV
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 13 11:25:52 2022
@author: Bohan
"""
import pandas as pd

# Convert the tab-separated GWAS summary statistics into a comma-separated copy.
SOURCE_TXT = "MDD.10640samples.dosages.hwe6info9maf5.logreg.2017.txt"
TARGET_CSV = "MDD.10640samples.dosages.hwe6info9maf5.logreg.2017.csv"

summary_stats = pd.read_csv(SOURCE_TXT, sep="\t", low_memory=False)
# index=False: do not write the pandas row index as an extra first column.
summary_stats.to_csv(TARGET_CSV, encoding="utf-8", index=False)
CSV提取排序
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 13 10:49:59 2022
@author: Bohan
"""
import csv

# The original file has 23 columns. KEEP_ORDER lists the zero-based positions
# of the columns to keep, in output order. With the header shown in this
# document they are:
#   Chromosome, Position, RSID, REF, ALT, N, SNPWeightSE, Pvalue, OddsRatio,
#   INFO, ALT_FREQ
# i.e. the target layout CHR BP RSID A1 A2 N SE P OR info MAF.
# NOTE(review): the header row is copied through unchanged, so the output still
# carries the original column names - rename them separately if needed.
EXPECTED_COLUMNS = 23
KEEP_ORDER = (0, 1, 2, 3, 4, 12, 16, 10, 17, 8, 5)


def select_columns(row):
    """Return the 11 kept fields of one 23-field row, in output order.

    Raises:
        ValueError: if the row does not contain exactly 23 fields (the same
            exception type the previous tuple-unpacking version raised).
    """
    if len(row) != EXPECTED_COLUMNS:
        raise ValueError(
            "expected %d fields, got %d: %r" % (EXPECTED_COLUMNS, len(row), row)
        )
    return [row[i] for i in KEEP_ORDER]


if __name__ == "__main__":
    # newline='' on both files is what the csv module expects.
    with open("MD_GWAS_SNPresults-original.csv", newline='') as src, \
            open("originalarranged.csv", "w", newline='') as dst:
        csv.writer(dst).writerows(select_columns(row) for row in csv.reader(src))
CSV轉TXT
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 13 12:07:58 2022
@author: Bohan
"""
import csv


def to_tsv_line(fields):
    """Join the fields of one CSV row with tabs and end the line with a newline.

    Unlike writing a tab after every field, this leaves no trailing tab (and
    therefore no empty extra column) at the end of the line.
    """
    return '\t'.join(fields) + '\n'


if __name__ == "__main__":
    # Both files are managed by the with-statement so they are closed even if
    # reading or writing fails part-way through.
    with open('MD_GWAS_SNPresults-original.arranged.csv', 'r', newline='') as src, \
            open('MD_GWAS_SNPresults-original.arranged', 'w') as dst:
        dst.writelines(to_tsv_line(row) for row in csv.reader(src))
處理完成后的base數據看起來是這樣:
CHR BP RSID A1 A2 N SE P OR info MAF
10 90127 rs185642176 C T 10640 0.004802742809305182 0.4704864679671288 1.0123502866538827 0.905519504189 0.046
10 90164 rs141504207 C G 10640 0.004802742809305182 0.4704864679671288 1.0123502866538827 0.905515996565 0.046
10 94026 rs10904032 G A 10640 0.004807929547222263 0.9907936601472477 1.000113527128114 0.946752744368 0.145
1.2 base數據質檢
解壓讀取MD_GWAS_SNPresults-original.2015.arranged.txt.gz
輸出文件頭 (NR==1)
只輸出同時滿足以下兩個條件的數據行:MAF大于0.01(第11列是MAF)
并且INFO大于0.8(第10列是INFO)
壓縮結果到MD2015.gz
# Keep the header line plus every row whose INFO (column 10) exceeds 0.8
# and whose MAF (column 11) exceeds 0.01, then re-compress the result.
gunzip -c MD_GWAS_SNPresults-original.2015.arranged.txt.gz \
  | awk 'NR == 1 || ($10 > 0.8 && $11 > 0.01)' \
  | gzip > MD2015.gz
根據第3列(SNP編號)的數據去重(只保留每個編號第一次出現的行),獲得MD2015.nodup.gz(No-duplicate)
# Print a row only the first time its column-3 value is seen, so later
# duplicates of the same SNP ID are dropped.
gunzip -c MD2015.gz \
  | awk '!seen[$3]++' \
  | gzip - > MD2015.nodup.gz
根據第四、第五列的數值去除Ambiguous SNPs(即A/T、T/A、G/C、C/G這四種等位基因組合)
# Drop rows whose allele pair (columns 4 and 5) is A/T, T/A, G/C or C/G.
# Each line below is the negation of one excluded pair.
gunzip -c MD2015.nodup.gz \
  | awk '($4 != "A" || $5 != "T") &&
         ($4 != "T" || $5 != "A") &&
         ($4 != "G" || $5 != "C") &&
         ($4 != "C" || $5 != "G")' \
  | gzip > MD2015.QC.gz
2. 目標數據(target data)的質檢QC
本案例用的是illumina ASA v1 SNP芯片檢出的原始文件(*.idat文件)
直接去下載illumina的genomestudio軟件
https://files.softwaredownloads.illumina.com/5831d9df-95cb-4427-a7c0-499fe871e1d5/genomestudio-software-v2-0-5-0-installer.zip
2.1 創建分析項目
要用的還有Infinium Asian Screening Array v1.0 Manifest File (BPM Format - GRCh37)
Infinium Asian Screening Array v1.0 Cluster File
2.2 評估參照數據
2.3 評估樣品和SNP
具體理論可以參考這篇文章:GSA/ASA芯片質控 - 簡書 (jianshu.com)
樣品評估
Call Rate(檢出率) 樣本檢出率:是指對于某個樣本而言,通過檢測并成功判型的SNP與所有檢測的SNP的比值,通常標準在90%或以上。
LogR Dev用于評估是否有樣品污染
SNP評估
Call Frequency(檢出頻率):用于衡量SNP檢出的樣品覆蓋程度
GenTrain Score:由illumina自己的算法計算的得分
樣品評估
Call Rate太低的不要
SNP評估
基本流程就是:創建樣品,添加manifest信息,參考基因組按照GWAS對應的版本選擇,然后利用自帶插件plink-input-report-plugin-v2-1-4導出.ped和.map文件
# Step 1: convert the exported text fileset "new" into a binary fileset "new".
plink1.9 --file new --make-bed --out new

# Step 2: apply the MAF / HWE / missingness filters and record the surviving
# SNPs (new.QC.snplist) and samples (new.QC.fam).
plink1.9 \
  --bfile new \
  --maf 0.01 \
  --hwe 1e-6 \
  --geno 0.02 \
  --mind 0.02 \
  --write-snplist \
  --make-just-fam \
  --out new.QC

# Step 3: LD-prune the QC-passed SNPs (window 200, step 50, r2 0.25).
plink1.9 \
  --bfile new \
  --keep new.QC.fam \
  --extract new.QC.snplist \
  --indep-pairwise 200 50 0.25 \
  --out new.QC

# Step 4: heterozygosity report on the pruned SNPs (output prefix new.QC).
plink1.9 \
  --bfile new \
  --keep new.QC.fam \
  --extract new.QC.prune.in \
  --het \
  --out new.QC
sudo R
install.packages("data.table")
library(data.table)

# Keep the samples whose F coefficient lies within three standard deviations
# of the mean, and write their FID/IID to new.valid.sample.
dat <- fread("new.QC.het")
f.upper <- mean(dat[["F"]]) + 3 * sd(dat[["F"]])
f.lower <- mean(dat[["F"]]) - 3 * sd(dat[["F"]])
valid <- dat[F >= f.lower & F <= f.upper]
fwrite(valid[, .(FID, IID)], "new.valid.sample", sep = "\t")
install.packages("magrittr")
install.packages("R.utils")
library(magrittr)

# Target variant table: name the six columns and upper-case both alleles.
bim <- fread("new.bim")
setnames(bim, colnames(bim), c("CHR", "SNP", "CM", "BP", "B.A1", "B.A2"))
bim[, `:=`(B.A1 = toupper(B.A1), B.A2 = toupper(B.A2))]

# Base (GWAS) summary statistics after QC, alleles upper-cased as well.
MD <- fread("MD2015.QC.gz")
MD[, `:=`(A1 = toupper(A1), A2 = toupper(A2))]

# SNPs that passed target QC (single unnamed column, read as V1).
qc <- fread("new.QC.snplist", header = FALSE)

# Join target and base on SNP/CHR/BP and keep only QC-passed SNPs.
# NOTE(review): this requires the base file to have a column literally named
# "SNP"; the example header earlier in this document shows "RSID" - confirm.
info <- merge(bim, MD, by = c("SNP", "CHR", "BP"))
info <- info[SNP %in% qc$V1]
# Return the complementary base of a single upper-case nucleotide string
# ("A" <-> "T", "C" <-> "G"); any other input yields NA.
# x: a length-one character string.
complement <- function(x) {
  switch(x,
    "A" = "T",
    "C" = "G",
    "T" = "A",
    "G" = "C",
    NA  # default branch: not one of A/C/G/T
  )
}
# Reconcile target alleles (B.A1/B.A2, from bim) with base alleles (A1/A2, from
# the GWAS table) for the SNPs in `info`. `bim` is updated in place by `:=`,
# so the statements below must run in this order.

# 1. SNPs whose alleles already agree between base and target.
info.match <- info[A1 == B.A1 & A2 == B.A2, SNP]
# 2. SNPs that agree after complementing both target alleles;
#    store the complemented alleles in bim.
com.snps <- info[sapply(B.A1, complement) == A1 &
sapply(B.A2, complement) == A2, SNP]
bim[SNP %in% com.snps, c("B.A1", "B.A2") :=
list(sapply(B.A1, complement),
sapply(B.A2, complement))]
# 3. SNPs whose two alleles are swapped between base and target;
#    swap them in bim.
recode.snps <- info[B.A1==A2 & B.A2==A1, SNP]
bim[SNP %in% recode.snps, c("B.A1", "B.A2") :=
list(B.A2, B.A1)]
# 4. SNPs that agree after complementing AND swapping; apply both in bim.
com.recode <- info[sapply(B.A1, complement) == A2 &
sapply(B.A2, complement) == A1, SNP]
bim[SNP %in% com.recode, c("B.A1", "B.A2") :=
list(sapply(B.A2, complement),
sapply(B.A1, complement))]
# Write the (possibly updated) SNP / B.A1 pairs, tab-separated, no header.
# NOTE(review): the "EUR.*" file names do not follow the "new.*" prefix used
# elsewhere here, and no later command visible in this document reads them - confirm.
fwrite(bim[,c("SNP", "B.A1")], "EUR.a1", col.names=F, sep="\t")
# Every SNP in bim that fell into none of the four categories above.
mismatch <- bim[!(SNP %in% info.match |
SNP %in% com.snps |
SNP %in% recode.snps |
SNP %in% com.recode), SNP]
write.table(mismatch, "EUR.mismatch", quote=F, row.names=F, col.names=F)
# Leave the R session.
q()
# Sex check on the pruned SNPs, restricted to the samples listed in
# new.valid.sample; output prefix new.QC.
plink1.9 \
  --bfile new \
  --keep new.valid.sample \
  --extract new.QC.prune.in \
  --check-sex \
  --out new.QC
# Run inside a fresh R session: the previous session was closed with q(),
# so data.table has to be loaded again before fread()/fwrite() can be used.
library(data.table)
# Samples that passed the heterozygosity filter.
valid <- fread("new.valid.sample")
# Sex-check rows restricted to those samples (matched on FID).
dat <- fread("new.QC.sexcheck")[FID %in% valid$FID]
# Keep only rows whose STATUS is "OK" and write their FID/IID.
fwrite(dat[STATUS == "OK", c("FID", "IID")], "new.QC.valid", sep = "\t")
q() # exit R
# Relatedness filter (cutoff 0.125) on the pruned SNPs among sex-check-passed
# samples; the retained sample IDs are read back below as new.QC.rel.id.
plink1.9 \
  --bfile new \
  --keep new.QC.valid \
  --extract new.QC.prune.in \
  --rel-cutoff 0.125 \
  --out new.QC

# Final QC'ed binary fileset: retained samples x QC-passed SNPs.
plink1.9 \
  --bfile new \
  --keep new.QC.rel.id \
  --extract new.QC.snplist \
  --make-bed \
  --out new.QC
-----------------------------------------------------------------------------
library(data.table)
# Append BETA = log(OR) to the QC'ed base data and save it tab-separated.
dat <- fread("MD2015.QC.gz")
dat[, BETA := log(OR)]
fwrite(dat, "new.QC.Transformed", sep = "\t")
q() # exit R
# Clump the base SNPs using LD from the target data (output prefix: new).
plink1.9 \
  --bfile new.QC \
  --clump new.QC.Transformed \
  --clump-snp-field SNP \
  --clump-field P \
  --clump-p1 1 \
  --clump-r2 0.1 \
  --clump-kb 250 \
  --out new

# Column 3 of new.clumped, header line skipped -> list of SNPs to score.
awk 'NR != 1 { print $3 }' new.clumped > new.valid.snp
# Columns 3 and 8 of the transformed base file (SNP ID and P-value).
awk '{ print $3, $8 }' new.QC.Transformed > new.pvalue

# One line per threshold: <label> <lower bound> <upper bound>.
cat > range_list <<'EOF'
0.001 0 0.001
0.05 0 0.05
0.1 0 0.1
0.2 0 0.2
0.3 0 0.3
0.4 0 0.4
0.5 0 0.5
EOF

# Score using columns 3 (SNP), 4 (allele) and 12 (BETA) of the transformed
# file, once per P-value range, restricted to the clumped SNPs.
plink1.9 \
  --bfile new.QC \
  --score new.QC.Transformed 3 4 12 header \
  --q-score-range range_list new.pvalue \
  --extract new.valid.snp \
  --out new
# First, we need to perform pruning
plink1.9 \
  --bfile new.QC \
  --indep-pairwise 200 50 0.25 \
  --out new
# Then we calculate the first 6 PCs on the pruned SNPs
plink1.9 \
  --bfile new.QC \
  --extract new.prune.in \
  --pca 6 \
  --out new
library(data.table)
library(magrittr)

# P-value thresholds; each one corresponds to a "new.<threshold>.profile" file.
p.threshold <- c(0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5)

# Phenotype, principal components (the eigenvec file is read without a header,
# so the columns are named here) and covariates, merged into one table.
phenotype <- fread("new.phenotype")
pcs <- fread("new.eigenvec", header = FALSE) %>%
  setnames(., colnames(.), c("FID", "IID", paste0("PC", 1:6)))
covariate <- fread("new.cov")
pheno <- merge(phenotype, covariate) %>%
  merge(., pcs)

# R2 of the null model: Trait1 regressed on everything except the IDs
# (i.e. covariates and PCs, no PRS).
null.r2 <- summary(lm(Trait1 ~ ., data = pheno[, -c("FID", "IID")]))$r.squared

# Fit the model for one threshold and return a one-row data.frame with the
# incremental R2 of the PRS plus the SCORE coefficient's P, BETA and SE.
fit.threshold <- function(threshold) {
  prs <- fread(paste0("new.", threshold, ".profile"))[, c("FID", "IID", "SCORE")]
  pheno.prs <- merge(prs, pheno, by = c("FID", "IID"))
  model <- summary(lm(Trait1 ~ ., data = pheno.prs[, -c("FID", "IID")]))
  prs.coef <- model$coefficients["SCORE", ]
  data.frame(
    Threshold = threshold,
    R2 = model$r.squared - null.r2,
    P = as.numeric(prs.coef[4]),
    BETA = as.numeric(prs.coef[1]),
    SE = as.numeric(prs.coef[2])
  )
}

# Build all rows first and bind once instead of growing the result in a loop.
prs.result <- do.call(rbind, lapply(p.threshold, fit.threshold))

# Threshold with the largest incremental R2.
print(prs.result[which.max(prs.result$R2), ])
q() # exit R
# Base-R version of the PRS model fit (no data.table / magrittr required).
p.threshold <- c(0.001, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5)
# Read in the phenotype file
phenotype <- read.table("new.phenotype", header = TRUE)
# Read in the PCs
pcs <- read.table("new.eigenvec", header = FALSE)
# The default output from plink does not include a header.
# To make things simple, we will add the appropriate headers
# (1:6 because there are 6 PCs)
colnames(pcs) <- c("FID", "IID", paste0("PC", 1:6))
# Read in the covariates
covariate <- read.table("new.cov", header = TRUE)
# Now merge the files
pheno <- merge(
  merge(phenotype, covariate, by = c("FID", "IID")),
  pcs,
  by = c("FID", "IID")
)
# We can then calculate the null model (the model WITHOUT the PRS) using a
# linear regression (appropriate when Trait1 is a quantitative trait).
# lm() is used rather than glm() because summary() of a glm fit has no
# r.squared component.
null.model <- lm(Trait1 ~ ., data = pheno[, !colnames(pheno) %in% c("FID", "IID")])
# And the R2 of the null model is
null.r2 <- summary(null.model)$r.squared
prs.result <- NULL
for (i in p.threshold) {
  # Read the PRS computed at this p-value threshold
  prs <- read.table(paste0("new.", i, ".profile"), header = TRUE)
  # Merge the PRS with the phenotype matrix; only FID, IID and SCORE are needed
  pheno.prs <- merge(pheno, prs[, c("FID", "IID", "SCORE")], by = c("FID", "IID"))
  # Regress Trait1 on the PRS and the covariates, ignoring FID and IID
  model <- lm(Trait1 ~ ., data = pheno.prs[, !colnames(pheno.prs) %in% c("FID", "IID")])
  # Model R2
  model.r2 <- summary(model)$r.squared
  # R2 of PRS is simply calculated as the model R2 minus the null R2
  prs.r2 <- model.r2 - null.r2
  # We can also obtain the coefficient and p-value of association of PRS as follows
  prs.coef <- summary(model)$coefficients["SCORE", ]
  prs.beta <- as.numeric(prs.coef[1])
  prs.se <- as.numeric(prs.coef[2])
  prs.p <- as.numeric(prs.coef[4])
  # We can then store the results
  prs.result <- rbind(
    prs.result,
    data.frame(Threshold = i, R2 = prs.r2, P = prs.p, BETA = prs.beta, SE = prs.se)
  )
}
# Best result is:
prs.result[which.max(prs.result$R2), ]
q() # exit R