# Read the input file (the raw count file produced by featureCounts)
# Run this script in a fresh R session. It deliberately does not call
# rm(list = ls()): that would wipe the caller's workspace without actually
# resetting loaded packages or options.
options(stringsAsFactors = FALSE)
library(tidyverse)
# tidyverse attaches: ggplot2 stringr dplyr tidyr readr purrr tibble forcats
library(data.table) # fread() can read files using multiple threads
# Load the featureCounts table as a plain data.frame (data.table = FALSE);
# header = TRUE uses the first line of the table as the column names.
a1 <- fread("all.featurecounts.txt", header = TRUE, data.table = FALSE)
# Build the counts matrix
# Per-sample read counts: every column of `a1` from the 7th onward.
# (Presumably the first six columns are the featureCounts annotation columns
# Geneid, Chr, Start, End, Strand, Length -- verify against the input file.)
# Guard first: with fewer than 7 columns `7:ncol(a1)` would count downwards
# and silently select the wrong columns.
stopifnot(ncol(a1) >= 7)
# drop = FALSE keeps a data frame even when there is only one sample column.
counts <- a1[, 7:ncol(a1), drop = FALSE]
rownames(counts) <- a1$Geneid # gene IDs become the row names

# Gene ID and Length columns of the featureCounts table; Length is used below
# as the gene length for the FPKM/TPM normalisation.
geneid_efflen <- a1[, c("Geneid", "Length")]
colnames(geneid_efflen) <- c("geneid", "efflen")
geneid_efflen_fc <- geneid_efflen # copy kept for a later comparison

# Look up the length of every gene in `counts`, in the row order of `counts`.
dim(geneid_efflen)
efflen <- geneid_efflen[
  match(rownames(counts), geneid_efflen$geneid),
  "efflen"
]
# FPKM/RPKM (Fragments/Reads Per Kilobase Million): fragments/reads per kilobase of transcript per million mapped reads.
# RPKM and FPKM apply to single-end and paired-end sequencing respectively; the formula is the same.
# Convert the raw counts of one sample to FPKM/RPKM.
#
# Arguments:
#   count     numeric vector of raw counts, one element per gene. It has no
#             default: the previous `count = count` default was a
#             self-referential promise that always failed when used.
#   efflength numeric vector of gene lengths in bases, in the same order as
#             `count` (or a single length applied to every gene). Defaults to
#             the script-level `efflen` vector.
#
# Returns a numeric vector with the length (and names, if any) of `count`.
counts2FPKM <- function(count, efflength = efflen) {
  # Refuse mismatched lengths instead of letting R recycle them silently,
  # which would pair genes with the wrong lengths.
  stopifnot(
    is.numeric(count),
    is.numeric(efflength),
    length(efflength) == length(count) || length(efflength) == 1L
  )
  # "Per million" scaling factor of the sample (depth normalisation).
  per_million <- sum(count) / 1e6
  # Fragments/reads per million.
  fpm <- count / per_million
  # Divide by the gene length in kilobases (length normalisation).
  fpm / (efflength / 1000)
}
# Apply the FPKM conversion to every sample (column) of `counts`.
FPKM <- as.data.frame(apply(counts, 2, counts2FPKM))
# Sample names, one per column of `counts`, in column order.
fpkm_sample_names <- c(
  "Simmental_1", "Simmental_2", "Simmental_3",
  "Wagyu_1", "Wagyu_2", "Wagyu_3"
)
# Fail loudly on a mismatch: assigning too few names would silently create
# NA column names.
if (length(fpkm_sample_names) != ncol(FPKM)) {
  stop(
    sprintf(
      "Expected %d sample columns for FPKM but found %d; update the sample names.",
      length(fpkm_sample_names), ncol(FPKM)
    ),
    call. = FALSE
  )
}
colnames(FPKM) <- fpkm_sample_names
# Keep only genes (rows) whose FPKM summed over all samples is at least 1;
# drop = FALSE keeps a data frame whatever the number of samples.
FPKM <- FPKM[rowSums(FPKM) >= 1, , drop = FALSE]
colSums(FPKM)
# TPM (Transcripts Per Kilobase Million), i.e. transcripts per kilobase of transcript per million mapped reads, is currently the recommended unit for correlation analysis, PCA, etc.
# Convert the raw counts of one sample to TPM.
#
# Arguments:
#   count     numeric vector of raw counts, one element per gene. It has no
#             default: the previous `count = count` default was a
#             self-referential promise that always failed when used.
#   efflength numeric vector of gene lengths in bases, in the same order as
#             `count` (or a single length applied to every gene). Defaults to
#             the script-level `efflen` vector.
#
# Returns a numeric vector with the length (and names, if any) of `count`;
# for valid input the values sum to 1e6.
counts2TPM <- function(count, efflength = efflen) {
  # Refuse mismatched lengths instead of letting R recycle them silently,
  # which would pair genes with the wrong lengths.
  stopifnot(
    is.numeric(count),
    is.numeric(efflength),
    length(efflength) == length(count) || length(efflength) == 1L
  )
  # Reads per kilobase (length normalisation).
  rpk <- count / (efflength / 1000)
  # "Per million" scaling factor of the RPK values (depth normalisation).
  per_million <- sum(rpk) / 1e6
  rpk / per_million
}
# Apply the TPM conversion to every sample (column) of `counts`.
TPM <- as.data.frame(apply(counts, 2, counts2TPM))
# Sample names, one per column of `counts`, in column order.
tpm_sample_names <- c(
  "Zebu_1", "Zebu_2", "Zebu_3", "Zebu_4", "Zebu_5",
  "Holstein_1", "Holstein_2", "Holstein_3", "Holstein_4", "Holstein_5"
)
# Fail loudly on a mismatch: assigning too few names would silently create
# NA column names.
if (length(tpm_sample_names) != ncol(TPM)) {
  stop(
    sprintf(
      "Expected %d sample columns for TPM but found %d; update the sample names.",
      length(tpm_sample_names), ncol(TPM)
    ),
    call. = FALSE
  )
}
colnames(TPM) <- tpm_sample_names
# Keep only genes (rows) with a non-zero TPM total across samples;
# drop = FALSE keeps a data frame whatever the number of samples.
TPM <- TPM[rowSums(TPM) > 0, , drop = FALSE]
colSums(TPM)