一跌造、建立參考基因組索引
1埠胖、下載參考基因組和注釋文件
2定罢、構建適用于cellranger分析所用的參考基因組和注釋文件索引(index)笤虫,代碼如下:
##根據自己分析的物種(此處分析的是豬的數(shù)據,其他物種同理)的注釋文件提取gtf信息
cellranger mkgtf chr_susScr11.gtf chr_susScr11.filtered.gtf --attribute=gene_biotype:protein_coding
##根據自己分析的物種參考基因組和注釋文件構建gtf索引
cellranger mkref --genome=chr_susScr11_cellranger \
--nthreads=10 \
--fasta=chr_susScr11.fa \
--genes=chr_susScr11.filtered.gtf
二、比對和計數(shù)分析
1琼蚯、輸入文件
Cellranger count 的fastq輸入文件格式必須要求https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input
格式如下:
[Sample Name]
S1_L00[Lane Number][Read Type]
_001.fastq.gz
Where Read Type is one of:
I1
: Sample index read (optional)*
R1
: Read 1
R2
: Read 2
注意:1)cellranger count的輸入文件格式必須滿足上述要求酬凳;2)后綴必須是fastq或者fastq.gz;如下
2遭庶、構建shell流程批量分析
2 ##Pig single cell data analysis
3 ##########QC 1###########
4 genomedir=/data/youhua_data/Database/susScr11 ##參考基因組index目錄
5 datadir=/data/youhua_data/Data/WRF_singlecell ##文件目錄
6
7
8 sample='Fat1 Fat2 Fat3 Fat4' ##sample名
9 date
10 for s in $sample
11 do
12 date
13 cellranger count --id=${s}_cellrange_out \ ##cellranger count計數(shù)分析
14 --fastqs=$datadir/trim_output_dir/ \
15 --sample=$s \
16 --transcriptome=$genomedir/chr_susScr11_cellranger
17 date
18 wait
19 done
20 exit
3宁仔、輸出文件
1)輸出文件夾目錄
2)outs目錄文件
三、Seurat分析
library(dplyr)
library(Seurat)
pig.data <- Read10X(data.dir = "outs/filtered_feature_bc_matrix/") ##加載cellranger cout分析后的數(shù)據
##創(chuàng)建對象罚拟,min.feature為基因台诗;counts為rawdata(10x genomic)或者TPM
pig <- CreateSeuratObject(counts = pig.data, project = "pig_singlecell", min.cells = 3, min.features = 200)
head(pig)
pig
pig[["percent.mt"]] <- PercentageFeatureSet(pig, pattern = "^MT-") ##添加線粒體基因表達屬性
png("vlnplot.png",width=1000,height=600)
##繪制單個細胞的基因表達個數(shù)完箩、基因表達量以及線粒體基因百分比之間的關系赐俗,用于篩選過濾細胞(死細胞等離群細胞)
VlnPlot(pig, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) ##nFeature_RNA為單個細胞的基因表達個數(shù),nCount_RNA基因表達量弊知,percent.mt為線粒體基因百分比阻逮;
dev.off()
##繪制nCount_RNA、percent.mt以及nFeature_RNA的相關性散點圖秩彤,用于篩選過濾細胞
png("boxplot.png",width=1000,height=600)
plot1 <- FeatureScatter(pig, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pig, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
dev.off()
##根據上面分析結果確定篩選條件
pig <- subset(pig, subset = nFeature_RNA > 200 & nFeature_RNA < 1500 & percent.mt < 5) ##過濾:基因數(shù)目大于200叔扼,不大于2500,MT-percent
##取Log進行數(shù)據標準化處理漫雷,
pig <- NormalizeData(pig, normalization.method = "LogNormalize", scale.factor = 10000)
##尋找變異系數(shù)較大的基因進行分析
pig <- FindVariableFeatures(pig, selection.method = "vst", nfeatures = 1500)
##可視化變異系數(shù)較大的細胞
png("feature.png",width=1000,height=600)
par(mfrow=c(1,2))
top10 <- head(VariableFeatures(pig), 10)
plot1 <- VariableFeaturePlot(pig)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = FALSE)
CombinePlots(plots = list(plot1, plot2))
dev.off()
##
all.genes <- rownames(pig)
pig <- ScaleData(pig, features = all.genes)
pig <- RunPCA(pig, features = VariableFeatures(object = pig))
print(pig[["pca"]], dims = 1:5, nfeatures = 5)
png("dim.png")
VizDimLoadings(pig, dims = 1:2, reduction = "pca")
DimPlot(pig, reduction = "pca")
dev.off()
png("dimheatmap.png")
DimHeatmap(pig, dims = 1:9, cells = 500, balanced = TRUE)
dev.off()
pig <- JackStraw(pig, num.replicate = 100)
pig <- ScoreJackStraw(pig, dims = 1:20)
png("pc_jack.png")
JackStrawPlot(pig, dims = 1:20)
dev.off()
png("elbow.png")
ElbowPlot(pig)
dev.off()
pig <- FindNeighbors(pig, dims = 1:15)
pig <- FindClusters(pig, resolution = 0.4)
pig <- RunUMAP(pig, dims = 1:15)
png("umap.png")
DimPlot(pig, reduction = "umap")
dev.off()
saveRDS(pig, file = "./pig_tutorial.rds")
cluster1.markers <- FindMarkers(pig, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
cluster5.markers <- FindMarkers(pig, ident.1 = 5, ident.2 = c(0, 3), min.pct = 0.25)
head(cluster5.markers, n = 5)
pig.markers <- FindAllMarkers(pig, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
pig.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
write.table(file='tsne.txt' ,Embeddings(pig@reductions$umap),sep="\t",col.names=F,quote=F)
write.table(file='tsne.class' ,pig@active.ident,sep="\t",col.names=F,quote=F)
write.table(pig.markers,sep="\t",file="markers.xlsx",row.names=F)
cluster1.markers <- FindMarkers(pig, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
png("de1.png")
VlnPlot(pig, features = c("DES", "HSPB7"))
dev.off()
png("de2.png")
VlnPlot(pig, features = c("UBE2C", "CKS2"), slot = "counts", log = TRUE)
dev.off()
#FeaturePlot(pig, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP", "CD8A"))
png("gene.png")
FeaturePlot(pig, features = c("ACTA2", "TAGLN", "THBS1", "TPM2", "DES", "CRYAB", "TAGLN", "MYL9","CNN1"))
dev.off()
png("heat.png")
top10 <- pig.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pig, features = top10$gene) + NoLegend()
dev.off()