前言
在之前的一篇推文:Genome Biology:運用機器學(xué)習(xí)鑒定腫瘤細(xì)胞中,Immugent根據(jù)Ikarus的原文簡單介紹了一下它的功能框架源葫,在本期推文中將通過代碼實操的方式來一步步演示如何使用Ikarus來鑒定腫瘤細(xì)胞严就。
不同于之前的Infercnv和CopyKAT軟件椿争,主要都是基于R語言,Ikarus是由更擅長機器學(xué)習(xí)的python所構(gòu)建的禁炒。事實上破托,在同等運算量的情況下吮炕,python處理數(shù)據(jù)的速度更快参淹,這是因為它可以和其它底層語言銜接的更好醉锄。當(dāng)然,大家也不要有畏懼python的心理浙值,其實使用起來和R基本差不多恳不,不斷去適應(yīng)就好啦。
廢話不多說开呐,下面開始展示烟勋。。筐付。
代碼實操
首先是安裝Ikarus軟件卵惦,要求python>=3.8。
pip install ikarus
#也可以通過下面的代碼安裝
python -m pip install git+https://github.com/BIMSBbioinfo/ikarus.git
接著是導(dǎo)入所需要的程序瓦戚,和R中的library()類似沮尿。
import urllib.request
import anndata
import pandas as pd
from pathlib import Path
from ikarus import classifier, utils, data
接下來是載入示例數(shù)據(jù)。较解。畜疾。
Path("data").mkdir(exist_ok=True)
if not Path("data/laughney20_lung/adata.h5ad").is_file():
Path("data/laughney20_lung").mkdir(exist_ok=True)
urllib.request.urlretrieve(
url="https://bimsbstatic.mdc-berlin.de/akalin/Ikarus/part_1/data/laughney20_lung/adata.h5ad",
filename=Path("data/laughney20_lung/adata.h5ad")
)
if not Path("data/lee20_crc/adata.h5ad").is_file():
Path("data/lee20_crc").mkdir(exist_ok=True)
urllib.request.urlretrieve(
url="https://bimsbstatic.mdc-berlin.de/akalin/Ikarus/part_1/data/lee20_crc/adata.h5ad",
filename=Path("data/lee20_crc/adata.h5ad")
)
if not Path("data/tirosh17_headneck/adata.h5ad").is_file():
Path("data/tirosh17_headneck").mkdir(exist_ok=True)
urllib.request.urlretrieve(
url="https://bimsbstatic.mdc-berlin.de/akalin/Ikarus/part_1/data/tirosh17_headneck/adata.h5ad",
filename=Path("data/tirosh17_headneck/adata.h5ad")
)
paths = [
Path("data/laughney20_lung/"),
Path("data/lee20_crc/"),
Path("data/tirosh17_headneck/")
]
names = [
"laughney",
"lee",
"tirosh"
]
adatas = {}
for path, name in zip(paths, names):
adatas[name] = anndata.read_h5ad(path / "adata.h5ad")
# Uncomment to perform preprocessing. Here, the loaded anndata objects are already preprocessed.
# adatas[name] = data.preprocess_adata(adatas[name])
關(guān)鍵一步,訓(xùn)練模型印衔。庸疾。。
signatures_path = Path("out/signatures.gmt")
pd.read_csv(signatures_path, sep="\t", header=None)
model = classifier.Ikarus(signatures_gmt=signatures_path, out_dir="out")
train_adata_list = [adatas["laughney"], adatas["lee"]]
train_names_list = ["laughney", "lee"]
obs_columns_list = ["tier_0_hallmark_corrected", "tier_0_hallmark_corrected"]
model.fit(train_adata_list, train_names_list, obs_columns_list, save=True)
model_path = Path("out/core_model.joblib")
model = classifier.Ikarus(signatures_gmt=signatures_path, out_dir="out")
model.load_core_model(model_path)
基因的名稱(見示例中的第一列)在簽名中列出当编,GMT文件對應(yīng)于它們有意義的單元格類型,并且是以制表符分隔徒溪。
截止到現(xiàn)在忿偷,我們的模型已經(jīng)搭建完畢,接下來是對新數(shù)據(jù)集進(jìn)行預(yù)測臊泌。
_ = model.predict(adatas["tirosh"], "tirosh", save=True)
import numpy as np
import scanpy as sc
import matplotlib.pyplot as plt
from sklearn import metrics
def plot_confusion_matrix(
y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues, ax=None
):
"""
This function prints and plots the confusion matrix.
Normalization can be applied by setting `normalize=True`.
"""
plt.rcParams["figure.figsize"] = [6, 4]
# print(classes)
if not title:
if normalize:
title = "Normalized confusion matrix"
else:
title = "Confusion matrix, without normalization"
# Compute confusion matrix
cm = metrics.confusion_matrix(y_true, y_pred, labels=classes)
# Only use the labels that appear in the data
# classes = classes[unique_labels(y_true, y_pred)]
if normalize:
cm = cm.astype("float") / cm.sum(axis=1)[:, np.newaxis]
if ax is None:
(fig, ax) = plt.subplots()
im = ax.imshow(cm, interpolation="nearest", cmap=cmap)
ax.figure.colorbar(im, ax=ax)
# We want to show all ticks...
ax.set(
xticks=np.arange(cm.shape[1]),
yticks=np.arange(cm.shape[0]),
# ... and label them with the respective list entries
xticklabels=classes,
yticklabels=classes,
title=title,
ylabel="True label",
xlabel="Predicted label",
)
for item in (
[ax.title, ax.xaxis.label, ax.yaxis.label]
+ ax.get_xticklabels()
+ ax.get_yticklabels()
):
item.set_fontsize(12)
# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")
# Loop over data dimensions and create text annotations.
fmt = ".2f" if normalize else "d"
thresh = cm.max() / 2.0
for i in range(cm.shape[0]):
for j in range(cm.shape[1]):
ax.text(
j,
i,
format(cm[i, j], fmt),
ha="center",
va="center",
color="white" if cm[i, j] > thresh else "black",
)
return fig, ax
_ = model.get_umap(adatas["tirosh"], "tirosh", save=True)
path = Path("out/tirosh")
results = pd.read_csv(path / "prediction.csv", index_col=0)
adata = anndata.read_h5ad(path / "adata_umap.h5ad")
y = adata.obs.loc[:, "tier_0"]
y_pred_lr = results["final_pred"]
acc = metrics.balanced_accuracy_score(y, y_pred_lr)
print(metrics.classification_report(y, y_pred_lr, labels=["Normal", "Tumor"]))
fig, ax = plot_confusion_matrix(
y,
y_pred_lr,
classes=["Normal", "Tumor"],
title=f"confusion matrix \n train on laughney+lee, test on tirosh \n balanced acc: {acc:.3f}",
)
fig.tight_layout()
adata.obs.loc[:, "tier_0_pred_correctness"] = "wrongly assigned"
adata.obs.loc[
adata.obs["tier_0"] == adata.obs["final_pred"],
"tier_0_pred_correctness"
] = "correctly assigned"
adata.obs.loc[:, "tier_0_pred_wrong"] = pd.Categorical(
adata.obs["tier_0"].copy(),
categories=np.array(["Normal", "Tumor", "wrongly assigned"]),
ordered=True
)
adata.obs.loc[
adata.obs["tier_0_pred_correctness"] == "wrongly assigned",
"tier_0_pred_wrong"
] = "wrongly assigned"
plt.rcParams["figure.figsize"] = [9, 6]
colors = [
["major"],
["tier_0_pred_wrong"]
]
titles = [
["major types"],
["prediction"]
]
palettes = [
["#7f7f7f", "#98df8a", "#aec7e8", "#9467bd", "#ff0000"],
["#aec7e8", "#ff0000", "#0b559f"],
]
for color, title, palette in zip(colors, titles, palettes):
ax = sc.pl.umap(
adata, ncols=1, size=20,
color=color,
title=title,
wspace=0.25,
vmax="p99",
legend_fontsize=12,
palette=palette,
show=False
)
for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
ax.get_xticklabels() + ax.get_yticklabels()):
item.set_fontsize(12)
ax.legend(loc="best")
plt.tight_layout()
plt.show()
從這個圖可以看出Ikarus預(yù)測的結(jié)果還是相當(dāng)準(zhǔn)確的鲤桥。
說在最后
Ikarus其實不僅可以預(yù)測腫瘤細(xì)胞,也可以預(yù)測其它特殊的細(xì)胞渠概,如各種干細(xì)胞茶凳。這個主要取決于使用者是通過什么數(shù)據(jù)來訓(xùn)練出的模型。Immugent有一個大膽的猜想播揪,Ikarus甚至都可以用于對各種免疫細(xì)胞的注釋贮喧。但是有一個問題是,在不同模型或者疾病中猪狈,各種免疫細(xì)胞的功能狀態(tài)可能差異很大箱沦,鑒于此種考慮,我們不太適合用統(tǒng)一的模型去定義所有同類型免疫細(xì)胞雇庙。
此外谓形,有一說一灶伊,使用python分析數(shù)據(jù)的感覺確實比R更絲滑。而且基于python出的圖寒跳,無論是配色還是構(gòu)圖都看起來高達(dá)上很多聘萨,小伙伴門趕緊實操起來吧。
好啦童太,本期分享到這里就結(jié)束啦米辐,我們下期再會~~