Mouse pancreatic endocrinogenesis (scATAC-seq) | Model evaluation

Model assessment

Here we compared the weights learned by several model iterations, to highlight

  • Performance user parameter combinations

  • Interpretation based on TF regulators

  • Coherence or learned graph-weights when assisting RNA-weights, or only using ATAC-weights

print('here...')
here...
%load_ext autoreload
%autoreload 2

If you wish to test this notebook but have not trained models, you can download example output from this cloud link. Dropbox URL

Four files are required

  • Model output (*.pth) using GraphLayer (log_dynamic=1) and not using it (log_dynamic=0)

  • ATAC (atac_train) and RNA (rna_sample_train).

  • Training dataloader (train_dataloader).

If a file for testing is n/a, or there are clarifications while loading, please reach out via Github.

# pip install matplotlib==3.7.2
import torch
import mubind as mb
import scanpy as sc
# the prepring case (4148)
# Graph Layer with hadamard product 8161 x 50000
n_obs = 8161
n_var = 50000
save_code = '_obs%i_var%i' % (n_obs, n_var)
# load models
model_by_logdynamic = {}
for use_logdynamic in [False, True]:
    p = "pancreas_multiome_use_logdynamic_%i_obs8161_var50000.pth" % use_logdynamic
    print(p)
    model_by_logdynamic[use_logdynamic] = torch.load(p)
pancreas_multiome_use_logdynamic_0_obs8161_var50000.pth
pancreas_multiome_use_logdynamic_1_obs8161_var50000.pth
!ls -ltrh /home/ilibarra/workspace/theislab/mubind/docs/notebooks/single_cell/atac_train*
-rwxrwxrwx 1 ilibarra ilibarra 289K Aug 27 00:03 /home/ilibarra/workspace/theislab/mubind/docs/notebooks/single_cell/atac_train.h5ad
-rwxrwxrwx 1 ilibarra ilibarra 209M Aug 30 12:10 /home/ilibarra/workspace/theislab/mubind/docs/notebooks/single_cell/atac_train_obs8161_var50000.h5ad
# this loads the atac and RNA objects subsampled and used during training
ad = sc.read_h5ad('atac_train%s.h5ad' % save_code)
rna_sample = sc.read_h5ad('rna_sample_train_obs%i.h5ad' % (n_obs))
import pickle


train = pickle.load(open('train_dataloader%s.pkl' % save_code, "rb"))
%load_ext line_profiler
!readlink -f .
/mnt/c/Users/IgnacioIbarra/Dropbox/workspace/theislab/mubind/docs/notebooks/single_cell
# load the pancreas multiome dataset
rna, atac = mb.datasets.pancreas_multiome(data_directory="../../../../../../annotations/single_cell")
True data/scatac/pancreas_multiome/pancreas_multiome_2022_processed_rna_velocities_2024.h5ad
True data/scatac/pancreas_multiome/pancreas_multiome_2022_processed_atac.h5ad
reading ATAC
opening ATAC successful
# %lprun -f model.forward model.optimize_iterative(train, n_epochs=10, skip_kernels=list([0]) + list(range(2, 500)), opt_kernel_shift=[0, 0] + [0] * (n_kernels), opt_kernel_length=[0, 0] + [0] * (n_kernels))
# %lprun -f model.binding_modes.forward model.optimize_iterative(train, n_epochs=10, skip_kernels=list([0]) + list(range(2, 500)), opt_kernel_shift=[0, 0] + [0] * (n_kernels), opt_kernel_length=[0, 0] + [0] * (n_kernels))
import matplotlib.pyplot as plt
cell_type_key = "celltype"
for optimize_log_dynamic in model_by_logdynamic:
    print("Graph Layer | ", optimize_log_dynamic)
    model = model_by_logdynamic[optimize_log_dynamic]
    # print(optimize_log_dynamic)
    from matplotlib import rcParams

    rcParams["figure.figsize"] = 10, 5
    rcParams["figure.dpi"] = 100
    mb.pl.logo(model, n_cols=3, show=True, n_rows=6, stop_at=4)  #  log=True)
    plt.show()
Graph Layer |  False
break
../../_images/c2303c0b1cce98176ab310d8862d79fdf0d7f05137553f9ed5cb24b8658ccc87.png
Graph Layer |  True
break
../../_images/dd9b79c1c707fb7ef53f3b48c2d08c5b4d269e939c7037264f3a4783eb4c67a0.png
for optimize_log_dynamic in model_by_logdynamic:
    if not optimize_log_dynamic:
        continue
    model = model_by_logdynamic[optimize_log_dynamic]
    print(optimize_log_dynamic)

    tsum = torch.sum
    texp = torch.exp
    tspa = torch.sparse_coo_tensor
    tsmm = torch.sparse.mm
    t = torch.transpose

    # connectivities
    C = model.graph_module.conn_sparse
    a_ind = C.indices()

    log_dynamic = model.graph_module.log_dynamic
    D = model.graph_module.log_dynamic
    D_tril = tspa(a_ind, D, C.shape)  # .requires_grad_(True).cuda()
    D_triu = tspa(a_ind, -D, C.shape)  # .requires_grad_(True).cuda()
    D = D_tril + t(D_triu, 0, 1)
    # log_dynamic = log_dynamic + -torch.transpose(log_dynamic, 0, 1)
    # triu_indices = torch.triu_indices(row=n_rounds, col=n_rounds, offset=1)

    import seaborn as sns

    mb.pl.set_rcParams({"figure.figsize": [3, 3]})
    sns.heatmap(D.to_dense().detach().cpu(), cmap="RdBu_r")
    plt.show()
True
../../_images/ad7d5ee260e259f77361fce22531d415e8912a178be6534731caad01e56991f9.png
model = model_by_logdynamic[1]
mb.pl.set_rcParams({"figure.figsize": [12, 3], "figure.dpi": 110})
plt.subplot(1, 4, 1)
plt.plot(model.loss_history_log_dynamic)
plt.ylabel("log dynamic loss")
plt.subplot(1, 4, 2)
plt.plot(model.loss_history)
plt.ylabel("overall loss")
plt.subplot(1, 4, 3)
plt.plot(model.loss_history_sym_weights)
plt.ylabel("similar weights loss")
plt.tight_layout()

plt.savefig('losses_pancreatic_endocrinogenesis.pdf')
plt.show()
../../_images/cbcb94acc30c12bbd69154f5cdd7d3427f4d901bf28d295095d3ea96171a083d.png
import pandas as pd
import numpy as np
rcParams["figure.figsize"] = 3, 5
r2_all = []
for optimize_log_dynamic in model_by_logdynamic:
    print(optimize_log_dynamic)
    model = model_by_logdynamic[optimize_log_dynamic]
    # contributions per newly added kernel
    import seaborn as sns

    if len(model.best_r2_by_new_filter) != 0:
        r2 = pd.DataFrame(model.best_r2_by_new_filter, columns=["r2"]).reset_index()
        r2["opt_log_dynamic"] = optimize_log_dynamic
        r2_all.append(r2)

if len(r2_all) > 0:
    r2_all = pd.concat(r2_all)
    rcParams["figure.figsize"] = 3, 3
    rcParams["figure.dpi"] = 80
    ax = sns.barplot(
        data=r2_all,
        x="index",
        y="r2",
        hue="opt_log_dynamic",
    )
    sns.move_legend(ax, "lower center", bbox_to_anchor=(0.4, 1), ncol=3, title=None, frameon=False)

    plt.xlabel("number of filters in model")
    plt.show()
False
True
../../_images/a51944a46c677746148b2c644a84fe4bbfe1024efa6794e224d4ded5f43f8400.png
model = model_by_logdynamic[True]

torch.set_printoptions(precision=2)
dynamic_score = D.to_dense().detach().cpu().sum(axis=0)
# dyn_score
dynamic_score = dynamic_score
dynamic_score = (dynamic_score - dynamic_score.min()) / (dynamic_score.max() - dynamic_score.min())
ad.obs["dynamic_score"] = dynamic_score

ad.obs["dynamic_score_cluster"] = np.where(dynamic_score > dynamic_score.mean(), "dynamic", "static")
z0 = np.where(((dynamic_score - dynamic_score.mean()) / dynamic_score.std()) > 0.75, "dynamic", "static")
z1 = np.where(((dynamic_score - dynamic_score.mean()) / dynamic_score.std()) > 1, "dynamic", "static")
z2 = np.where(((dynamic_score - dynamic_score.mean()) / dynamic_score.std()) > 2, "dynamic", "static")

ad.obs["dynamic_score_z0"] = z0
ad.obs["dynamic_score_z1"] = z1
ad.obs["dynamic_score_z2"] = z2
ad.obs[[c for c in ad.obs if 'dynamic' in c]].describe()
dynamic_score
count 8161.000000
mean 0.308962
std 0.040538
min 0.000000
25% 0.308942
50% 0.308962
75% 0.308974
max 1.000000
ad.obs["dynamic_score_abs"] = ad.obs["dynamic_score"].abs()
sc.pl.umap(ad, color="dynamic_score_abs", color_map="Reds", vmin=0.45)
../../_images/b6424a9b8574344f9175b613f392cdcce0b5adc54d37caa81ff385becea46399.png
# contributions per newly added kernel
mb.pl.set_rcParams({"figure.figsize": [3, 3], "figure.dpi": 60})

sc.pl.umap(ad, color=["dynamic_score"], cmap="RdBu_r", sort_order=True)

for z in ['z0', 'z1', 'z2']:
    print(z)
    sc.pl.umap(ad, color=["dynamic_score_%s" % z], cmap="RdBu_r", sort_order=True)
    sc.tl.embedding_density(ad, basis="umap", groupby="dynamic_score_%s" % z)
    sc.pl.embedding_density(
        ad, basis="umap", key="umap_density_dynamic_score_%s" % z, group="dynamic"
    )  # basis='umap', groupby='dynamic_score_cluster')
../../_images/43f133e07cce436ad4823bd1bcbfa325aae96e8fff664f6e5c0a7ad1ad719762.png
z0
../../_images/0f7439d800fe8939eadb9cd0be489fe550e63715b53203a206e58794945e9200.png ../../_images/1c4132d76e4c53f727a5f2f04db362a381feb20453a7a663774be85c7827ed9c.png
z1
../../_images/3b37e3cb0de2420fa0738a004c0178900b9dc4c90b569063ef810bd3dc9dd9d9.png ../../_images/2f942241f8e6a64f824fe5ad8f15e3093ee67697d679f0973925a9632d741f34.png
z2
../../_images/7b9eaf67282a1ec01d3303d75e6e1010f779cd7b0569dd0a9a868d453c571e64.png ../../_images/30cd1b2ab6901e2e843002b04ededf562065dea1ffd102d62309ce348d62414d.png
import seaborn as sns

umap = ad.obsm["X_umap"]
sns.histplot(x=umap[:, 0], y=umap[:, 1], bins=50, cmap="PiYG")
<Axes: >
../../_images/6bfe5e9431bef5ec381d0730341a2867eb974758a46f61728204689c85a233f5.png
plt.pcolormesh(np.histogram2d(umap[:, 0], umap[:, 1], bins=50)[0])
<matplotlib.collections.QuadMesh at 0x7fc886ceadc0>
../../_images/7b337d1bce83bf5c01b673403f6d03450aa0d53d46528ad9e19a2063c3ed9275.png
x, y = np.meshgrid(umap[:, 0], umap[:, 1])
x = umap[:, 1]  # array_txt[:,0]
y = umap[:, 1]  # array_txt[:,1]
z = ad.obs["dynamic_score"].values  # array_txt[:,2]
sc.pl.umap(ad, color="dynamic_score")
../../_images/4facd6ead45558e877c80c185ffe7264cae825d7815024d7f2738a00a3dd88dd.png
import matplotlib.pyplot as plt
import numpy as np

rcParams["figure.figsize"] = 5, 3

# generate 2 2d grids for the x & y bounds
y, x = np.meshgrid(np.linspace(-3, 3, 100), np.linspace(-3, 3, 100))
z = (1 - x / 2.0 + x**5 + y**3) * np.exp(-(x**2) - y**2)
z = z[:-1, :-1]
z_min, z_max = -np.abs(z).max(), np.abs(z).max()
fig, ax = plt.subplots()
c = ax.pcolormesh(x, y, z, cmap="RdBu", vmin=z_min, vmax=z_max)
ax.set_title("pcolormesh")
# set the limits of the plot to the limits of the data
ax.axis([x.min(), x.max(), y.min(), y.max()])
fig.colorbar(c, ax=ax)

plt.show()
../../_images/f54f39754d22bf9ae7a4319b10929c137e5538be7c27cf868370d99ce2767ab1.png
import itertools
import numpy as np


def grid(x, y, z, size_x=1, size_y=1):

    def pairwise(iterable):
        "s -> (s0,s1), (s1,s2), (s2, s3), ..."
        a, b = itertools.tee(iterable)
        next(b, None)
        return zip(a, b)

    minx, maxx = int(min(x)), int(max(x)) + 1
    miny, maxy = int(min(y)), int(max(y)) + 1

    result = []
    x_edges = pairwise(np.arange(minx, maxx + 1, size_x))
    for xleft, xright in x_edges:
        xmask = np.logical_and(x >= xleft, x < xright)
        y_edges = pairwise(np.arange(miny, maxy + 1, size_y))
        for yleft, yright in y_edges:
            ymask = np.logical_and(y >= yleft, y < yright)
            cell = z[np.logical_and(xmask, ymask)]
            result.append(cell.sum())

    result = np.array(result).reshape((maxx - minx, maxy - miny))
    return np.flip(result.T, 0)
grid_dyn_score = grid(umap[:, 0], umap[:, 1], ad.obs["dynamic_score"], size_x=1, size_y=1)
grid_counts = grid(umap[:, 0], umap[:, 1], ad.obs["celltype"].cat.codes.values, size_x=1, size_y=1)
sns.heatmap(grid_dyn_score, cmap="Reds")
plt.show()
sns.heatmap(grid_counts, cmap="Reds")
plt.show()

sc.pl.umap(ad, color="celltype")
../../_images/ea73cc4f13015b63713bf99d21a65ac141c2df2df8be825c76864dba7af94ee1.png ../../_images/d2b5f3b5970699667aa509f1d65ec3c7199591346fe60156718dc2b8ec4621df.png ../../_images/9b9570ee03fb98083967292c396816761f23c6b731923d0b21d99e05fcf24c05.png
# for optimize_log_dynamic in model_by_logdynamic:
#     mb.pl.set_rcParams({'figure.figsize': [3, 3], 'figure.dpi': 90})
#     print(optimize_log_dynamic)
#     model = model_by_logdynamic[optimize_log_dynamic]
#     mb.pl.kmer_enrichment(model, train, log_scale=False, style='scatter', ylab='t1', xlab='p1', k=8)
#     plt.show()

#     mb.pl.set_rcParams({'figure.figsize': [10, 7], 'figure.dpi': 90})
#     mb.pl.logo(model,
#                title=False,
#                xticks=False,
#                rowspan_dinuc=0,
#                rowspan_mono=1,
#                n_rows=12,
#                n_cols=3,
#                stop_at=20) # n_cols=len(reduced_groups))
#     plt.show()
model = model_by_logdynamic[True]
G = model.graph_module.conn_sparse.detach().cpu().to_dense()  # (C, C)
# number of non_zero weights
len(G[G != 0])
111684
# output = model(**inputs, use_conn=False, return_binding_scores=True)
ad
AnnData object with n_obs × n_vars = 8161 × 50000
    obs: 'n_counts', 'sample', 'n_genes', 'log_genes', 'mt_frac', 'rp_frac', 'ambi_frac', 'nCount_RNA', 'nFeature_RNA', 'nCount_ATAC', 'nFeature_ATAC', 'nucleosome_signal', 'nucleosome_percentile', 'TSS.enrichment', 'TSS.percentile', 'S_score', 'G2M_score', 'phase', 'proliferation', 'celltype', 'nCount_peaks', 'nFeature_peaks', 'dynamic_score', 'dynamic_score_cluster', 'dynamic_score_z0', 'dynamic_score_z1', 'dynamic_score_z2', 'dynamic_score_abs', 'umap_density_dynamic_score_z0', 'umap_density_dynamic_score_z1', 'umap_density_dynamic_score_z2'
    var: 'modality', 'acc_score', 'acc_score_rank', 'chr', 'start', 'end', 'summit', 'summit.start', 'summit.end', 'k.summit'
    uns: 'celltype_colors', 'neighbors', 'dynamic_score_z0_colors', 'umap_density_dynamic_score_z0_params', 'dynamic_score_z1_colors', 'umap_density_dynamic_score_z1_params', 'dynamic_score_z2_colors', 'umap_density_dynamic_score_z2_params'
    obsm: 'X_pca', 'X_pca_wsnn', 'X_spca_wsnn', 'X_umap', 'X_umap_ATAC', 'X_umap_GEX', 'X_umap_wsnn', 'lsi_full', 'lsi_red', 'umap', 'umap_ATAC', 'umap_GEX'
    obsp: 'connectivities', 'connectivities_wnn', 'distances', 'distances_wnn'
model = model.cuda()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device
for optimize_log_dynamic in model_by_logdynamic:
    print(optimize_log_dynamic)
    if not optimize_log_dynamic:
        continue
    model = model_by_logdynamic[optimize_log_dynamic].cuda()

    umap = ad.obsm["X_umap"].copy()
    umap = np.sort(umap, 0)

    x = umap[:, 0]
    y = umap[:, 1]
    X, Y = np.meshgrid(x, y)

    n_points = x.shape[0]
    # x-component to the right
    u = np.ones((n_points, n_points))
    # y-component zero
    v = np.zeros((n_points, n_points))

    pred = []
    binding_per_mode = []
    for i, batch in enumerate(train):
        # Get a batch and potentially send it to GPU memory.
        mononuc = batch["mononuc"].to(device)
        # print(i, mononuc.shape)
        b = batch["batch"].to(device) if "batch" in batch else None
        rounds = batch["rounds"].to(device) if "rounds" in batch else None
        countsum = batch["countsum"].to(device) if "countsum" in batch else None
        seq = batch["seq"] if "seq" in batch else None
        residues = batch["residues"].to(device) if "residues" in batch else None
        if residues is not None and train.dataset.store_rev:
            mononuc_rev = batch["mononuc_rev"].to(device)
            inputs = {"mono": mononuc, "mono_rev": mononuc_rev, "batch": b, "countsum": countsum, "residues": residues}
        elif residues is not None:
            inputs = {"mono": mononuc, "batch": b, "countsum": countsum, "residues": residues}
        elif train.dataset.store_rev:
            mononuc_rev = batch["mononuc_rev"].to(device)
            inputs = {"mono": mononuc, "mono_rev": mononuc_rev, "batch": b, "countsum": countsum}
        else:
            inputs = {"mono": mononuc, "batch": b, "countsum": countsum}

        inputs["scale_countsum"] = model.datatype == "selex"
        output = model(**inputs, use_conn=False, return_binding_scores=True)
        output = output.cpu().detach().numpy()
        # print("here...")
        # print(output.shape)
        # print(output.sum())
        pred.append(output)


        output = model(**inputs, use_conn=False, return_binding_per_mode=True)
        output = output.cpu().detach().numpy()
        # print("here...")
        # print(output.shape)
        # print(output.sum())
        binding_per_mode.append(output)

    # pred = np.concatenate(pred).T

    binding_scores = np.concatenate(pred).T
    binding_per_mode = np.concatenate(binding_per_mode)

    # ad.layers['velocity'] = pred

    # conn = model.graph_module.conn_sparse.detach().cpu()
    # conn = model.graph_module.conn_sparse.detach().cpu().to_dense()
    # v = conn.sum(axis=1)
    # ad.layers['velocity'] = torch.stack([v,] * ad.shape[1], axis=1).numpy()

    # ad.layers['counts'] = ad.X

    # mb.pl.set_rcParams({'figure.figsize': [5, 4], 'figure.dpi': 90})
    # plt.hist(model.graph_module.conn_sparse.values().detach().cpu().numpy())
    # plt.show()

    # import scvelo as scv

    # sc.pp.neighbors(ad)

    # # scv.tl.velocity_graph(ad, vkey='velocity', xkey='counts')
    # # ad.layers['velocity'] = ad.obs['dynamic_score']

    # scv.tl.velocity_graph(ad, vkey='velocity', xkey='counts')
    # ax = scv.pl.velocity_embedding_stream(ad, color='celltype', show=False) #  X_grid='X_umap', V=V)


X = ad.X.A
G @ binding_scores
np.random.shuffle(binding_scores)
False
True
ad.shape, binding_per_mode.shape
((8161, 50000), (50000, 288))
import scvelo as scv
ad.shape, binding_per_mode.shape
((8161, 50000), (50000, 288))
ad
ad.layers["velocity"] = binding_scores
# scv.tl.velocity_graph(ad, vkey='velocity', xkey='counts')
# ax = scv.pl.velocity_embedding_stream(ad, color='celltype', show=False) #  X_grid='X_umap', V=V)
np.random.shuffle(binding_scores)
binding_scores
array([[2326852.2 , 2244617.5 , 2257599.  , ..., 1032774.  , 1101569.4 ,
         961977.94],
       [2324817.8 , 2242753.2 , 2255925.8 , ..., 1032444.9 , 1101183.8 ,
         961662.06],
       [2325369.8 , 2243328.2 , 2256494.2 , ..., 1032580.2 , 1101346.9 ,
         961793.75],
       ...,
       [2327202.2 , 2244936.2 , 2257899.  , ..., 1032709.25, 1101466.1 ,
         961898.75],
       [2325772.2 , 2243165.2 , 2255936.8 , ..., 1032531.44, 1101306.5 ,
         961750.56],
       [2325220.8 , 2243139.8 , 2256282.5 , ..., 1032513.7 , 1101264.5 ,
         961729.3 ]], dtype=float32)
try:
    scv.pl.velocity_embedding_stream(rna_sample, color="celltype")
except Exception:
    print("sample too small.")
../../_images/890e00b9d144d1fa1a0527c618a305b7da3dbc6d85b5d8da30aab1a536bf3793.png
# np.random.shuffle(binding_scores)
# ad.layers['velocity'] = binding_scores
# scv.tl.velocity_graph(ad, vkey='velocity', xkey='counts')
# ax = scv.pl.velocity_embedding_stream(ad, color='celltype', show=False) #  X_grid='X_umap', V=V)
import seaborn as sns


act = model.get_log_activities().detach().cpu().squeeze(0)

sns.heatmap(act, cmap="RdBu_r", cbar_kws={"label": "activities"})
<Axes: >
../../_images/3392bca82da1d25730cb6b3743f2b0d96ad2d640fba887b5de2678558043cacb.png
scv.pl.velocity_graph(rna)
../../_images/dac4d70560996e8a98d4d44125df99af6b01acf721b78b8b4ac8459b570df695.png
# ax = scv.pl.velocity_embedding_stream(ad,
#                                       color='celltype',
#                                       # density=2,
#                                       arrow_color='black',
#                                       n_neighbors=15) # show=False) #  X_grid='X_umap', V=V)
# ax = scv.pl.velocity_embedding_stream(ad, color='celltype', density=2, arrow_color='black', n_neighbors=15) # show=False) #  X_grid='X_umap', V=V)
# scv.pl.velocity_embedding_stream(ad, color='celltype', n_neighbors=15) #  X_grid='X_umap', V=V)

Study the asssociations betweeen obtained weights and cluster-specific transcription factors

Load information from archetypes DB (Vierstra et al 2020)

rna_sample, ad.shape
(AnnData object with n_obs × n_vars = 8161 × 14663
     obs: 'n_counts', 'sample', 'n_genes', 'log_genes', 'mt_frac', 'rp_frac', 'ambi_frac', 'nCount_RNA', 'nFeature_RNA', 'nCount_ATAC', 'nFeature_ATAC', 'nucleosome_signal', 'nucleosome_percentile', 'TSS.enrichment', 'TSS.percentile', 'S_score', 'G2M_score', 'phase', 'proliferation', 'celltype', 'nCount_peaks', 'nFeature_peaks', 'sample_batch', 'initial_size_unspliced', 'initial_size_spliced', 'initial_size', 'batch', 'velocity_self_transition'
     var: 'modality', 'Accession', 'Chromosome', 'End', 'Start', 'Strand', 'gene_count_corr', 'velocity_gamma', 'velocity_qreg_ratio', 'velocity_r2', 'velocity_genes'
     uns: 'celltype_colors', 'neighbors', 'velocity_graph', 'velocity_graph_neg', 'velocity_params'
     obsm: 'X_pca', 'X_pca_wsnn', 'X_spca_wsnn', 'X_umap', 'X_umap_ATAC', 'X_umap_GEX', 'X_umap_wsnn', 'lsi_full', 'lsi_red', 'umap', 'umap_ATAC', 'umap_GEX', 'velocity_umap'
     layers: 'Ms', 'Mu', 'ambiguous', 'matrix', 'spliced', 'unspliced', 'variance_velocity', 'velocity'
     obsp: 'connectivities', 'distances',
 (8161, 50000))
rna_sel = rna_sample  # rna[rna.obs_names.isin(ad.obs_names),:].copy()
rna_sel.shape
(8161, 14663)
pwd
'/mnt/c/Users/IgnacioIbarra/Dropbox/workspace/theislab/mubind/docs/notebooks/single_cell'
import bindome as bd


bd.constants.ANNOTATIONS_DIRECTORY = "annotations"
anno = mb.datasets.archetypes_anno()
rna_sel.shape
anno.sort_values("Cluster_ID")
Cluster_ID Name DBD Seed_motif Total_width Consensus_left Consensus_right Cluster_size
61 1 HD/1 homeodomain LHX6_homeodomain_3 12 0 12 2
72 2 HD/2 homeodomain ALX3_MA0634.1 26 8 16 186
79 3 HD/3 homeodomain VENTX_homeodomain_2 21 3 20 1
80 4 HD/4 homeodomain BARX1_MOUSE.H11MO.0.C 17 5 13 17
81 5 HD/5 homeodomain BARX1_homeodomain_1 21 2 18 6
... ... ... ... ... ... ... ... ...
163 282 PAX/2 PAX PAX5_HUMAN.H11MO.0.A 24 4 21 3
161 283 PAX-halfsite PAX Pax2_MA0067.1 8 1 7 1
0 284 AHR bHLH AHR_HUMAN.H11MO.0.B 9 2 8 3
105 285 KLF/SP/3 C2H2 KLF8_HUMAN.H11MO.0.C 9 0 9 2
285 286 ZSCAN4 C2H2 ZSCAN4_C2H2_1 15 1 14 2

286 rows × 8 columns

for optimize_log_dynamic in model_by_logdynamic:
    print(optimize_log_dynamic)
    model = model_by_logdynamic[optimize_log_dynamic]
    log_act = torch.stack(list(model.activities.log_activities), dim=1).squeeze(0)
    log_act = pd.DataFrame(log_act.detach().cpu().numpy())
    # log_act.columns = anno['Seed_motif'][2]
    # log_act.columns = ['intercept', 'dinuc_bias'] + list(anno['Seed_motif'].values)
    log_act.index = ad.obs_names
    ad.obsm["mubind_activities"] = log_act

    mb.pl.set_rcParams({"figure.figsize": [5, 3], "figure.dpi": 110})
    delta = log_act.max(axis=0) - log_act.min(axis=0)
    var = log_act.var(axis=0)
    plt.scatter(delta, var, color="gray", edgecolors="black")
    plt.xlabel("effect size")
    plt.ylabel("variability")
    plt.title("TF modules (by score) | GraphLayer = %i" % optimize_log_dynamic)
    plt.show()
False
../../_images/814a5db3112afb87015c96ed7d070856eacf67f1721efee477f3cb0a12c0cc47.png
True
../../_images/61527627a162f7bcad843485470b1086ab3647c482cd9752770031bc767d79b2.png
# unique names for annotation
names = anno["Name"]  # .sort_values('Name')
added = dict()
new_name = []
for name in names:
    if not name in added:
        new_name.append(name)
        added[name] = 0
    else:
        new_name.append(name + "_%i" % added[name])
        added[name] += 1
anno["Name_unique"] = new_name
from scipy.stats import spearmanr


res = []

for optimize_log_dynamic in model_by_logdynamic:

    if not optimize_log_dynamic:
        continue


    model = model_by_logdynamic[optimize_log_dynamic]

    log_act = torch.stack(list(model.activities.log_activities), dim=1).squeeze(0)

    log_act = pd.DataFrame(log_act.detach().cpu().numpy())

    # log_act.columns = anno['Seed_motif'][2]

    log_act.columns = ["intercept", "dinuc_bias"] + list(range(1, 287))

    log_act.index = ad.obs_names

    ad.obsm["mubind_activities"] = log_act


    mb.pl.set_rcParams({"figure.figsize": [5, 3], "figure.dpi": 90})

    delta = log_act.max(axis=0) - log_act.min(axis=0)

    var = log_act.var(axis=0)

    plt.scatter(delta, var)

    plt.xlabel("min-max range")

    plt.ylabel("variability")

    plt.title("TF modules (by score)")

    plt.show()


    for c in log_act:

        a = log_act[c]

        b = ad.obs["dynamic_score"].values

        # print(a.shape, b.shape)

        res.append([optimize_log_dynamic, c] + list(spearmanr(a, b)))


res = pd.DataFrame(res, columns=["opt_log_dynamic", "archetype_id", "spearman", "p_val"])
../../_images/059cecffe7a5879bb00cefb8cceeba8c5548788d6fffc3dee84e6b4a7b7cd4d9.png
# add archetypes name
meta = pd.DataFrame(pd.concat([delta, var], axis=1))
meta.columns = ["max_effect", "variability"]
meta["name"] = ["intercept", "dinuc_bias"] + list(range(1, 287))
clu = mb.datasets.archetypes_clu()
meta["archetypes_name"] = meta["name"].map(anno.set_index("Cluster_ID")["Name_unique"])
meta["archetypes_name"] = np.where(pd.isnull(meta["archetypes_name"]), meta["name"], meta["archetypes_name"])

meta["archetypes_seed"] = meta["name"].map(anno.set_index("Cluster_ID")["Seed_motif"])
meta = meta.sort_values("max_effect", ascending=0)
meta

res = res.merge(meta, left_on="archetype_id", right_on="name")
res = res.sort_values("p_val", ascending=True)
name_by_filter_id = meta["archetypes_name"].to_dict()
# name_by_filter_id

Observe general scores per binding layer filter

rcParams["figure.figsize"] = 3, 5
sns.barplot(
    data=res.sort_values("max_effect", ascending=False).head(25), x="max_effect", y="archetypes_name", color="orange"
)
<Axes: xlabel='max_effect', ylabel='archetypes_name'>
../../_images/02fb9c0d8162d571190d7c13a02e1aad7b2fe9f33eb0df84b0de988004b61d49.png
pd.options.display.width = 1000
# visualize the logos as obtained by the model in each step
mb.pl.set_rcParams({"figure.figsize": [5, 20], "figure.dpi": 90})
# mb.pl.logo(model, title=False, xticks=False, rowspan_dinuc=0, rowspan_mono=1, n_rows=40, n_cols=1, stop_at=5, log_odds=False)
mb.pl.logo(model, title=False, xticks=False, rowspan_dinuc=0, rowspan_mono=1, n_rows=40, n_cols=1, stop_at=20, log_odds=True)
# n_rows=len(res.head(20).index),
break
../../_images/40a84e8b23d6d422c7e7198f58d1165dde85fbc653cf1571d5ecb4b0b8a5e9db.png
mb.pl.set_rcParams({"figure.figsize": [2, 20], "figure.dpi": 90})
mb.pl.logo(
    model,
    title=False,
    xticks=False,
    rowspan_dinuc=0,
    rowspan_mono=1,
    n_rows=40,
    # n_rows=len(res.head(20).index),
    n_cols=1,
    order=res.head(20).index,
)  # n_cols=len(reduced_groups))
plt.tight_layout()
plt.show()
../../_images/d6e073d1b77db55be79de0c819ed3cf8b349eb34d80c1d1f42f8db9a576f367f.png
<Figure size 180x1800 with 0 Axes>
import resource

print("total GB used:", resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e6)
total GB used: 24.692796
# for k in ad.obsm['log_activities'].iloc[:,2:]:
#     ad.obs[str(k)] = ad.obsm['log_activities'][k]
# sc.pl.umap(ad, color=map(str, ad.obsm['log_activities'].iloc[:,2:]), cmap='Reds')
rna_sel.obsm["X_umap"] = ad.obsm["X_umap"]
def find_varname(ad, k, shuffle=False):
    if not shuffle:
        return ad.var_names[ad.var_names.str.upper().str.startswith(k.upper())]
    else:
        ad_sel = ad.var_names[ad.var_names.str.upper().str.startswith(k.upper())]
        return pd.Series(ad.var_names).sample(ad_sel.shape[0]).values
from scipy.stats import spearmanr, pearsonr

Calculate global correlations between the activities obtained per motif and gene-specific expression

all_targets = set()
for optimize_log_dynamic in model_by_logdynamic:
    print(optimize_log_dynamic)
    model = model_by_logdynamic[optimize_log_dynamic]

    log_act = torch.stack(list(model.activities.log_activities), dim=1).squeeze(0)
    log_act = pd.DataFrame(log_act.detach().cpu().numpy())
    log_act.index = ad.obs_names

    ad.obsm["log_activities"] = log_act
    ad.obsm["log_activities"].columns = ["intercept", "dinuc_bias"] + list(range(1, 287))

    # collect all targets
    for k in ad.obsm["log_activities"].iloc[:, 2:]:
        log_act = ad.obsm["log_activities"][k].values
        names = set()
        clu_sel = clu[clu["Cluster_ID"] == k]["Motif"]
        for g in clu_sel:
            names.add(g.split("_")[0].split(".")[0].split("+")[0].upper())
        for g in anno[anno["Cluster_ID"] == k]["Seed_motif"]:
            names.add(g.split("_")[0].split(".")[0])
        # print(k, names)
        targets = set()
        for name in names:
            target = find_varname(rna_sel, name)
            for t in target:
                all_targets.add(t)
        if len(targets) > 0 and False:
            sc.pl.umap(rna_sel, color=targets, cmap="Reds")
False
True
def get_act_gene_corr(model_by_logdynamic, shuffle=False, random_state=0):
    print('association between motif activities and related TF targets (shuffle = %i)' % shuffle)
    res = []
    # print(len(all_targets))

    rna_sel_df = rna_sel.to_df()

    for optimize_log_dynamic in model_by_logdynamic:
        print('use GraphLayer = %i' % optimize_log_dynamic)
        model = model_by_logdynamic[optimize_log_dynamic]

        log_act = torch.stack(list(model.activities.log_activities), dim=1).squeeze(0)
        log_act = pd.DataFrame(log_act.detach().cpu().numpy())
        log_act.index = ad.obs_names

        ad.obsm['log_activities'] = log_act
        ad.obsm['log_activities'].columns = ['intercept', 'dinuc_bias'] + list(range(1, 287))

        # if shuffle:
        #     random_cols = ad.obsm['log_activities'].iloc[:,2:].columns.values
        #     np.random.shuffle(random_cols)
        #     # random_cols
        
        for ki, k in enumerate(ad.obsm['log_activities'].iloc[:,2:]):
            next_log_act = ad.obsm['log_activities'][k].values

            # if shuffle:
            #     next_log_act = ad.obsm['log_activities'][random_cols[ki]].values
            
            # print(ki)
            # if ki % 30 == 0:
            #     print(ki)
            names = set()
            clu_sel = clu[clu['Cluster_ID'] == k]['Motif']
            for g in clu_sel:
                names.add(g.split('_')[0].split('.')[0].split('+')[0].upper())
            for g in anno[anno['Cluster_ID'] == k]['Seed_motif']:
                names.add(g.split('_')[0].split('.')[0])
            # print(k, names)
            next_targets = set()
            for name in names:
                target = find_varname(rna_sel, name, shuffle=shuffle)
                # print(name, target)
                for t in target:
                    next_targets.add(t)

            # for t in all_targets:
            for t in set(all_targets).intersection(next_targets):
                gex = rna_sel_df[[t]].to_numpy() # rna_sel_df[t].A
                assert gex.shape[1] == 1
                gex = gex.flatten()
                # print(next_log_act.shape, gex.shape)
                # print(t, pearsonr(next_log_act, gex))
                res.append([ki, optimize_log_dynamic, k, t, t in next_targets] +
                        list(spearmanr(next_log_act, gex)))
                
    res = pd.DataFrame(res, columns=['filter_id', 'opt_log_dynamic', 'archetype_id', 'gene_name', 'matched', 'spearman', 'p_val'])

    # p-values
    res['module_name'] = res['archetype_id'].map(anno.set_index('Cluster_ID')['Name'].to_dict())
    res['p_val'] = np.where(pd.isnull(res['p_val']), 1.0, res['p_val'])
    
    # p-val adjust
    from statsmodels.stats.multitest import fdrcorrection
    res['p_adj'] = fdrcorrection(res['p_val'])[1]
    return res
res = get_act_gene_corr(model_by_logdynamic)
print('\nRunning permutations...')
n_perm = 10
shuffled = []
for i in range(n_perm):
    print(i)
    shuffled.append(get_act_gene_corr(model_by_logdynamic, shuffle=1, random_state=i))
# shuffled = [get_act_gene_corr(model_by_logdynamic, shuffle=1, random_state=i) for i in range(50)]
association between motif activities and related TF targets (shuffle = 0)
use GraphLayer = 0
use GraphLayer = 1

Running permutations...
0
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
1
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
2
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
3
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
4
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
5
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
6
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
7
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
8
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
9
association between motif activities and related TF targets (shuffle = 1)
use GraphLayer = 0
use GraphLayer = 1
table = []
for use_graph in [False, True]:
    for thr in range(1, 10):
        sel = res[res['opt_log_dynamic'] == use_graph]
        next_thr = 10 ** (-thr)
        n_pos = sel[sel['p_adj'] < next_thr].shape[0]
        n_neg = [s[(s['p_adj'] < next_thr) & (s['opt_log_dynamic'] == use_graph)].shape[0] for s in shuffled]
        # print(next_thr, n_pos, np.mean(n_neg), np.std(n_neg), (n_pos - np.mean(n_neg)) / np.std(n_neg))
        table.append([next_thr, n_pos, np.mean(n_neg), np.std(n_neg), (n_pos - np.mean(n_neg)) / np.std(n_neg), use_graph])
table = pd.DataFrame(table, columns=['p_adj_thr', 'n_pos', 'mu', 'sigma', 'zscore', 'graph_layer'])
table.pivot(index='graph_layer', columns='p_adj_thr', values='zscore')
p_adj_thr 1.000000e-09 1.000000e-08 1.000000e-07 1.000000e-06 1.000000e-05 1.000000e-04 1.000000e-03 1.000000e-02 1.000000e-01
graph_layer
False 25.955710 27.872501 27.910294 35.532982 37.300271 38.454778 51.310414 54.805231 68.835313
True 10.512276 13.535528 13.312500 15.649224 26.665466 35.235294 35.433265 33.995949 63.204079
rcParams['figure.figsize'] = 4, 1
rcParams['figure.dpi'] = 120
rcParams['pdf.fonttype'] = 42
hm = table.pivot(index='graph_layer', columns='p_adj_thr', values='n_pos').fillna(0)
z = table.pivot(index='graph_layer', columns='p_adj_thr', values='zscore').fillna(0)
sns.heatmap(z, annot=hm, fmt='', cmap='Blues', # vmin=0, vmax=5,
            cbar_kws={'label': 'Z-score\n(permutations)'})

plt.savefig('../../../output/zscore_filter_act_gex_cor_pancreatic_endocrinogesis.pdf')
plt.show()
../../_images/b995ac8a0824a20e357b568230cd6f9158e44b4d795425d4566a8079363bd343.png
genes_by_module_name = (
    res.groupby(["module_name"])["gene_name"].apply(lambda grp: list(grp.value_counts().index)).to_dict()

)
# genes_by_module_name
res.sort_values("p_adj")
filter_id opt_log_dynamic archetype_id gene_name matched spearman p_val module_name p_adj
277 68 False 69 Meis2 True 0.253829 3.531138e-120 MIES 1.363019e-116
1711 245 False 246 Rfx6 True -0.246340 4.372267e-113 RFX/3 8.438475e-110
1588 150 False 151 Rest True 0.237454 5.574720e-105 REST/NRSF 7.172807e-102
1377 81 False 82 Pou3f4 True 0.227013 7.048015e-96 POU/2 6.801334e-93
1436 95 False 96 Ehf True 0.219672 9.477964e-90 ETS/2 7.316988e-87
... ... ... ... ... ... ... ... ... ...
3449 108 True 109 Egr1 True 0.000016 9.988476e-01 KLF/SP/2 9.996254e-01
2723 69 True 70 Thumpd1 True -0.000016 9.988485e-01 TBX/1 9.996254e-01
1152 69 False 70 Tnfrsf22 True 0.000005 9.996116e-01 TBX/1 9.998707e-01
2930 69 True 70 Tpd52l1 True 0.000008 9.993975e-01 TBX/1 9.998707e-01
1406 91 False 92 Sox11 True -0.000002 9.998756e-01 SOX/3 9.998756e-01

3860 rows × 9 columns

res["k"] = res["gene_name"] + "_" + res["archetype_id"].astype(str)
df2 = res.pivot(index="k", columns="opt_log_dynamic", values="spearman")
# df2 = res # .pivot(index='k', columns='opt_log_dynamic', values='spearman')
df2
opt_log_dynamic False True
k
Ahr_284 0.041996 0.023966
Aire_139 0.008289 0.021390
Ap2a1_264 -0.028934 -0.017731
Ap2a2_264 -0.020486 0.005195
Ap2b1_264 -0.050461 -0.016013
... ... ...
Zfp820_121 0.020229 0.011756
Zfp821_121 0.001718 -0.016109
Zfp825_121 0.018146 0.009312
Zfp827_121 -0.031124 -0.014224
Zfx_144 0.027819 0.030549

1930 rows × 2 columns

mb.pl.set_rcParams({"figure.figsize": [5, 4], "figure.dpi": 120})
# df2 = df2.sort_values('matched', ascending=True)
# plt.scatter(df2[True], df2[True],
#             color=np.where(df2['matched'], 'blue', 'gray'),
#             s=np.where(df2['matched'], 30, 5))
# plt.xlabel('TF activity (graph = off)')
# plt.ylabel('TF activity (graph = on)')
# plt.axhline(0, color='gray', ls='--', zorder=0)
# plt.axvline(0, color='gray', ls='--', zorder=0)
# df2[df2['matched'] == True].sort_values(True, ascending=False)
# res['arch_name'] = name_by_filter_id
res
filter_id opt_log_dynamic archetype_id gene_name matched spearman p_val module_name p_adj k
0 1 False 2 Pou6f1 True -0.012835 2.463036e-01 HD/2 4.397465e-01 Pou6f1_2
1 1 False 2 Pou6f2 True -0.004708 6.706317e-01 HD/2 8.131867e-01 Pou6f2_2
2 1 False 2 Nkx6-1 True 0.019021 8.575862e-02 HD/2 2.067634e-01 Nkx6-1_2
3 1 False 2 Dlx1as True -0.011318 3.066163e-01 HD/2 5.133309e-01 Dlx1as_2
4 1 False 2 Lmx1b True -0.011210 3.112775e-01 HD/2 5.183482e-01 Lmx1b_2
... ... ... ... ... ... ... ... ... ... ...
3855 280 True 281 Pax6 True -0.080454 3.368429e-13 PAX/1 9.490611e-12 Pax6_281
3856 280 True 281 Pax6os1 True -0.041961 1.495984e-04 PAX/1 1.075326e-03 Pax6os1_281
3857 282 True 283 Pax2 True -0.003662 7.408379e-01 PAX-halfsite 8.568437e-01 Pax2_283
3858 283 True 284 Ahr True 0.023966 3.038693e-02 AHR 9.556552e-02 Ahr_284
3859 284 True 285 Klf8 True -0.019348 8.050853e-02 KLF/SP/3 1.976863e-01 Klf8_285

3860 rows × 10 columns

rcParams["figure.figsize"] = 4, 4
rcParams["figure.dpi"] = 90

for optimize_log_dynamic, grp in res.groupby("opt_log_dynamic"):
    grp["minus_log10_pval"] = -np.log10(grp["p_val"])
    grp = grp.sort_values("matched")
    plt.scatter(
        grp["spearman"],
        grp["minus_log10_pval"],
        s=np.power(grp["minus_log10_pval"], 2),
        color=np.where(grp["matched"], "red", "blue"),
    )
    plt.ylabel("-log(p-adj)")
    plt.xlabel("spearman")
    plt.title("corr(filter, GEX) | GraphLayer = %i" % optimize_log_dynamic)
    plt.axhline(1, ls="--", color="red", lw=0.6)
    plt.show()
../../_images/d2326193ea88f078b688e6c6ad62ed1911dc68f856feb94ebfbaf53a967c925b.png ../../_images/66d34b13ad0f6707485653de91c8b89a7761e0196a3a0321c5c5c71b1f8421af.png
# sc.pl.umap(ad, color=[96], cmap='RdBu_r')
# sc.pl.umap(rna_sel, color=['Ehf', 'Ergic2'], cmap='plasma')
rcParams["figure.figsize"] = 3, 3
rcParams["figure.dpi"] = 90
plt.hist(res["p_val"], color="gray", bins=20, label="raw", alpha=0.5, edgecolor="black")
plt.hist(res["p_adj"], color="red", bins=20, label="adjusted (BH)", alpha=0.5, edgecolor="black")
plt.xlabel("p-value")
plt.legend()
plt.ylabel("# associations")
Text(0, 0.5, '# associations')
../../_images/83e4d98a0a251be2f71456f4690d7b216dc0f8253aa0985f82f9535eeba7905c.png
res[res["p_adj"] < 0.05]
filter_id opt_log_dynamic archetype_id gene_name matched spearman p_val module_name p_adj k
9 1 False 2 Pdx1 True 0.032841 3.006220e-03 HD/2 1.455961e-02 Pdx1_2
11 1 False 2 Pax4 True -0.033665 2.353099e-03 HD/2 1.184219e-02 Pax4_2
18 7 False 8 Arx True -0.081863 1.295229e-13 HD/8 3.845833e-12 Arx_8
19 7 False 8 Arxes2 True 0.034815 1.657269e-03 HD/8 8.787167e-03 Arxes2_8
20 9 False 10 Hnf1aos1 True 0.035551 1.317555e-03 HD/10 7.163043e-03 Hnf1aos1_10
... ... ... ... ... ... ... ... ... ... ...
3842 262 True 263 Tfap2c True 0.027537 1.285757e-02 TFAP2/2 4.823149e-02 Tfap2c_263
3847 264 True 265 Ctcf True 0.046348 2.806301e-05 CTCF 2.391241e-04 Ctcf_265
3849 267 True 268 Plag1 True -0.028093 1.115049e-02 PLAG1 4.317042e-02 Plag1_268
3855 280 True 281 Pax6 True -0.080454 3.368429e-13 PAX/1 9.490611e-12 Pax6_281
3856 280 True 281 Pax6os1 True -0.041961 1.495984e-04 PAX/1 1.075326e-03 Pax6os1_281

1033 rows × 10 columns

pval_thr = 1e-5
sel_genes = set(list(res[res["p_adj"] < pval_thr]["gene_name"]))
log_act = ad.obsm["log_activities"].copy()
log_act.shape
(8161, 288)
cols_act = ["intercept", "dinuc_bias"] + [name_by_filter_id[k] for k in log_act.columns[2:]]
log_act.columns = cols_act
log_act
intercept dinuc_bias HD/1 HD/2 HD/3 HD/4 HD/5 HD/6 HD/7 HD/8 ... GMEB2/2 GMEB2/3 FOX/9 SIX/2 PAX/1 PAX/2 PAX-halfsite AHR KLF/SP/3 ZSCAN4
AAACAGCCAACAGCCT-1-0 3.806271e-08 0.000003 7.695544e-05 0.000185 0.000027 0.000018 0.000150 0.000090 0.000008 0.000290 ... 0.000683 0.000745 0.000040 -0.000298 0.000297 0.000039 0.000093 0.002540 -0.000023 0.000095
AAACAGCCAACCCTCC-1-0 -1.162185e-07 -0.000003 4.249436e-07 -0.001115 -0.000109 -0.000263 -0.000553 -0.000185 -0.001000 -0.002746 ... -0.000548 -0.005312 -0.000712 -0.001427 -0.005037 -0.000265 -0.002616 -0.003395 -0.000373 -0.000473
AAACAGCCACTAAGCC-1-0 1.481323e-06 -0.000031 4.224365e-04 0.061922 0.004885 0.001461 0.002665 0.008734 0.030506 -0.058731 ... -0.048243 0.090099 0.075321 0.071688 0.167776 0.009557 0.061679 -0.104413 0.061235 -0.030773
AAACAGCCAGGATAAC-1-0 -2.535439e-08 -0.000002 -6.218390e-06 -0.000374 -0.000033 -0.000034 -0.000129 -0.000039 -0.000187 -0.000504 ... -0.000163 -0.000923 -0.000504 -0.000577 -0.000655 -0.000044 -0.000708 -0.002082 0.000082 -0.000144
AAACAGCCATAAAGCA-1-0 1.866635e-07 0.000009 1.157708e-04 0.001055 0.000114 0.000441 0.000247 0.000247 0.000514 0.003035 ... 0.002118 0.002020 0.003678 0.002987 0.003853 0.000299 0.001518 0.003987 0.001712 0.000473
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
TTTGTGTTCACGAATC-1-1 -3.571261e-08 -0.000003 -2.040713e-05 -0.000369 -0.000042 -0.000082 -0.000164 -0.000066 -0.000259 -0.000607 ... -0.000259 -0.001202 -0.000679 -0.000618 -0.000922 -0.000043 -0.000762 -0.001915 -0.000268 -0.000198
TTTGTGTTCGAGCTAT-1-1 -3.510256e-08 -0.000002 2.103153e-05 -0.000582 -0.000046 -0.000057 -0.000249 -0.000077 -0.000564 -0.001008 ... -0.000085 -0.001882 -0.000534 -0.001164 -0.000909 -0.000017 -0.001656 -0.002895 -0.000303 -0.000225
TTTGTGTTCTCAATGA-1-1 -4.718245e-08 -0.000003 -3.709015e-05 -0.000300 -0.000047 -0.000153 -0.000134 -0.000069 -0.000327 -0.000638 ... -0.000368 -0.001265 -0.000826 -0.000719 -0.001065 -0.000046 -0.000662 -0.001764 -0.000471 -0.000174
TTTGTTGGTATTGAGT-1-1 -3.771812e-08 -0.000003 -1.994044e-05 -0.000378 -0.000042 -0.000116 -0.000139 -0.000064 -0.000266 -0.000597 ... -0.000290 -0.001464 -0.000433 -0.000576 -0.000906 -0.000041 -0.000935 -0.001973 -0.000344 -0.000205
TTTGTTGGTTGTTGCT-1-1 -4.738262e-08 -0.000003 -3.046263e-05 -0.000377 -0.000069 -0.000157 -0.000204 -0.000086 -0.000439 -0.001027 ... -0.000442 -0.001635 -0.000916 -0.000986 -0.000826 -0.000029 -0.001108 -0.002171 -0.000469 -0.000263

8161 rows × 288 columns

import anndata


ad_act = anndata.AnnData(log_act)

ad_act.obsm["X_umap"] = ad.obsm["X_umap"]

ad_act.obs = ad.obs
sc.pl.umap(ad_act, color="celltype")
../../_images/7d32b23bb36733a3bcce8fdfb8a21d93f4750596b94dea19b98a0d7974976f97.png

Rank genes groups using the annotation

sc.tl.rank_genes_groups(ad_act, "celltype")
rkg_df = []
for ct in ad_act.obs["celltype"].values.unique():
    print(ct)
    rkg_df2 = sc.get.rank_genes_groups_df(ad_act, ct)
    rkg_df2["celltype"] = ct
    rkg_df.append(rkg_df2)
rkg_df = pd.concat(rkg_df)
rkg_df["module_name"] = rkg_df["names"].map(anno.set_index("Cluster_ID")["Name"].to_dict())
rkg_df["module_name"] = np.where(~pd.isnull(rkg_df["module_name"]), rkg_df["module_name"], rkg_df["names"])
rkg_df.head()
Fev+ Alpha
Fev+
Ngn3 high
Eps. progenitors
Ngn3 low
Fev+ Beta
Ductal
Fev+ Delta
Imm. Acinar
Alpha
Prlf. Ductal
Epsilon
Ngn3 high cycling
Beta
Delta
Mat. Acinar
names scores logfoldchanges pvals pvals_adj celltype module_name
0 GCM 0.910906 3.654080 0.362733 0.627158 Fev+ Alpha GCM
1 ZNF431 0.515329 4.179523 0.606397 0.835609 Fev+ Alpha ZNF431
2 NR/15 0.422957 NaN 0.672541 0.905102 Fev+ Alpha NR/15
3 HD/16 0.403896 NaN 0.686494 0.916561 Fev+ Alpha HD/16
4 ARI5B 0.402635 NaN 0.687421 0.916561 Fev+ Alpha ARI5B

Get top modules

ad_act.var_names = ad_act.var_names.map(rkg_df.set_index("names")["module_name"].to_dict())
sc.tl.rank_genes_groups(ad_act, "celltype")
rcParams["figure.figsize"] = 3.5, 3.5
rcParams["figure.dpi"] = 80
sc.pl.rank_genes_groups(ad_act)
../../_images/86a290317e64ce0c12eda80361dc9012bca4aca5a8d1ff5a6ab4ba2632bcdb2d.png
set(res[(res["p_adj"] < 1e-5)]["k"])
{'Araf_248',
 'Arc_248',
 'Arcn1_248',
 'Arel1_248',
 'Arf3_248',
 'Arf5_248',
 'Arf6_248',
 'Arfgap1_248',
 'Arfgap3_248',
 'Arfgef1_248',
 'Arfgef2_248',
 'Arfgef3_248',
 'Arhgap10_248',
 'Arhgap11a_248',
 'Arhgap18_248',
 'Arhgap19_248',
 'Arhgap21_248',
 'Arhgap24_248',
 'Arhgap26_248',
 'Arhgap28_248',
 'Arhgap32_248',
 'Arhgap35_248',
 'Arhgap36_248',
 'Arhgap42_248',
 'Arhgap5_248',
 'Arhgap6_248',
 'Arhgdig_248',
 'Arhgef11_248',
 'Arhgef12_248',
 'Arhgef19_248',
 'Arhgef26_248',
 'Arhgef38_248',
 'Arhgef39_248',
 'Arhgef40_248',
 'Arhgef5_248',
 'Arhgef6_248',
 'Arhgef7_248',
 'Arhgef9_248',
 'Arid1b_248',
 'Arid2_248',
 'Arid3b_248',
 'Arid4b_248',
 'Arih1_248',
 'Arih2_248',
 'Arl13b_248',
 'Arl15_248',
 'Arl3_248',
 'Arl4a_248',
 'Arl4c_248',
 'Arl6ip1_248',
 'Arl6ip4_248',
 'Arl8a_248',
 'Armc4_248',
 'Armc8_248',
 'Armc9_248',
 'Armh3_248',
 'Arnt_248',
 'Arpc1a_248',
 'Arpc1b_248',
 'Arpc2_248',
 'Arpc5_248',
 'Arpc5l_248',
 'Arpp19_248',
 'Arrdc4_248',
 'Arsa_248',
 'Arx_248',
 'Arx_8',
 'Bhlha15_62',
 'Cebpa_52',
 'Cebpb_52',
 'Cebpd_52',
 'Ctcf_265',
 'Cux1_14',
 'Dbpht2_52',
 'Ebf1_174',
 'Ehf_96',
 'Ehf_98',
 'Elf5_96',
 'Elf5_98',
 'Erf_98',
 'Erg28_96',
 'Erg28_98',
 'Ergic2_96',
 'Ets1_96',
 'Ets1_98',
 'Etv1_98',
 'Etv6_96',
 'Etv6_98',
 'Fev_96',
 'Fev_98',
 'Foxo1_164',
 'Foxo1_77',
 'Foxo1_79',
 'Foxo1_80',
 'Foxo6_80',
 'Foxp1_79',
 'Gata4_242',
 'Gli3_262',
 'Glis3_261',
 'Hes1_59',
 'Id2_58',
 'Insm1_266',
 'Irf2_104',
 'Irf2_106',
 'Irf2bp2_104',
 'Irf2bp2_106',
 'Irf2bpl_104',
 'Irf2bpl_106',
 'Isl1_2',
 'Jun_49',
 'Jund_50',
 'Klf12_109',
 'Klf15_113',
 'Klf6_109',
 'Lin54_224',
 'Mafa_55',
 'Mafb_50',
 'Mafb_53',
 'Mafb_55',
 'Mafg_53',
 'Mafg_55',
 'Mecom_241',
 'Meis1_69',
 'Meis2_69',
 'Meis3_69',
 'Mga_73',
 'Mlxipl_58',
 'Mxi1_59',
 'Myb_252',
 'Mybbp1a_252',
 'Mybl1_252',
 'Mybl1_255',
 'Mybl1_256',
 'Mybl2_252',
 'Mybl2_255',
 'Myc_59',
 'Mycbp2_59',
 'Mycn_59',
 'Neurod1_63',
 'Nfatc2_100',
 'Nfia_188',
 'Nfia_189',
 'Nfib_189',
 'Nfil3_52',
 'Nr2c2_30',
 'Nr5a2_40',
 'Nr6a1_40',
 'Onecut1_14',
 'Pax6_281',
 'Pax6os1_281',
 'Pbx1_12',
 'Pdx1_21',
 'Pou3f1_82',
 'Pou3f4_82',
 'Pou3f4_85',
 'Ppargc1b_32',
 'Prdm16_105',
 'Ptf1a_63',
 'Rara_40',
 'Rbpjl_165',
 'Relch_159',
 'Rell1_159',
 'Rest_151',
 'Rfx2_245',
 'Rfx2_247',
 'Rfx3_245',
 'Rfx3_247',
 'Rfx6_246',
 'Rora_32',
 'Rora_42',
 'Rora_48',
 'Runx1t1_179',
 'Rxra_33',
 'Rxra_44',
 'Rxrg_30',
 'Rxrg_33',
 'Rxrg_40',
 'Six4_180',
 'Sox4_89',
 'Sox5_91',
 'Sox6_89',
 'Sox9_190',
 'Sox9_191',
 'Sox9_202',
 'Sox9_89',
 'Sox9_92',
 'Sox9_93',
 'Sox9_94',
 'Stat3_171',
 'Tacc3_70',
 'Taf13_113',
 'Taf1d_113',
 'Taf1d_70',
 'Tagln2_70',
 'Tbc1d1_70',
 'Tbc1d31_70',
 'Tbc1d8_70',
 'Tbc1d9_70',
 'Tcf7l1_29',
 'Tcf7l1_70',
 'Tcf7l2_120',
 'Tcf7l2_29',
 'Tcf7l2_70',
 'Tdrd7_70',
 'Tead2_166',
 'Tead2_70',
 'Tecpr1_70',
 'Tenm3_70',
 'Tent5a_70',
 'Tgif1_69',
 'Tgif2_69',
 'Tgm2_70',
 'Thra_34',
 'Thsd4_70',
 'Tiam1_70',
 'Ticrr_70',
 'Tinagl1_70',
 'Tjp1_70',
 'Tk1_70',
 'Tlnrd1_70',
 'Tmcc3_70',
 'Tmed10_70',
 'Tmem108_70',
 'Tmem163_70',
 'Tmem164_70',
 'Tmem176b_70',
 'Tmem178b_70',
 'Tmem181a_70',
 'Tmem184a_70',
 'Tmem200a_70',
 'Tmod2_70',
 'Tmpo_70',
 'Tmsb10_70',
 'Tmsb15b2_70',
 'Tmsb4x_70',
 'Tmtc2_70',
 'Tmx1_70',
 'Tnik_70',
 'Tnr_70',
 'Tob1_70',
 'Top2a_70',
 'Topbp1_70',
 'Tox3_70',
 'Tpcn1_70',
 'Tpm3_70',
 'Tpr_70',
 'Tpst2_70',
 'Tpt1_70',
 'Tpx2_70',
 'Trim28_70',
 'Trim35_70',
 'Trim37_70',
 'Trim59_70',
 'Trim9_70',
 'Trip13_70',
 'Trp53_70',
 'Trp53i11_70',
 'Trp73_70',
 'Trpc1_70',
 'Trpm3_70',
 'Try10_70',
 'Try5_70',
 'Tshz1_70',
 'Tshz2_70',
 'Tshz3_70',
 'Tspan7_70',
 'Tspear_70',
 'Ttc28_70',
 'Ttf2_70',
 'Ttk_70',
 'Ttr_70',
 'Ttyh1_70',
 'Ttyh2_70',
 'Ttyh3_70',
 'Tuba1a_70',
 'Tuba1b_70',
 'Tubb2a_70',
 'Tubb3_70',
 'Tubb5_70',
 'Tuft1_70',
 'Tusc3_70',
 'Tut4_70',
 'Tyms_70',
 'Vdr_44',
 'Zbtb7c_262',
 'Zeb1_67',
 'Zeb1_68'}
res[res["module_name"].str.contains("HD")].sort_values("p_adj")
filter_id opt_log_dynamic archetype_id gene_name matched spearman p_val module_name p_adj k
42 20 False 21 Pdx1 True 0.085723 8.687087e-15 HD/16 2.865996e-13 Pdx1_21
18 7 False 8 Arx True -0.081863 1.295229e-13 HD/8 3.845833e-12 Arx_8
1946 1 True 2 Isl1 True -0.075389 9.154130e-12 HD/2 2.222323e-10 Isl1_2
25 11 False 12 Pbx1 True 0.065727 2.795715e-09 HD/12 5.042739e-08 Pbx1_12
1971 18 True 19 Meis1 True 0.053219 1.506326e-06 HD/14 1.680467e-05 Meis1_19
... ... ... ... ... ... ... ... ... ... ...
1973 20 True 21 Pbx2 True -0.002056 8.526826e-01 HD/16 9.218578e-01 Pbx2_21
1933 1 True 2 Dlx1as True -0.001950 8.602075e-01 HD/2 9.268220e-01 Dlx1as_2
46 23 False 24 Arid3a True -0.001547 8.888475e-01 HD/18 9.415344e-01 Arid3a_24
12 1 False 2 Arid3b True 0.000642 9.537659e-01 HD/2 9.767940e-01 Arid3b_2
7 1 False 2 Mnx1 True -0.000469 9.662018e-01 HD/2 9.827507e-01 Mnx1_2

68 rows × 10 columns

mod_names_best = set(rkg_df.sort_values("scores", ascending=False).groupby("celltype").head(5)["module_name"])
best = rkg_df[rkg_df["module_name"].isin(mod_names_best)]
rcParams["figure.dpi"] = 130
sns.clustermap(
    best.pivot(index="celltype", columns="module_name", values="scores"),
    cbar_kws={"label": "activity"},
    cmap="RdBu_r",
    vmin=-5,
    vmax=5,
    figsize=[6.2, 5],
    # dpi=100,
    xticklabels=True,
)
<seaborn.matrix.ClusterGrid at 0x7fc897a76e20>
../../_images/fc1f337bfb169d95d5128c27b00f385747c9f052c45f0f044612457d4df196f9.png
rna_tfs = rna_sel.to_df()[list(set(res["gene_name"]))]
rna_tfs["celltype"] = rna_sel.obs["celltype"]
mean_tfs = rna_tfs.groupby("celltype").mean()

act_tfs_df = ad_act.to_df()
act_tfs_df["celltype"] = ad_act.obs["celltype"]
mean_act_tf = act_tfs_df.groupby("celltype").mean()
corr_celltype = []
for i, c1 in enumerate(mean_act_tf):
    if i % 50 == 0:
        print(i, mean_act_tf.shape[1])
    for j, c2 in enumerate(mean_tfs):
        if not c1 in genes_by_module_name or not c2 in genes_by_module_name[c1]:
            continue
        a = mean_act_tf[c1]
        b = mean_tfs[c2]
        corr_celltype.append([c1, c2, mean_act_tf.index[np.argmax(mean_act_tf[c1])]] + list(pearsonr(a, b)))

corr = pd.DataFrame(corr_celltype, columns=["module_name", "gene_name", "cell_type", "pearsonr", "p_val"])
corr = corr.sort_values("pearsonr", ascending=False)
0 288
50 288
100 288
150 288
200 288
250 288
rcParams['figure.dpi'] = 150
# activators
module_names = corr[corr['pearsonr'] > 0].sort_values('p_val').sort_values('p_val').groupby('cell_type').head(3)['module_name'].drop_duplicates()
gene_names   = corr[corr['pearsonr'] > 0].sort_values('p_val').sort_values('p_val').groupby('cell_type').head(3)['gene_name']
sc.pl.matrixplot(ad_act,
            groupby=cell_type_key,
            cmap='Blues',
            show=False,
            var_names=module_names,
            figsize=[4, 1.9],
            standard_scale='var',
            colorbar_title='mean activity in group')

plt.savefig('../../../output/filter_activities_pancreatic_endocrinogenesis.pdf')
plt.show()

sc.pl.dotplot(rna_sel,
              groupby=cell_type_key,
              standard_scale='var',
              show=False,
              colorbar_title='mean GEX',
              var_names=gene_names,
              figsize=[5, 1.9])

plt.savefig('../../../output/gene_expression_pancreatic_endocrinogenesis.pdf')
plt.show()
../../_images/6af5ff3f4ebe0e9286a76317d393a72da5d5f8e8186050ccf213f4ac630400eb.png ../../_images/2f48575d57d3e21b42cdc772ddc3f9b3e8a6389c48a188903e92ad83e1d244b3.png
module_names
166     CCAAT/CEBP
1467        NFAT/2
158     CREB/ATF/2
1517      KLF/SP/2
1546      GC-tract
602          TBX/1
1712         RFX/1
276           MIES
37           CUX/3
1358         FOX/4
109          NR/16
1449         ETS/1
70            NR/3
1621         FOX/7
36           CUX/1
1920       GMEB2/2
1527           EGR
1634        BCL6/2
1685         HD/23
132          NR/19
Name: module_name, dtype: object
module_names
166     CCAAT/CEBP
1467        NFAT/2
158     CREB/ATF/2
1517      KLF/SP/2
1546      GC-tract
602          TBX/1
1712         RFX/1
276           MIES
37           CUX/3
1358         FOX/4
109          NR/16
1449         ETS/1
70            NR/3
1621         FOX/7
36           CUX/1
1920       GMEB2/2
1527           EGR
1634        BCL6/2
1685         HD/23
132          NR/19
Name: module_name, dtype: object
module_names.map(res.set_index('module_name')['filter_id'].to_dict())
166      51
1467     98
158      49
1517    108
1546    112
602      69
1712    246
276      68
37       14
1358     78
109      44
1449     97
70       31
1621    163
36       13
1920    276
1527    110
1634    169
1685    203
132      47
Name: module_name, dtype: int64
module_names
166     CCAAT/CEBP
1467        NFAT/2
158     CREB/ATF/2
1517      KLF/SP/2
1546      GC-tract
602          TBX/1
1712         RFX/1
276           MIES
37           CUX/3
1358         FOX/4
109          NR/16
1449         ETS/1
70            NR/3
1621         FOX/7
36           CUX/1
1920       GMEB2/2
1527           EGR
1634        BCL6/2
1685         HD/23
132          NR/19
Name: module_name, dtype: object
anno[anno['Name'].str.contains('ETS')]
Cluster_ID Name DBD Seed_motif Total_width Consensus_left Consensus_right Cluster_size Name_unique
36 98 ETS/1 ETS EHF_ETS_1 25 7 17 71 ETS/1
37 96 ETS/2 ETS EHF_HUMAN.H11MO.0.B 18 3 14 24 ETS/2
name_by_filter_id = meta["archetypes_name"].to_dict()
name_by_filter_id[1]
'HD/1'
test_names = list(anno.sort_values('Cluster_ID')['Name'])[:1]
print(test_names)

pd.Series(test_names).map(res.set_index('module_name')['filter_id'].to_dict())
['HD/1']
0   NaN
dtype: float64
filter_id_by_name = {v: k for k, v in zip(name_by_filter_id.keys(), name_by_filter_id.values())}
for k in list(anno.sort_values('Cluster_ID')['Name']):
    if k != 'SOX/3':
        continue
    print(k)
    mb.pl.logo(
        model,
        title=False,
        xticks=False,
        rowspan_dinuc=0,
        rowspan_mono=1,
        n_rows=40,
        log_odds=True,
        # stop_at=11,
        show=False,
        # n_rows=len(res.head(20).index),
        n_cols=1,
        order=pd.Series([k]).map(filter_id_by_name) + 2,
    )  # n_cols=len(reduced_groups))
    # plt.tight_layout()
    plt.show()
SOX/3
../../_images/0f31904b1343508ad1da1224a44b5df1b80eceb9aaa3ad8ac8ccf719a898f537.png
custom_names = pd.Series(['MYB/5', 'MYB/1', 'FOX/4', 'NR/20', 'TBX/1', 'ETS/1', 'YY1', 'STAT/1', 'NR/17', 'HD/14'])
# custom_names = pd.Series(['FOX/4',])
custom_names.map(res.set_index('module_name')['filter_id'].to_dict()),
(0    251
 1    252
 2     78
 3    247
 4     69
 5     97
 6    144
 7    166
 8     45
 9     18
 dtype: int64,)
print(custom_names.map(res.set_index('module_name')['filter_id'].to_dict()))
mb.pl.set_rcParams({"figure.figsize": [2, 20], "figure.dpi": 90})
mb.pl.logo(
    model,
    title=False,
    xticks=False,
    rowspan_dinuc=0,
    rowspan_mono=1,
    n_rows=40,
    log_odds=True,
    # stop_at=11,
    show=False,
    # n_rows=len(res.head(20).index),
    n_cols=1,
    order=custom_names.map(filter_id_by_name) + 2,
)  # n_cols=len(reduced_groups))
# plt.tight_layout()

plt.savefig('../../../output/motif_pancreatic_endocrinogenesis_publication.pdf')
plt.show()
0    251
1    252
2     78
3    247
4     69
5     97
6    144
7    166
8     45
9     18
dtype: int64
../../_images/340dcca62f77e6cecec9f587990ef23e74730534ac7b47e4584b19a60320496a.png
custom_names = pd.Series(['GLI', 'FOX/5'])

print(custom_names.map(res.set_index('module_name')['filter_id'].to_dict()))
mb.pl.set_rcParams({"figure.figsize": [2, 20], "figure.dpi": 90})
mb.pl.logo(
    model,
    title=False,
    xticks=False,
    rowspan_dinuc=0,
    rowspan_mono=1,
    n_rows=40,
    log_odds=True,
    # stop_at=11,
    show=False,
    # n_rows=len(res.head(20).index),
    n_cols=1,
    order=custom_names.map(filter_id_by_name) + 2,
)  # n_cols=len(reduced_groups))
# plt.tight_layout()

plt.savefig('../../../output/motif_pancreatic_neurogenesis_publication_subset.pdf')
plt.show()
0    261
1     79
dtype: int64
../../_images/b302151474146753b656cab50e0e0b5408770c6c70a9c304bf46562e18a56a26.png
# repressors
module_names = corr[corr['pearsonr'] < 0].sort_values('p_val').sort_values('p_val').groupby('cell_type').head(3)['module_name'].drop_duplicates()
gene_names   = corr[corr['pearsonr'] < 0].sort_values('p_val').sort_values('p_val').groupby('cell_type').head(3)['gene_name']
sc.pl.matrixplot(ad_act,
            groupby=cell_type_key,
            cmap='Blues',
            var_names=module_names,
            figsize=[4, 1.9],
            standard_scale='var',
            colorbar_title='mean activity in group')
sc.pl.dotplot(rna_sel,
              groupby=cell_type_key,
              colorbar_title='mean GEX',
              standard_scale='var',
              var_names=gene_names,
              figsize=[4.5, 1.9])
../../_images/9450b2c44c94bf5962ef9d1102f00dc264df751d26a4f4baff6efae0923a89e5.png ../../_images/a33b2bd1b8b71529a4e0fbc12219bd5199d6647ec03f6756aae60f3590eb5c28.png
# sc.pl.dotplot(rna_sel, groupby='celltype', var_names=list(set(res['gene_name'])))
for ri, r in corr.sort_values("p_val").groupby("cell_type").head(3).iterrows():
    # ad_act.obs['HD/2'] = log_act['HD/2']
    sc.pl.dotplot(
        ad_act,
        groupby="celltype",
        cmap="Blues",
        var_names=r["module_name"],
        figsize=[2, 3.2],
        colorbar_title="mean activity in group",
    )
    sc.pl.dotplot(rna_sel, groupby="celltype", var_names=r["gene_name"], figsize=[2, 3.2])
../../_images/70ba1f93761f0526f38b8f1fff1dc88c4f4e0d119d42bf7040bfd477c054feed.png ../../_images/a282cfa3fa8e00ee578a2fbf9af21d9134969ebede813e3896b7df3eb28215ff.png ../../_images/21fe8be0775af945a12f29e2442d0087525e600bdcb0f0ec7e2afaef1d613979.png ../../_images/78b4dc27b0cff17561ea859fce0cf4097a6d7bb74b31c1e9794377f653b730a6.png ../../_images/124f6766c62a636fe76e7e4b79567033847f7e99ffb9b7bd2bd71f8777d0f9d0.png ../../_images/f87477146038f2d97d13e616803f4a21700bd006a5455bf22a9ebfd63b95417c.png ../../_images/5997cbd6e76397b05cbcc8b6d13d705ff7e85f67d17e10cb46070d96d699af21.png ../../_images/bed719de6ab492ae91bcb76eced7a12e35e4ea95e8fbfb2b6cf83c52bc219f11.png ../../_images/76b5da36452dae4e8edfe7ab17a3796db2256c5088ae8db3306eadfd1d1cfbc1.png ../../_images/7bf319d25572138299f7fe2ce7b1aaac40f6216f0a82fe1660c8ef5327ea4b07.png ../../_images/75be27d6b0a54091440b9a8d304c62e923e53152a9c28b5b9b288ffe94675907.png ../../_images/2599a9cd0188b49824bd0d08d5d02e2fd6a0981bedb8967c064dd346798ec983.png ../../_images/ddc8b5f2d819f559779f04e6cfe3700027ff9bad8f55eb04c40acbca950db5c1.png ../../_images/a53758651225e48525ae04e41da5b64a3c9af7a8e5b99d1d3ae3e39df5fdd1e5.png ../../_images/3da87a8301fb3f9ab7cb4ad9b15688bd0bd02c0d86578676e63842716f9171a7.png ../../_images/adb0f5f72dc13d9ab72bf5f46a9e62862d24218410fe96ddd7c3b38cb57b17bc.png ../../_images/77fdfac1234602206b178c161fbdd189f41803a3838d75bcb71de4fe55df8e1e.png ../../_images/c35be2c2ed0b8db293f37e153fd21cefdd32c16a0c7cda83444a768e14f3abbb.png ../../_images/6d9fcbfc1475b88f06544b240cef1898caa4c61dcfb2ff5f42cf5cbf99b5eaa8.png ../../_images/545d81abc6a1f01c0ee070a14caab381eeec87d5c794b1220b3ee8b873c6c179.png ../../_images/c7ed26241053d7fe208a341ddf9c99b97c6adcfdae88a77e949f02f38ab46b58.png ../../_images/c596bdbe7f43f87239b52366b8f85c0e5ae337b41d4c561fd7abe64da66e26a2.png ../../_images/8dd38668f5cea5ad4286c9df08841527e81c5e26fe5dbe3e6f0cd017a67bc756.png ../../_images/96bf2e91c666e1fbf6c314857561a0b4929619d7fbec156cff8029b719d1e653.png ../../_images/bb94e6c0383046dd50bcc75999d2f71954303768175b78897857b17239eb57be.png ../../_images/6e74ddae89f8f0e47cba797dcb7aebdc84b0d82957a0f1fd3b9c27e1f74ca74e.png ../../_images/51ef082f9525072be0d2ce408f53e52114499f20ca18a5b09e798ed1b4bb0a44.png ../../_images/810e80e9bb2f8f019e719d233385aef7dab8681d623ed9062f808a7f8053e8cc.png ../../_images/ee8356036df14b955c048b55981747ac923d0c3ecc25b50466020380577e2696.png ../../_images/8e3d31f0daa4ac71aea0cceb7939a9684ea4fb9c35e79548fe22f1c0d6eedfd8.png ../../_images/4f443dbee6ea874c497893f1a144d53bab212b148dbf66e6c52a32e315eacd3f.png ../../_images/6add125f8fbc21bc82b5e22d8a4e3e4275bd593df793921f39149c27723f253e.png ../../_images/a4a9b413837a614569b7a34761c361ca95b0ddad3eac2e9ca77a6f754eb01494.png ../../_images/8e454247e602a3b5eac1f118734ebdafd2a95b380c2c0b8d783dc9d5e030c8bb.png ../../_images/21fe8be0775af945a12f29e2442d0087525e600bdcb0f0ec7e2afaef1d613979.png ../../_images/89511a5a8a22c133f1a1cdf0f6387b0827a1e6812bf1d6049e8f4f43e9c781a7.png ../../_images/5794b6e7b1aa0bd155791ceabb9f090ce2ab5723e3a387a2620889fb99c45ca0.png ../../_images/7bf319d25572138299f7fe2ce7b1aaac40f6216f0a82fe1660c8ef5327ea4b07.png ../../_images/fde7e03ddd7d729050520674c88aa3be38500c5f69ce1c864825388a47af5a3e.png ../../_images/194719ad9dbdfc0a028e1fbf418a8802d1846dbdb9b761a7ed6aad14f3f8934d.png ../../_images/6f919018a36ae20a353727db02fdd88e63bfc0155b3b2b9bc2a002534bc4583d.png ../../_images/736e77df4fb399aa982b6caef6741951b8c952e8b4eee420f20e9a20ffe5a207.png

Filter activities versus graph activities

A = model.get_log_activities()
sum_A = A.abs().sum(axis=1).cpu().detach().numpy()
A = A.squeeze(0)
print(A.shape)
torch.Size([288, 8161])
# this function assesses the contributions of A on the graph
indices, contributions, max_eig = mb.tl.compute_contributions(A.cpu(), C.cpu(),  D.cpu())

contributions_normalized = torch.abs(contributions) / max_eig
contributions_df = pd.DataFrame(contributions_normalized.detach(), columns=["index"])
print("Summary statistics of the normalized contributions: \n")
contributions_df.describe()
Summary statistics of the normalized contributions: 
index
count 288.000000
mean 0.377668
std 0.044241
min 0.277876
25% 0.344783
50% 0.377110
75% 0.407360
max 0.496437
from matplotlib.pyplot import rcParams

rcParams["figure.dpi"] = 100
plt.figure(figsize=(10, 5))

print(f"Percentage of non-zero entries of the filter matrix A: {100 * torch.sum(A != 0).item() / A.numel()} %")

mb.pl.filter_contrib_simple(contributions_normalized, A.cpu())
Percentage of non-zero entries of the filter matrix A: 100.0 %
<Figure size 1000x500 with 0 Axes>
../../_images/06337e63dee0751e4d516cfff74ce23a55169ac581636262f9cd817ec459cc44.png
# normalize the data, and look at summary stats
sum_A_norm = sum_A / np.max(sum_A)
sum_A_df = pd.DataFrame(sum_A.T, columns=["sum_A"])
sum_A_df.describe()
sum_A
count 288.000000
mean 5.957136
std 8.597739
min 0.000547
25% 1.099398
50% 2.910393
75% 7.284939
max 68.284973
contrib_arr = contributions_normalized.unsqueeze(dim=0).detach().numpy()
sum_A = A.cpu().abs().sum(axis=1).detach().numpy()
contrib = contrib_arr[0]

contrib_times_activities = contrib * sum_A
contrib_times_activities_norm = contrib_times_activities / np.max(contrib_times_activities)
contrib_times_activities_df = pd.DataFrame(contrib_times_activities, columns=["contribution_times_activities"])
contrib_times_activities_df.describe()
contribution_times_activities
count 288.000000
mean 2.240525
std 3.305919
min 0.000212
25% 0.422303
50% 1.031057
75% 2.731761
max 28.479479
sum_A = A.cpu().abs().sum(axis=1)
sum_A_normalized = sum_A / sum_A.max()
mb.pl.filter_contrib_heatmap(sum_A_normalized, title='Sum of absolute values of log activities', score="Normalized Sum")
../../_images/e2cc531d9786a193b3b5c84cc2e4ce5a6a91abebfe523c67354238b727de2903.png
# this function assesses the contributions of A on the graph
indices, contributions, max_singular_value = mb.tl.compute_contributions(A.cpu(), C.cpu(), D.cpu(), use_hadamard=False)
contributions_normalized = torch.abs(contributions) / max_singular_value
contrib_np = contributions_normalized.detach().numpy()
sum_A_np = sum_A_normalized.detach().numpy()
rel_idcC = np.where(contrib_np >= np.percentile(contrib_np, 75))
rel_idcA = np.where(sum_A_np >= np.percentile(sum_A_np, 75))
print(rel_idcA)
print([x for x in rel_idcC[0] if x in rel_idcA[0]])

contrib_activites_prod = contributions_normalized * sum_A_normalized
print(sum(contrib_activites_prod >= np.sort(contrib_activites_prod.detach().numpy())[3*len(contrib_activites_prod) // 4])/len(contrib_activites_prod))
mb.pl.filter_contrib_heatmap(contrib_activites_prod, title="Product of contributions and activities", score="Product")
(array([  9,  18,  20,  41,  45,  46,  48,  53,  57,  81,  88,  93,  94,
       100, 105, 136, 150, 151, 153, 154, 160, 162, 165, 175, 179, 180,
       181, 182, 196, 199, 200, 203, 211, 212, 213, 219, 228, 232, 233,
       234, 235, 236, 239, 240, 242, 244, 245, 248, 249, 250, 251, 252,
       253, 255, 256, 259, 263, 264, 265, 267, 269, 271, 272, 273, 274,
       276, 279, 280, 281, 282, 284, 285]),)
[136, 160, 162, 181, 199, 212, 219, 228, 234, 248, 255, 256, 259, 263, 265, 267, 269, 271, 279, 281]
tensor(0.25)
../../_images/368a0173407669c79337f436715cc551626217c233cba1536a64d1e4f4217b08.png
from matplotlib.pyplot import rcParams
rcParams['figure.dpi'] = 100
plt.figure(figsize=(10, 5))


print(f"Percentage of non-zero entries of the filter matrix A: {100 * torch.sum(A != 0).item() / A.numel()} %")

mb.pl.filter_contrib_simple(contributions_normalized, A.cpu(), 'filter_contrib_plot.pdf')

plt.savefig('../../../output/contrib_scores_graph_pancreatic_endocrinogenesis.pdf')
Percentage of non-zero entries of the filter matrix A: 100.0 %
<Figure size 1000x500 with 0 Axes>
../../_images/f4867e7d433498f9e2b3f89a275346d4d181cbb12d264ca38931742c736b87e7.png
from matplotlib.pyplot import rcParams
rcParams['figure.dpi'] = 60

mb.pl.contrib_heatmaps(A.detach().cpu(), C.detach().cpu(), D.detach().cpu(), use_hadamard=False, save_pdf=True)
../../_images/25f8f5fee03a0d1f72999117e1c473a25f84ab3984ed37e07171c7d5221643cb.png
anno[anno['Name'] == 'TFAP2/1']
Cluster_ID Name DBD Seed_motif Total_width Consensus_left Consensus_right Cluster_size Name_unique
215 264 TFAP2/1 TFAP AP2A_HUMAN.H11MO.0.A 15 3 12 24 TFAP2/1
# grnpedia associations
ttrust = pd.read_csv('https://www.grnpedia.org/trrust/data/trrust_rawdata.mouse.tsv',
                     sep='\t',
                     header=None)
ttrust.columns = ['tf', 'target', 'role', 'ref']
ttrust = ttrust.groupby(['tf', 'role']).size().reset_index().pivot(index='tf', columns='role', values=0).fillna(0)
ttrust['consensus'] = np.where((ttrust['Activation'] > 0) & (ttrust['Repression'] == 0), 'red',
                               np.where((ttrust['Repression'] > 0) & (ttrust['Activation'] == 0), 'blue', 'purple'))
res[res['module_name'] == 'SOX/3']
filter_id opt_log_dynamic archetype_id gene_name matched spearman p_val module_name p_adj k
1403 91 False 92 Sox13 True -0.019356 8.037954e-02 SOX/3 1.974952e-01 Sox13_92
1404 91 False 92 Sox8 True 0.013017 2.396776e-01 SOX/3 4.317106e-01 Sox8_92
1405 91 False 92 Sox30 True 0.022332 4.365840e-02 SOX/3 1.264612e-01 Sox30_92
1406 91 False 92 Sox11 True -0.000002 9.998756e-01 SOX/3 9.998756e-01 Sox11_92
1407 91 False 92 Sox10 True 0.004463 6.868651e-01 SOX/3 8.237248e-01 Sox10_92
1408 91 False 92 Sox12 True 0.004646 6.747168e-01 SOX/3 8.159169e-01 Sox12_92
1409 91 False 92 Sox9 True 0.112574 1.970287e-24 SOX/3 1.311260e-22 Sox9_92
3333 91 True 92 Sox13 True -0.003547 7.486992e-01 SOX/3 8.619085e-01 Sox13_92
3334 91 True 92 Sox8 True 0.002815 7.992993e-01 SOX/3 8.881103e-01 Sox8_92
3335 91 True 92 Sox30 True -0.003407 7.582867e-01 SOX/3 8.665619e-01 Sox30_92
3336 91 True 92 Sox11 True 0.024678 2.578970e-02 SOX/3 8.429149e-02 Sox11_92
3337 91 True 92 Sox10 True 0.011439 3.014702e-01 SOX/3 5.088216e-01 Sox10_92
3338 91 True 92 Sox12 True -0.023356 3.486919e-02 SOX/3 1.065360e-01 Sox12_92
3339 91 True 92 Sox9 True 0.037916 6.127002e-04 SOX/3 3.724445e-03 Sox9_92
# res_sel[res_sel['module_name'] == 'SOX/3']
from matplotlib.pyplot import rcParams
rcParams['figure.dpi'] = 80

# unsqueeze the data to make it compatible with the heatmap function
sum_A_norm = sum_A_norm.reshape(1,-1)
contrib_times_activities = contrib_times_activities.reshape(1,-1)
!pip install adjustText
Collecting adjustText
  Downloading adjustText-1.2.0-py3-none-any.whl.metadata (3.0 kB)
Requirement already satisfied: numpy in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from adjustText) (1.26.4)
Requirement already satisfied: matplotlib in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from adjustText) (3.8.4)
Requirement already satisfied: scipy in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from adjustText) (1.13.0)
Requirement already satisfied: contourpy>=1.0.1 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (1.2.1)
Requirement already satisfied: cycler>=0.10 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (4.51.0)
Requirement already satisfied: kiwisolver>=1.3.1 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (1.4.5)
Requirement already satisfied: packaging>=20.0 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (24.0)
Requirement already satisfied: pillow>=8 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (10.3.0)
Requirement already satisfied: pyparsing>=2.3.1 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (2.9.0)
Requirement already satisfied: importlib-resources>=3.2.0 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from matplotlib->adjustText) (6.4.0)
Requirement already satisfied: zipp>=3.1.0 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from importlib-resources>=3.2.0->matplotlib->adjustText) (3.17.0)
Requirement already satisfied: six>=1.5 in /home/ilibarra/.conda/envs/mubind/lib/python3.9/site-packages (from python-dateutil>=2.7->matplotlib->adjustText) (1.16.0)
Downloading adjustText-1.2.0-py3-none-any.whl (12 kB)
Installing collected packages: adjustText
Successfully installed adjustText-1.2.0
from adjustText import adjust_text
rcParams['figure.figsize'] = 3, 3
rcParams['figure.dpi'] = 135
rcParams['pdf.fonttype'] = 42

def delta_models(model_by_logdynamic):
    model = model_by_logdynamic[True]
    log_act1 = torch.stack(list(model.activities.log_activities), dim=1).squeeze(0)
    log_act1 = pd.DataFrame(log_act1.detach().cpu().numpy())

    model = model_by_logdynamic[False]
    log_act2 = torch.stack(list(model.activities.log_activities), dim=1).squeeze(0)
    log_act2 = pd.DataFrame(log_act2.detach().cpu().numpy())

    d = log_act1.mean(axis=0) - log_act2.mean(axis=0)
    # d.index = ad.obs_names
    return d

# graph = True - graph = False
delta_model = delta_models(model_by_logdynamic)
delta_model
delta_model = delta_model[2:]
delta_model.index = range(len(delta_model))
delta_model.index += 1
delta_model.index = delta_model.index.map(name_by_filter_id)
delta_model

res['act_change'] = res['module_name'].map(delta_model.to_dict())
res['k'] = res['module_name'] + ':' + res['gene_name']
res['graph_score'] = res['filter_id'].map({i : contrib_times_activities[0][i + 2] for i in range(len(contrib_times_activities[0]) - 2)})
res['z'] = np.sqrt((res['graph_score'] ** 2) * (-np.log(res['p_adj'] + 1e-10)) ** 2)
# only show weights with Graph Layer on
res_sel = res.copy() # [res['opt_log_dynamic']]
res.shape, res_sel.shape
((3860, 13), (3860, 13))
ax = plt.subplot()
cmap = sns.color_palette('RdBu_r', as_cmap=True)


res_sel = res_sel.sort_values('z', ascending=False) # .drop_duplicates('module_name')
# res_sel = res_sel.drop_duplicates('module_name')

# res_sel['x'] = np.log((res_sel['graph_score'] + 1) * (res_sel['act_change'].abs() + 1))
res_sel['x'] = res_sel['graph_score'] * res_sel['act_change'].abs() * res_sel['spearman'].abs()
# res_sel['x'] = res_sel['act_change'].abs() * res_sel['spearman'].abs()

# general noise stacking of cells on x-y axes
sigma = 0.001
mu =0.00
# generate normally distributed samples
noise = sigma * np.random.randn(res_sel.shape[0]) + mu
res_sel['x'] = res_sel['x'] + noise
res_sel['spearman'] = res_sel['spearman'] + noise


edgecolors = res_sel['gene_name'].map(ttrust['consensus'].to_dict())
edgecolors = np.where(~pd.isnull(edgecolors), edgecolors, 'gray')
res_sel['edgecolors'] = edgecolors

for show_log_dynamic in [True,]:
    res_plot = res_sel[res_sel['opt_log_dynamic'] == show_log_dynamic]
    plt.scatter(res_plot['x'],            
                res_plot['spearman'],
                s=-np.log(res_plot['p_adj'] + 1e-10),
                cmap=cmap,
                lw=.3,
                marker='o' if show_log_dynamic else '^',
                edgecolors='gray',
                c=res_plot['act_change'])

    plt.axhline(y=0, ls='--', c='gray')
    plt.xlabel('grapn_contrib * abs(act.) * corr(GEX, act)')
    plt.ylabel('Spearman(GEX, act)')


    res_plot = res_plot.sort_values('x', ascending=False).drop_duplicates('module_name')
    texts = [] # [plt.text(x[i], y[i], 'Text%s' %i, ha='center', va='center') for i in range(len(x))]

    for ri, r in res_plot.head(10).iterrows():
        print(r['module_name'] + ':' + r['gene_name'], (r['x'], r['spearman']), r['act_change'], r['edgecolors'])
        t = ax.annotate(r['module_name'] + ':' + r['gene_name'], (r['x'], r['spearman']), fontsize=7, color=r['edgecolors'])
        texts.append(t)
    adjust_text(texts, arrowprops=dict(arrowstyle='->'))

plt.savefig('../../../output/graph_contribution_pancreatic_endocrinogesis.pdf')
plt.show()
PROX1:Prox1 (0.1059767525471204, 0.028886475849371568) 0.16746491193771362 purple
CCAAT/CEBP:Dbpht2 (0.08536277658217145, -0.10255250768984316) 0.10485882312059402 gray
MYB/2:Mybl1 (0.07734918164531444, 0.05428451893348915) 0.12227752804756165 purple
NR/11:Nr5a2 (0.06523281984085992, 0.03952495182055314) 0.21196624636650085 red
GLI:Zbtb7c (0.06397115296691377, -0.05706246215981071) 0.09551192820072174 gray
SOX/3:Sox9 (0.05609969940570946, 0.03852411756917732) 0.1335025131702423 purple
TFAP2/1:Ap2a1 (0.055899889740226597, -0.019106576198307353) 0.15999287366867065 gray
GRHL:Grhl2 (0.03513197853410249, 0.04331429681436185) 0.08097979426383972 gray
SOX/4:Sox4 (0.03209149718337514, 0.044195799745043826) 0.08338265866041183 purple
FOX/5:Foxo1 (0.028284020278165116, -0.05766508859818134) 0.07688487321138382 purple
../../_images/b4797dee21f5e080ebf939909f113fe0af6f76922dfba4cd6e018f9d5c44f058.png
mb.pl.set_rcParams({"figure.figsize": [2, 20], "figure.dpi": 90})
mb.pl.logo(
    model,
    log_odds=True,
    title=False,
    xticks=False,
    rowspan_dinuc=0,
    rowspan_mono=1,
    n_rows=40,
    # stop_at=11,
    show=False,
    # n_rows=len(res.head(20).index),
    n_cols=1,
    order=pd.Series(['SOX/3', 'NR/11', 'CCAAT/CEBP']).map(res.set_index('module_name')['filter_id'].to_dict()),
)  # n_cols=len(reduced_groups))
# plt.tight_layout()

plt.savefig('../../../output/motif_pancreatic_endocrinogenesis.pdf')
plt.show()
../../_images/18445954e0ecc6b9a47dd7cc6253515f2daa2e0b757005b8b234f2db3d0faac2.png
import pylab as pl
import numpy as np

a = np.array([[res_sel['act_change'].min(), res_sel['act_change'].max()]])
pl.figure(figsize=(3, .2))
img = pl.imshow(a, cmap="RdBu_r")
pl.gca().set_visible(False)
cax = pl.axes([0.1, 0.2, 0.8, 0.6])
pl.colorbar(orientation="horizontal", cax=cax)
pl.savefig("../../../output/graph_contribution_pancreatic_endocrinogenesis_cbar.pdf")
../../_images/15e78bf00aa272f96b2f6a8c768c7a3e69137b81f2363040b12c7b34ed9bd14c.png
scv.tl.velocity_pseudotime(rna)
# scv.pl.scatter(rna, color="velocity_pseudotime", color_map="gnuplot")
computing terminal states
    identified 10 regions of root cells and 1 region of end points .
    finished (0:00:03) --> added
    'root_cells', root cells of Markov diffusion process (adata.obs)
    'end_points', end points of Markov diffusion process (adata.obs)
def running_mean(y_in, x_in, N_out=101, sigma=.05):
    '''
    Returns running mean as a Bell-curve weighted average at evenly spaced
    points. Does NOT wrap signal around, or pad with zeros.
    
    Arguments:
    y_in -- y values, the values to be smoothed and re-sampled
    x_in -- x values for array
    
    Keyword arguments:
    N_out -- NoOf elements in resampled array.
    sigma -- 'Width' of Bell-curve in units of param x .
    '''
    import numpy as np
    N_in = len(y_in)

    # Gaussian kernel
    x_out = np.linspace(np.min(x_in), np.max(x_in), N_out)
    x_in_mesh, x_out_mesh = np.meshgrid(x_in, x_out)
    gauss_kernel = np.exp(-np.square(x_in_mesh - x_out_mesh) / (2 * sigma**2))
    # Normalize kernel, such that the sum is one along axis 1
    normalization = np.tile(np.reshape(np.sum(gauss_kernel, axis=1), (N_out, 1)), (1, N_in))
    gauss_kernel_normalized = gauss_kernel / normalization
    # Perform running average as a linear operation
    y_out = gauss_kernel_normalized @ y_in

    return y_out, x_out
# prepare velocity pseudotime values for visualization
atac.obs['velocity_pseudotime'] = rna.obs['velocity_pseudotime'].values
ad.obs['velocity_pseudotime'] = atac.obs['velocity_pseudotime']

Given a few potential genes, verify its association with chromatin targets

def plot_pseudotime(gene_name, filter_name, sigma_gex=.05, sigma_filter=.1, save=None):
    rcParams['figure.figsize'] = 3, 2
    gene_key = gene_name
    x = rna.obs['velocity_pseudotime']
    y = rna[:,rna.var_names==gene_key].X.A.flatten()
    y = np.log(y)
    y[y == -np.inf] = np.nanmin(y[y != -np.inf])

    y_mean, x_mean = running_mean(y, x, sigma=sigma_gex)
    plt.scatter(x, y, edgecolors=None, color='lightgreen', s=.1)
    plt.plot(x_mean, y_mean, color='green')
    plt.ylabel('')
    plt.title(gene_key)
    plt.ylabel('gene expression [log]')
    plt.xlabel('pseudotime')


    x = ad.obs['velocity_pseudotime']

    if save is not None:
        plt.savefig(save + '_gex.pdf')
    plt.show()
    filter_id = int(res[res['module_name'].str.contains(filter_name)]['filter_id'].values[0])

    y= act.T.numpy()[:,filter_id]
    # y = np.abs(y)
    # y = np.log(y)
    # y[y == -np.inf] = np.nanmin(y[y != -np.inf])

    y_mean, x_mean = running_mean(y, x, sigma=sigma_filter)
    # plt.scatter(x, y, edgecolors='black', color='lightgreen', s=.1)
    plt.plot(x_mean, y_mean, color='red')
    plt.ylabel('')
    plt.title('filter activities %s' % filter_name)
    plt.ylabel('filter activity')
    plt.xlabel('pseudotime')

    if save is not None:
        plt.savefig(save + '_filter.pdf')

    # plt.close()
    plt.show()
# plot_pseudotime('Foxa2', 'FOXA/1', sigma_gex=.01, sigma_filter=.1,
#                 save="../../../output/pancreas_mafa_bzip")
plot_pseudotime('Mafa', 'AP1/2', sigma_gex=.01, sigma_filter=.1,
                save="../../../output/pancreas_mafa_bzip")
../../_images/184e7059c886a4a8c627648644096ebd0b62e2d4ecf55261cb50a77f7f47051a.png ../../_images/42136b03e5a18a8ba59682695e9a65da69a929289040b146b6334f0e099c3504.png
plt.rcParams["font.family"] = 'sans-serif' # None # "Arial"

plot_pseudotime('Sox9', 'SOX/3', sigma_gex=.01, sigma_filter=.1,
                save="../../../output/pancreas_sox9_sox3")

plot_pseudotime('Cux1', 'CUX/2', sigma_gex=.01, sigma_filter=.1,
                save="../../../output/pancreas_cux1_cux2")
../../_images/4434dfc34d8984ca97ca1c60b32c755fa65997aebfa6f3ea4f657d78f006d0ca.png ../../_images/f6b870b3ee6b438c89234c7c0f072bbccdff329bb965f0d16ae21b6da0315f07.png ../../_images/a04d76e7863c8a44fadbc8361e201aada8872bc3180abff2140c8618440da03c.png ../../_images/eec6ec414501dc0ee627d7610a663e6b1872e67a4303cbd38aff0f899a4be5fb.png
plot_pseudotime('Insm1', 'INSM1', sigma_gex=.01, sigma_filter=.05,
                save="../../../output/pancreas_CCAAT_CEBP_dbpht2")
../../_images/cb696add8d2770daf6c1025cc9945c013934e78bef72e7c0c6508b0e05e4d3ef.png ../../_images/ce483acc6a7e5dff7c697bbfbaa4683bdf0f9008a2188bcd3c471b1d7a48a5ce.png
plot_pseudotime('Pax6', 'PAX/1', sigma_gex=.01, sigma_filter=.05,
                save="../../../output/pancreas_CCAAT_CEBP_dbpht2")
../../_images/ce4be92f8022c65bd0862f6dd1e6aa3a2b6276c6eed555f20744b9cf8d2b694d.png ../../_images/b2fb38190e6d34e9681586e1bf960290d377b2ef39a45a493bb3dd2d7ff8cca3.png
res
filter_id opt_log_dynamic archetype_id gene_name matched spearman p_val module_name p_adj k act_change graph_score z
0 1 False 2 Pou6f1 True -0.012835 2.463036e-01 HD/2 4.397465e-01 HD/2:Pou6f1 -0.033147 2.135972 1.754823
1 1 False 2 Pou6f2 True -0.004708 6.706317e-01 HD/2 8.131867e-01 HD/2:Pou6f2 -0.033147 2.135972 0.441707
2 1 False 2 Nkx6-1 True 0.019021 8.575862e-02 HD/2 2.067634e-01 HD/2:Nkx6-1 -0.033147 2.135972 3.366677
3 1 False 2 Dlx1as True -0.011318 3.066163e-01 HD/2 5.133309e-01 HD/2:Dlx1as -0.033147 2.135972 1.424340
4 1 False 2 Lmx1b True -0.011210 3.112775e-01 HD/2 5.183482e-01 HD/2:Lmx1b -0.033147 2.135972 1.403565
... ... ... ... ... ... ... ... ... ... ... ... ... ...
3855 280 True 281 Pax6 True -0.080454 3.368429e-13 PAX/1 9.490611e-12 PAX/1:Pax6 0.045139 4.515195 103.556830
3856 280 True 281 Pax6os1 True -0.041961 1.495984e-04 PAX/1 1.075326e-03 PAX/1:Pax6os1 0.045139 4.515195 30.861954
3857 282 True 283 Pax2 True -0.003662 7.408379e-01 PAX-halfsite 8.568437e-01 PAX-halfsite:Pax2 0.024079 4.426369 0.683873
3858 283 True 284 Ahr True 0.023966 3.038693e-02 AHR 9.556552e-02 AHR:Ahr 0.099753 11.301592 26.535496
3859 284 True 285 Klf8 True -0.019348 8.050853e-02 KLF/SP/3 1.976863e-01 KLF/SP/3:Klf8 -0.028565 2.733040 4.430459

3860 rows × 13 columns

plot_pseudotime('Cebpa', 'CCAAT/CEBP', sigma_gex=.01, sigma_filter=.05,
                save="../../../output/pancreas_CCAAT_CEBP_dbpht2")
../../_images/f98c2100cee22fc45218de89e3cf4c09e3ac99c362f845cffa89bf5d79b3041d.png ../../_images/df63d63a38d7a007ea176b9ce6f41fab9b11128b639b9d5b29f7cbb19ef4549d.png
plot_pseudotime('Dbpht2', 'CCAAT/CEBP', sigma_gex=.01, sigma_filter=.05,
                save="../../../output/pancreas_CCAAT_CEBP_dbpht2")
../../_images/ebf84cae0baac494bdbe9a24e037f0e35b27addb477e61810d443853577dbdab.png ../../_images/df63d63a38d7a007ea176b9ce6f41fab9b11128b639b9d5b29f7cbb19ef4549d.png
plot_pseudotime('Nr5a2', 'NR/11', sigma_gex=.01, sigma_filter=.05,
                save="../../../output/pancreas_nr5a2_nr11")
../../_images/0c49af20a4e6aa184649e7b52c33e72bc3106d1a259ec1748bb168ad636042a7.png ../../_images/c455f4dd421db8ef470b59be405dfb6faeee43ce2935104a459867d052699346.png
def plot_chromatin(filter_name, sigma_chrom,
                   group_key=None, key_sel=None,
                   q_thr=.95, show=True, save=None):
    res_sel = res[res['module_name'].str.contains(filter_name)]['filter_id']
    if res_sel.shape[0] == 0:
        print('not found')
        return None
    filter_id = int(res_sel.values[0])
    # print(filter_id)
    scores = binding_per_mode[:,filter_id]
    # filter scores across regions
    # plt.hist(scores)

    if group_key is not None and key_sel is not None:
        ad_sel = ad[ad.obs[group_key] == key_sel]
    else:
        ad_sel = ad
    # print('here')

    var_idx = np.argwhere(scores >= np.quantile(scores, q_thr)).flatten()
    var_names = ad_sel.var_names[var_idx]
    # print(q_thr, var_names.shape)
    # background with all peaks
    var_names_neg = ad_sel.var_names[~ad.var_names.isin(var_names)]
    
    x = ad_sel.obs['velocity_pseudotime']
    y = ad_sel[:,var_names].X.mean(axis=1).A.flatten()
    # y = np.log(y)
    # y[y == -np.inf] = np.nanmin(y[y != -np.inf])
    y_mean, x_mean = running_mean(y, x, sigma=sigma_chrom)
    plt.scatter(x, y, edgecolors=None, color='lightblue', s=.1)
    plt.plot(x_mean, y_mean, color='blue', label='targets')
    plt.ylabel('')
    plt.title(filter_name + ((', %s' % key_sel) if key_sel is not None else ''))
    plt.ylabel('chromatin accessibility [mean]')
    plt.xlabel('pseudotime')
    # plt.show()

    # print(q_thr, var_names_neg.shape)
    x = ad_sel.obs['velocity_pseudotime']
    y = ad_sel[:,var_names_neg].X.mean(axis=1).A.flatten()
    # y = np.log(y)
    # y[y == -np.inf] = np.nanmin(y[y != -np.inf])
    y_mean_avr, x_mean = running_mean(y, x, sigma=sigma_chrom)
    # plt.scatter(x, y, edgecolors=None, color='lightblue', s=.1)
    plt.plot(x_mean, y_mean_avr, color='gray', linestyle='--', label='non-targets')
    plt.ylabel('')
    plt.ylabel('chromatin accessibility [mean]')
    plt.xlabel('pseudotime')
    plt.legend()

    if save is not None:
        plt.savefig(save + '_filter.pdf')
    if not show:
        plt.close()
    else:
        plt.show()
    return (y_mean - y_mean_avr).sum()
sigma_chrom = .1
# plot_chromatin('INSM1', sigma_chrom, group_key='celltype',
#                # key_sel='Beta',
#                q_thr=.86, show=True, save=None)
plot_chromatin('AP1/2', sigma_chrom, group_key='celltype',
               # key_sel='Beta',
               q_thr=.84, show=True, save=None)
../../_images/308b60a7cc9595060b3713b65d8c16b85e484df7b280ae24871d8f964af9a6c3.png
0.08973802337474156
plot_chromatin('AP1/2', sigma_chrom, group_key='celltype', key_sel='Beta', q_thr=.84, show=True, save=None)
../../_images/cb32736b58b98f6456af05d3764d58104874c0352449899ebb3ba9080dc20198.png
0.22364762461104337
plot_pseudotime('Mafa', 'AP1/2', sigma_gex=.01, sigma_filter=.1,
                save="../../../output/pancreas_mafa_bzip")
../../_images/184e7059c886a4a8c627648644096ebd0b62e2d4ecf55261cb50a77f7f47051a.png ../../_images/42136b03e5a18a8ba59682695e9a65da69a929289040b146b6334f0e099c3504.png
plot_chromatin('AP1/2', sigma_chrom, group_key='celltype', q_thr=.95, show=True, save='../../../output/pancreas_ap1_1')
../../_images/99dd7adb926639142f6fe6cf069f9c8a69c08404a5efc3e4d346218b1f7ce99e.png
0.18744242662916832
sigma_chrom
0.1
rcParams['figure.figsize'] = 3, 2
plot_chromatin('SOX/3', .04, q_thr=.95, show=True, save='../../../output/pancreas_sox3')
../../_images/a6dc970cc1f3967b7e22de1d3d18765244877a0441f6fe99f2132354acd83d41.png
-0.9421069034107382
# plt.close()
plot_chromatin('NR/11', sigma_chrom, q_thr=.9, show=True, save='../../../output/pancreas_nr11')
plot_chromatin('CCAAT/CEBP', sigma_chrom, q_thr=.9, show=True, save='../../../output/pancreas_ccaat')
39
here
0.9 (5000,)
0.9 (45000,)
../../_images/6ec0f5ba541af8148b4aaa62122cfc132ca93e366332bfb74fd893ff39d3318f.png
51
here
0.9 (5000,)
0.9 (45000,)
../../_images/7a2fe51c718716cd1bba12505129f489e52bf48703d6bff1edde20e4b779f77c.png
0.0034247913106122754
anno[anno['Name'] == 'CCAAT/CEBP']
Cluster_ID Name DBD Seed_motif Total_width Consensus_left Consensus_right Cluster_size Name_unique
9 52 CCAAT/CEBP bZIP CEBPA_HUMAN.H11MO.0.A 14 2 11 37 CCAAT/CEBP
plot_chromatin('NR/20', sigma_chrom, q_thr=.9, show=True, save='../../../output/pancreas_ccaat')
247
here
0.9 (5000,)
0.9 (45000,)
../../_images/d07e01227cdb5ffe6881b806b2a66d591f28f53e0724752185a0bd86056e0672.png
0.08752305422098165
module_names = set(res['module_name'])

diff_pseudotime_by_module = []
for name in module_names:
    sigma_chrom = .03
    diff = plot_chromatin(name, sigma_chrom, q_thr=.99, show=False)
    diff_pseudotime_by_module.append([name, diff])
    print(name, diff)
# filter_name = 'SOX/4'
# sigma_chrom = .03
# plot_chromatin(filter_name, sigma_chrom, q_thr=.9)

# filter_name = 'SOX/3'
# sigma_chrom = .03
# plot_chromatin(filter_name, sigma_chrom, q_thr=.84)

# filter_name = 'NR/11'
# sigma_chrom = .02
# plot_chromatin(filter_name, sigma_chrom, q_thr=.95)

# filter_name = 'PAX/1'
# sigma_chrom = .03
# plot_chromatin(filter_name, sigma_chrom, q_thr=.99)
chrom = pd.DataFrame(diff_pseudotime_by_module, columns=['module', 'diff'])
Ebox/CAGATGG -0.4463687105064015
ETS/2 1.3841551210348784
NFKB/3 -0.029245373729375405
HIC/1 -0.00970106750741416
SOX/4 -0.3864617779577727
ZIC/2 -0.001678147728974927
NRF1 -1.2149584090135126
Ebox/CACGTG/2 0.7070329100372075
GC-tract -0.352908705694854
AIRE -0.039780363278249894
NR/16 0.3818001510819134
CCAAT/CEBP -1.0215220720248988
ZNF28 -1.054292635566007
HD/25 -0.3858228067283669
HD/18 0.3678626404024407
BATF 0.8496393614191722
YY1 -0.6374268295097316
SOX/3 -0.9099718303214782
Ebox/CACGTG/1 -0.7772530907996553
EBF1 -0.011737208270374451
HLTF -1.4430974820992746
STAT/2 0.623677270427765
NR/5 -0.42165050996851433
SNAI2 -0.029305342214546663
SMAD 2.878415515952871
E2F/3 0.6617957710935988
CPEB1 0.7573968834971179
MYB/1 0.5095461009114091
SREBF1 -0.011737208270374451
SOX/6 0.9916724448590001
POU/3 -0.8682258245449043
CENBP 2.0772878400552517
NFAT/1 -0.34609716253812495
CREB/ATF/1 0.9182118987611131
CREB/ATF/3 -0.5941427179558985
CUX/2 -0.6188109979961864
BCL6/1 1.2951623121745122
TBX/1 -1.3559940045587469
HINFP1/1 -1.5303903573608182
MZF1 -0.269106254500437
FOX/3 0.12013514531092945
SOX/2 -1.2940562392934727
TFCP2 0.8031191300914624
MYB/4 0.9103189912274473
CUX/1 1.1215119488121088
CUX/3 0.29181685167803084
KLF/SP/1 -0.17357936918552327
RFX/2 0.3521208815920952
Ebox/CACCTG -0.5955704048416309
THAP1 -1.9091206461738328
CTCF 0.44687916088007046
MEF2 0.1635619585574609
MYB/5 0.4092221525942972
AP1/2 -0.506526387942535
Ebox/CATATG 0.45124144500690305
PRDM5 -0.6680187729554607
FOX/8 0.0076614966463977655
E2F/2 -0.32548537360772734
NR/18 -0.5466794047176766
RFX/1 -0.6461101872451352
GRHL -1.3244764670712648
NFKB/1 1.2090884408490437
NFKB/2 0.7544892817965362
ZNF652 -0.5338940794041547
AHR -0.33197457694038185
AP1/1 -0.3268777906682992
MFZ1 0.7677125318222324
KLF/SP/2 -0.31925204287720027
MTF1 -0.3258488261698437
ZBTB6 -0.10000325671211306
HD/12 0.9101176561806534
NR/13 0.4560247083688173
IRF/2 0.9093056990396887
ZNF143 0.5363586702724903
SPDEF/2 -0.4574884274506151
SOX/7 -0.1153157149687983
GMEB2/1 -0.6271663119194868
NFAT/3 0.07432299166620979
NFY -1.6461554626977613
ARI5A 0.4251164076398737
TFAP2/1 -0.9052591695452773
MAF 0.4594316204324637
NR/4 -0.6653371118495084
NFI/1 -0.2783801401994053
MYB/2 -0.6362792614752368
ZNF57 -1.1678015258122054
NR/12 1.0894008039296366
SPI -0.6719526048927651
RFX/3 0.5670336899417951
ZNF354 0.3525071993381236
PROX1 1.1441262017983782
EVI1/MECOM -0.29520508316697247
SIX/1 0.07510179285768168
GMEB2/3 0.5913462726776078
TEAD -0.4197771402411455
NR/9 -0.2572733766197498
ZFX -1.7060761730650642
RUNX/1 -0.8441002081772531
FOX/4 -0.9445048565787458
HIF 2.988408540447653
ZBTB49 4.61222487160048
SOX/8 -0.5897541586694709
NR/15 1.5481448063153085
NR/7 -1.269722056372812
RBPJ 0.08975539408227676
HD/2 0.3759806619383275
GFI 0.7317949107776258
IRF/4 1.419556142926688
HOMEZ 0.5250855589346416
RUNX/2 -0.22911510471596072
SPDEF/1 1.1989329740969454
MBD2 -1.0711031111034617
PAX/1 0.18441886935943502
HD/23 -0.3200864666275924
TFAP2/2 -0.01604517405709123
CUX/4 0.22015045232037822
GATA -0.4314637860847894
NR/8 -0.8030136673458672
FOX/2 0.11775062419502871
MIES -0.8343979180887662
STAT/1 -1.5314400032238316
ZNF335 -0.9634653239907858
NFAT/2 -0.4506339818594068
TBX/3 1.1035565572051376
REST/NRSF 0.32063332120127175
PRDM4 1.0778408489240316
not found
DDIT3+CEBPA None
NR/19 -0.3005369734642561
FOX/5 -0.04673500355458831
REL-halfsite 0.4696085724406287
SRF 0.6294870391308862
PRDM1 1.5176165664667574
HSF -0.28191747989649496
IRF/3 -0.3206631772555406
KAISO -0.7270061463859222
INSM1 -0.8884641435669692
NR/10 1.4369850871875558
NR/17 -0.39727253302943316
POU/1 -0.34026029460175766
FOX/6 0.8488835522295665
E2F/1 1.4304759897485635
TBX/2 0.40736994566822715
TCF/LEF -0.7589635724686599
NFAT/4 -0.1414585941453083
MECP2 2.353791634013215
LEF1 2.325734770600728
Ebox/CAGCTG -0.007602971200395749
PRDM9 -0.1529234092127609
POU/2 0.908282790551908
SOX/1 -2.4064246885340195
KLF/SP/3 -1.8180654670276561
NR/1 -0.8211190329059841
NR/6 0.9937804791238484
FOX/7 -0.4386088817687715
GMEB2/2 0.03700352721580165
NFI/2 0.2031417942876867
ETS/1 -0.49967628311850165
GLI 0.26015841392168393
SOX/5 -0.7648320115736118
TBX/4 0.841269354310839
GLIS 0.26015841392168393
NR/11 -0.6398790374906249
MYB/3 -1.0622625244751394
HD/14 -0.8802136657485213
HD/16 0.4504262671789152
CREB/ATF/2 0.8406829665624733
EGR -0.22523820468322242
IRF/1 1.8354912001501464
TATA -0.20443965986626844
E2F/4 -0.27389699491361663
NFI/3 0.8159721782938699
NR/3 0.6205652809645038
OVOL1 0.7590182676318638
PAX-halfsite -0.912170224212366
BCL6/2 5.462366565318233
CREB3/XBP1 0.4618577576952723
not found
OCT4+SOX2 None
PLAG1 -0.062016821623390166
HD/10 -0.13492179651312536
NR/20 -0.38186017057189237
HD/8 0.4624420522161476
LIN54 0.5553670719942171