Source code for elpigraph.utils

import numpy as np
import pandas as pd
import networkx as nx
import scipy
import networkx as nx
from .src.core import PartitionData
from .src.reporting import project_point_onto_graph
from sklearn.neighbors import NearestNeighbors, KernelDensity


[docs]def getProjection(X, PG):
    """Compute graph projection from principal graph dict
    (result stored in PG dict under 'projection' key)
    """

    G = nx.Graph()
    G.add_edges_from(PG["Edges"][0].tolist(), weight=1)
    mat_conn = nx.to_scipy_sparse_array(
        G, nodelist=np.arange(len(PG["NodePositions"])), weight="weight"
    )

    # partition points
    node_id, node_dist = PartitionData(
        X=X,
        NodePositions=PG["NodePositions"],
        MaxBlockSize=len(PG["NodePositions"]) ** 4,
        SquaredX=np.sum(X ** 2, axis=1, keepdims=1),
    )
    # project points onto edges
    dict_proj = project_point_onto_graph(
        X=X,
        NodePositions=PG["NodePositions"],
        Edges=PG["Edges"][0],
        Partition=node_id,
    )

    PG["projection"] = {}
    PG["projection"]["node_id"] = node_id.flatten()
    PG["projection"]["node_dist"] = node_dist
    PG["projection"]["edge_id"] = dict_proj["EdgeID"].astype(int)
    PG["projection"]["edge_loc"] = dict_proj["ProjectionValues"]
    PG["projection"]["X_projected"] = dict_proj["X_projected"]
    PG["projection"]["conn"] = mat_conn
    PG["projection"]["edge_len"] = dict_proj["EdgeLen"]
    PG["projection"]["MSEP"] = dict_proj["MSEP"]


[docs]def getPseudotime(
    X, PG, source, target=None, nodes_to_include=None, project=True
):
    """Compute pseudotime given
    source: int
        source node
    target: int
        optional target node
    nodes_to_include: list
        optional nodes to include in the path
        (useful for complex topologies with loops,
        where multiple paths are possible between 2 nodes)
    project: bool
        if False, will save computation time by using the projection already stored in PG dict (computed using elpigraph.utils.getProjection)
    """
    if project is True:
        getProjection(X, PG)
    elif project is False and not ("projection" in PG.keys()):
        raise ValueError(
            "key 'projection not found in PG. To use a precomputed projection"
            " run elpigraph.utils.getProjection"
        )

    epg_edge = PG["Edges"][0]
    epg_edge_len = PG["projection"]["edge_len"]
    G = nx.Graph()
    edges_weighted = list(zip(epg_edge[:, 0], epg_edge[:, 1], epg_edge_len))
    G.add_weighted_edges_from(edges_weighted, weight="len")
    if target is not None:
        if nodes_to_include is None:
            # nodes on the shortest path
            nodes_sp = nx.shortest_path(
                G, source=source, target=target, weight="len"
            )
        else:
            assert isinstance(
                nodes_to_include, list
            ), "`nodes_to_include` must be list"
            # lists of simple paths, in order from shortest to longest
            list_paths = list(
                nx.shortest_simple_paths(
                    G, source=source, target=target, weight="len"
                )
            )
            flag_exist = False
            for p in list_paths:
                if set(nodes_to_include).issubset(p):
                    nodes_sp = p
                    flag_exist = True
                    break
            if not flag_exist:
                return f"no path that passes {nodes_to_include} exists"
    else:
        nodes_sp = [source] + [v for u, v in nx.bfs_edges(G, source)]
    G_sp = G.subgraph(nodes_sp).copy()
    index_nodes = {
        x: nodes_sp.index(x) if x in nodes_sp else G.number_of_nodes()
        for x in G.nodes
    }

    if target is None:
        dict_dist_to_source = nx.shortest_path_length(
            G_sp, source=source, weight="len"
        )
    else:
        dict_dist_to_source = dict(
            zip(
                nodes_sp,
                np.cumsum(
                    np.array(
                        [0.0]
                        + [
                            G.get_edge_data(nodes_sp[i], nodes_sp[i + 1])[
                                "len"
                            ]
                            for i in range(len(nodes_sp) - 1)
                        ]
                    )
                ),
            )
        )

    cells = np.isin(PG["projection"]["node_id"], nodes_sp)
    id_edges_cell = PG["projection"]["edge_id"][cells].tolist()
    edges_cell = PG["Edges"][0][id_edges_cell, :]
    len_edges_cell = PG["projection"]["edge_len"][id_edges_cell]

    # proportion on the edge
    prop_edge = np.clip(PG["projection"]["edge_loc"][cells], a_min=0, a_max=1)

    dist_to_source = []
    for i in np.arange(edges_cell.shape[0]):
        if index_nodes[edges_cell[i, 0]] > index_nodes[edges_cell[i, 1]]:
            dist_to_source.append(dict_dist_to_source[edges_cell[i, 1]])
            prop_edge[i] = 1 - prop_edge[i]
        else:
            dist_to_source.append(dict_dist_to_source[edges_cell[i, 0]])
    dist_to_source = np.array(dist_to_source)
    dist_on_edge = len_edges_cell * prop_edge
    dist = dist_to_source + dist_on_edge

    PG["pseudotime"] = np.repeat(np.nan, len(X))
    PG["pseudotime"][cells] = dist
    PG["pseudotime_params"] = {
        "source": source,
        "target": target,
        "nodes_to_include": nodes_to_include,
    }


#### supervised knn


def _longform_knn_to_sparse(dis, idx):
    row_ind = np.tile(np.arange(len(idx))[:, None], idx.shape[1])
    col_ind = idx
    return scipy.sparse.csr_matrix((dis.flat, (row_ind.flat, col_ind.flat)))


[docs]def getWeights(
    X, bandwidth=1, griddelta=100, exponent=1, method="sklearn", **kwargs
):
    """Get point weights as the inverse density of data
    X: np.array, (n_sample x n_dims)
    bandwidth: sklearn KernelDensity bandwidth if method == 'sklearn'
    griddelta: FFTKDE grid step size if method =='fft'
    exponent: density values are raised to the power of exponent
    """
    if method == "sklearn":
        kde = KernelDensity(bandwidth=bandwidth, **kwargs
        ).fit(X)
        scores = kde.score_samples(X)
        scores = np.exp(scores)[:, None]

    elif method == "fft":
        import KDEpy

        kde = KDEpy.FFTKDE(bw=bandwidth,**kwargs).fit(X)
        x, y = kde.evaluate(griddelta)
        scores = scipy.interpolate.griddata(x, y, X)

    p = 1 / (scores ** exponent)
    p /= np.sum(p)
    return p


def ordinal_neighbors_stagewise_longform(
    X, stages_labels, stages=None, k=15, radius=None, m="cosine"
):
    """Supervised (ordinal) nearest-neighbor search.
    Stages is an ordered list of stages labels (low to high). If None, taken as np.unique(stages_labels)"""

    if stages is None:
        stages = np.unique(stages_labels)

    knn_distances = np.zeros((len(X), 3 * k))
    knn_idx = np.zeros((len(X), 3 * k), dtype=int)

    nn_stage = {}
    for s in stages:
        nn_stage[s] = NearestNeighbors(n_neighbors=k, metric=m).fit(
            X[stages_labels.values == s, :]
        )

    s = []
    t = []
    w = []
    for i in range(len(stages) - 1):
        dis, ind = nn_stage[stages[i]].kneighbors(
            X[stages_labels.values == stages[i + 1], :], k
        )
        knn_distances[stages_labels.values == stages[i + 1], :k] = dis
        knn_idx[stages_labels.values == stages[i + 1], :k] = np.where(
            stages_labels.values == stages[i]
        )[0][ind]

    for i in range(1, len(stages)):
        dis, ind = nn_stage[stages[i]].kneighbors(
            X[stages_labels.values == stages[i - 1], :], k
        )
        knn_distances[stages_labels.values == stages[i - 1], k : 2 * k] = dis
        knn_idx[stages_labels.values == stages[i - 1], k : 2 * k] = np.where(
            stages_labels.values == stages[i]
        )[0][ind]

    for i in range(len(stages)):
        if i == 0:
            dis, ind = nn_stage[stages[i]].kneighbors(
                X[stages_labels.values == stages[i], :], 2 * k + 1
            )
            dis, ind = dis[:, 1:], ind[:, 1:]
            knn_distances[
                np.argwhere(stages_labels.values == stages[i]),
                list(range(k)) + list(range(2 * k, 3 * k)),
            ] = dis
            knn_idx[
                np.argwhere(stages_labels.values == stages[i]),
                list(range(k)) + list(range(2 * k, 3 * k)),
            ] = np.where(stages_labels.values == stages[i])[0][ind]

        elif i == (len(stages) - 1):
            dis, ind = nn_stage[stages[i]].kneighbors(
                X[stages_labels.values == stages[i], :], 2 * k + 1
            )
            dis, ind = dis[:, 1:], ind[:, 1:]
            knn_distances[stages_labels.values == stages[i], k:] = dis
            knn_idx[stages_labels.values == stages[i], k:] = np.where(
                stages_labels.values == stages[i]
            )[0][ind]

        else:
            dis, ind = nn_stage[stages[i]].kneighbors(
                X[stages_labels.values == stages[i], :], k + 1
            )
            dis, ind = dis[:, 1:], ind[:, 1:]
            knn_distances[stages_labels.values == stages[i], 2 * k :] = dis
            knn_idx[stages_labels.values == stages[i], 2 * k :] = np.where(
                stages_labels.values == stages[i]
            )[0][ind]

    _sort = np.argsort(knn_distances, axis=1)
    knn_distances = knn_distances[
        np.arange(len(knn_distances))[:, None], _sort
    ]
    knn_idx = knn_idx[np.arange(len(knn_distances))[:, None], _sort]

    return knn_distances, knn_idx


def ordinal_neighbors_longform(
    X, stages_labels, stages=None, k=15, m="cosine"
):
    """Supervised (ordinal) nearest-neighbor search.
    Stages is an ordered list of stages labels (low to high). If None, taken as np.unique(stages_labels)"""
    if stages is None:
        stages = np.unique(stages_labels)

    knn_distances = np.zeros((len(X), k))
    knn_idx = np.zeros((len(X), k), dtype=int)
    for i in range(len(stages)):

        sel_points = (
            (stages_labels.values == stages[i])
            | (stages_labels.values == stages[max(0, i - 1)])
            | (stages_labels.values == stages[min(i + 1, len(stages) - 1)])
        )

        stage = stages_labels.values == stages[i]
        dis, ind = (
            NearestNeighbors(n_neighbors=k, metric=m)
            .fit(X[sel_points, :])
            .kneighbors(X[stage, :])
        )

        knn_distances[stage, :] = dis
        knn_idx[stage, :] = np.where(sel_points)[0][ind]
    return knn_distances, knn_idx


def supervised_knn(
    X,
    stages_labels,
    stages=None,
    n_neighbors=15,
    n_natural=0,
    m="cosine",
    method="force",
    return_sparse=False,
):
    """Supervised (ordinal) nearest-neighbor search.
    Stages is an ordered list of stages labels (low to high). If None, taken as np.unique(stages_labels)

    Parameters
    ----------
    method : str (default='force')
        if 'force', searches for each point at stage[i] n_neighbors nearest_neighbors, forcing:
            - n_neighbors/3 to be from stage[i-1]
            - n_neighbors/3 to be from stage[i]
            - n_neighbors/3 to be from stage[i+1]
            For stage[0] and stage[-1], 2*n_neighbors/3 are taken from stage[i]

        if 'guide', searches for each point at stage[i] n_neighbors nearest_neighbors
            from points in {stage[i-1], stage[i], stage[i+1]}, without constraints on proportions
    """

    if n_neighbors % 3 != 0:
        raise ValueError("Please provide n_neighbors divisible by 3")
    if stages is None:
        stages = np.unique(stages_labels)

    dis, idx = (
        NearestNeighbors(n_neighbors=n_neighbors, metric=m, n_jobs=8)
        .fit(X)
        .kneighbors()
    )

    if method == "guide":
        knn_distances, knn_idx = ordinal_neighbors_longform(
            X, stages_labels, stages=stages, k=n_neighbors, m=m
        )
    if method == "force":
        knn_distances, knn_idx = ordinal_neighbors_stagewise_longform(
            X, stages_labels, stages=stages, k=n_neighbors // 3, m=m
        )

    # ---mix natural nn with ordinal nn
    merged_idx = np.zeros((len(X), n_neighbors), dtype=np.int32)
    merged_dists = np.zeros((len(X), n_neighbors))
    for i in range(len(X)):
        merged_idx[i][:n_natural] = idx[i][:n_natural]
        merged_idx[i][n_natural:] = np.setdiff1d(
            knn_idx[i], idx[i][:n_natural], assume_unique=True
        )[: n_neighbors - n_natural]

        merged_dists[i][:n_natural] = dis[i][:n_natural]
        merged_dists[i][n_natural:] = knn_distances[i][
            np.isin(
                knn_idx[i],
                np.setdiff1d(
                    knn_idx[i], idx[i][:n_natural], assume_unique=True
                )[: n_neighbors - n_natural],
            )
        ]

    if return_sparse:
        return _longform_knn_to_sparse(merged_dists, merged_idx)
    else:
        return merged_dists, merged_idx


def geodesic_pseudotime(X, k, root, g=None):
    """pseudotime as graph distance from root point"""
    if g is None:
        nn = NearestNeighbors(n_neighbors=k, n_jobs=8).fit(X)
        g = nx.convert_matrix.from_scipy_sparse_array(
            nn.kneighbors_graph(mode="distance")
        )
    else:
        g = nx.convert_matrix.from_scipy_sparse_array(g)
    if len(list(nx.connected_components(g))) > 1:
        raise ValueError(
            f"detected more than 1 components with k={k} neighbors. Please"
            " increase k"
        )
    lengths = nx.single_source_dijkstra_path_length(g, root)
    pseudotime = np.array(pd.Series(lengths).sort_index())
    return pseudotime


def proj2embedding(X_elpi,X_embed,NodePositions):
    part,dis = PartitionData(X_elpi,NodePositions,50000,np.sum(X_elpi**2,axis=1,keepdims=1))
    # Hard assigment
    R = scipy.sparse.csr_matrix(
        (np.ones(len(X_elpi)), (range(len(X_elpi)), part.flat)), (len(X_elpi),len(NodePositions))
    ).A
    proj = (np.dot(X_embed.T, R) / R.sum(axis=0)).T
    return proj


def _propagate_labels(Xs,Xt,ys,flavor,n_neighbors,reg_e=0.01,reg_m=0.08):
    if 'ot' in flavor:
        try: import ot
        except: raise ImportError('Install Python OT to use this function'+
            '(pip install POT or conda install -c conda-forge pot)')
            
    if flavor=='ot':
        coupling = ot.sinkhorn([], [], ot.da.cost_normalization(ot.dist(Xs,Xt),'max'),reg=reg_e,numItermax=1000000)
    elif flavor=='ot_unbalanced':
        coupling = ot.sinkhorn_unbalanced([], [], ot.da.cost_normalization(ot.dist(Xs,Xt),'max'),reg_e,reg_m,numItermax=1000000)
    elif flavor=='ot_equal':
        ws=np.ones(len(ys))
        ws[ys==0]/=sum(ys==0)
        ws[ys==1]/=sum(ys==1)
        ws/=sum(ws)
        coupling = ot.sinkhorn(ws, [], ot.da.cost_normalization(ot.dist(Xs,Xt),'max'), reg=reg_e,numItermax=1000000)
    elif flavor=='knn':
        coupling = NearestNeighbors(n_neighbors=n_neighbors).fit(Xs).kneighbors_graph(Xt).toarray().T
    else: 
        raise ValueError('method must be ot, ot_unbalanced, ot_equal, knn')
        
    labels=[]
    for j,coup in enumerate(coupling.T):
        nz=coup>0
        labels.append(np.bincount(ys[nz],weights=coup[nz].flat).argmax())
        #labels.append(ys[coup.argmax()])

    return labels

def early_groups(X, PG, branch_nodes, source, target, nodes_to_include = None,
                 flavor='ot_unbalanced', n_windows = 20, n_neighbors=20, ot_reg_e=0.01,ot_reg_m=0.001):

    getPseudotime(X, PG, source, target=target, nodes_to_include=nodes_to_include, project=True)
    ix=~np.isnan(PG['pseudotime'])
    
    y = -np.ones(len(X),dtype=int)

    bnodes_ix = np.zeros(len(X),dtype=bool)
    for bn in branch_nodes:
        nix = PG['projection']['node_id']==bn
        bnodes_ix[nix] = True
        y[nix] = bn
        
    Xpath, ypath = X[ix].copy(), y[ix].copy() # points from path 
    Xbranch, ybranch = X[bnodes_ix].copy(), y[bnodes_ix].copy() # points from branch
    
    #propagate labels
    bpseudotime = PG['pseudotime'][ix]
    count, bins = np.histogram(bpseudotime, bins=n_windows)
    clusters = np.digitize(bpseudotime, bins[1:], right=True)

    #init 
    cnext = clusters==clusters.max()
    ypath[cnext] = _propagate_labels(Xs=Xbranch,Xt=Xpath[cnext],ys=ybranch,flavor=flavor,n_neighbors=n_neighbors)

    for c in np.arange(1,n_windows)[::-1]:
        cprev = clusters==c
        cnext = clusters==(c-1)
        ypath[cnext] = _propagate_labels(Xs=Xpath[cprev],Xt=Xpath[cnext],ys=ypath[cprev],
                                    flavor=flavor,n_neighbors=n_neighbors,reg_e=ot_reg_e,reg_m=ot_reg_m)
        
    s = '-'.join(str(x) for x in branch_nodes)
    
    y_groups = y.copy()
    y_groups[ix] = ypath
    PG[f'early_groups_{source}->{s}'] = y_groups.astype(str)
    
    y_clus = y.astype(str)
    y_clus[ix] = ['c'+c for c in clusters.astype(str)]
    PG[f'early_groups_{source}->{s}_clusters'] = y_clus.copy()

def residuals(X, Xr, nodep, r2_threshold=.5):

    partition = PartitionData(
        X=Xr,
        NodePositions=nodep,
        MaxBlockSize=100000,
        TrimmingRadius=np.inf,
        SquaredX=np.sum(Xr ** 2, axis=1, keepdims=1),
    )[0]

    means, residue_matrix, r2scores = _residuals_matrix(X, partition.flatten(), len(nodep))
    ind = np.where(r2scores > r2_threshold)[0]
    return (means, residue_matrix, r2scores, ind)

def _residuals_matrix(X, partition, n_nodes):
    # Building mapping partition -> set of points
    inds = [[] for _ in range(n_nodes)]
    for i in range(X.shape[0]):
        inds[partition[i]].append(i)
    if any([len(ind) == 0 for ind in inds]):
        print("Warning: empty nodes.")

    # Computing barycenter of each cluster
    means = np.array(
        [
            (np.mean(X[ind, :], axis=0) if len(ind) > 0 else np.zeros((X.shape[1],)))
            for ind in inds
        ]
    )

    # Computing residuals
    residue_matrix = means[partition, :]
    r2_scores = np.var(residue_matrix, axis=0) / np.var(X, axis=0)
    
    return((means, residue_matrix, r2_scores))