def flow():
    """
    Toy experiment: recover hidden name assignments for a set of annotations
    from noisy pairwise match scores, comparing several clustering and
    graph-cut approaches (score normalization, MeanShift, hierarchical
    clustering, HDBSCAN, and a dead PyMaxflow demo).

    References:
        http://pmneila.github.io/PyMaxflow/maxflow.html#maxflow-fastmin

    Requires:
        pip install PyMaxFlow
        pip install pystruct
        pip install hdbscan
    """
    # Toy problem representing attempting to discover names via annotation
    # scores
    import pystruct  # NOQA
    import pystruct.models  # NOQA
    import networkx as netx  # NOQA
    import vtool as vt

    # Ground truth: each of the 10 annotations gets a random hidden name id.
    num_annots = 10
    num_names = num_annots
    hidden_nids = np.random.randint(0, num_names, num_annots)
    unique_nids, groupxs = vt.group_indices(hidden_nids)

    # Score model parameters keyed by "is same name".
    # NOTE(review): same-name pairs draw from mu=1.0 and different-name pairs
    # from mu=7.0, i.e. LOWER scores mean "same" here -- confirm intended.
    toy_params = {
        True: {'mu': 1.0, 'sigma': 2.2},
        False: {'mu': 7.0, 'sigma': .9}
    }

    if True:
        # Visualize the two score distributions.
        import plottool as pt
        xdata = np.linspace(0, 100, 1000)
        tp_pdf = vt.gauss_func1d(xdata, **toy_params[True])
        fp_pdf = vt.gauss_func1d(xdata, **toy_params[False])
        # FIX: second legend label was 'TF'; it annotates the FP curve.
        pt.plot_probabilities([tp_pdf, fp_pdf], ['TP', 'FP'], xdata=xdata)

    def metric(aidx1, aidx2, hidden_nids=hidden_nids, toy_params=toy_params):
        """Deterministic noisy score between two annotation indices.

        Seeding the rng from (aidx1 + aidx2) makes the score symmetric and
        repeatable across calls.  Returns 0 for identical indices and a
        non-negative draw from the appropriate gaussian otherwise.
        """
        if aidx1 == aidx2:
            return 0
        rng = np.random.RandomState(int(aidx1 + aidx2))
        same = hidden_nids[int(aidx1)] == hidden_nids[int(aidx2)]
        mu, sigma = ut.dict_take(toy_params[same], ['mu', 'sigma'])
        return np.clip(rng.normal(mu, sigma), 0, np.inf)

    # Build the full (num_annots x num_annots) pairwise score / label arrays.
    pairwise_aidxs = list(ut.iprod(range(num_annots), range(num_annots)))
    pairwise_labels = np.array([hidden_nids[a1] == hidden_nids[a2]
                                for a1, a2 in pairwise_aidxs])
    pairwise_scores = np.array([metric(*zz) for zz in pairwise_aidxs])
    pairwise_scores_mat = pairwise_scores.reshape(num_annots, num_annots)
    if num_annots <= 10:
        print(ut.repr2(pairwise_scores_mat, precision=1))

    # Exploratory alternatives kept for reference:
    #aids = list(range(num_annots))
    #g = netx.DiGraph()
    #g.add_nodes_from(aids)
    #g.add_edges_from([(tup[0], tup[1], {'weight': score}) for tup, score in zip(pairwise_aidxs, pairwise_scores) if tup[0] != tup[1]])
    #netx.draw_graphviz(g)
    #pr = netx.pagerank(g)

    # Learn a score normalizer from the labeled pairwise scores.
    X = pairwise_scores
    Y = pairwise_labels
    encoder = vt.ScoreNormalizer()
    encoder.fit(X, Y)
    encoder.visualize()

    # --- MeanShift clustering on the 1-d scores ---
    # FIX: `import sklearn` alone does not guarantee the `cluster` submodule
    # is loaded; import it explicitly.
    import sklearn.cluster
    bandwidth = sklearn.cluster.estimate_bandwidth(X[:, None])  # , quantile=quantile, n_samples=500)
    assert bandwidth != 0, ('bandwidth is 0. Cannot cluster')
    # bandwidth is with respect to the RBF used in clustering
    #ms = sklearn.cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=True)
    ms = sklearn.cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True, cluster_all=False)
    ms.fit(X[:, None])
    label_arr = ms.labels_
    unique_labels = np.unique(label_arr)
    max_label = max(0, unique_labels.max())
    num_orphans = (label_arr == -1).sum()
    # Promote each orphan (label -1) to its own fresh singleton label.
    label_arr[label_arr == -1] = np.arange(max_label + 1, max_label + 1 + num_orphans)

    # Cluster annotation *indices* so the custom metric supplies distances.
    X_data = np.arange(num_annots)[:, None].astype(np.int64)

    #graph = pystruct.models.GraphCRF(
    #    n_states=None,
    #    n_features=None,
    #    inference_method='lp',
    #    class_weight=None,
    #    directed=False,
    #)

    # --- Hierarchical clustering with the custom pairwise metric ---
    import scipy
    import scipy.cluster
    import scipy.cluster.hierarchy
    thresh = 2.0
    labels = scipy.cluster.hierarchy.fclusterdata(X_data, thresh, metric=metric)
    unique_lbls, lblgroupxs = vt.group_indices(labels)
    print(groupxs)
    print(lblgroupxs)
    print('groupdiff = %r' % (ut.compare_groupings(groupxs, lblgroupxs),))
    print('common groups = %r' % (ut.find_grouping_consistencies(groupxs, lblgroupxs),))
    #X_data, seconds_thresh, criterion='distance')

    # --- HDBSCAN with the same custom metric ---
    #help(hdbscan.HDBSCAN)
    import hdbscan
    alg = hdbscan.HDBSCAN(metric=metric, min_cluster_size=1, p=1, gen_min_span_tree=1, min_samples=2)
    labels = alg.fit_predict(X_data)
    # Promote noise points (-1) to fresh singleton labels, as above.
    labels[labels == -1] = np.arange(np.sum(labels == -1)) + labels.max() + 1
    unique_lbls, lblgroupxs = vt.group_indices(labels)
    print(groupxs)
    print(lblgroupxs)
    print('groupdiff = %r' % (ut.compare_groupings(groupxs, lblgroupxs),))
    print('common groups = %r' % (ut.find_grouping_consistencies(groupxs, lblgroupxs),))

    #import ddbscan
    #help(ddbscan.DDBSCAN)
    #alg = ddbscan.DDBSCAN(2, 2)
    #D = np.zeros((len(aids), len(aids) + 1))
    #D.T[-1] = np.arange(len(aids))
    ## Can alpha-expansion be used when the pairwise potentials are not in a grid?
    #hidden_ut.group_items(aids, hidden_nids)

    if False:
        # Dead PyMaxflow demo (never runs).
        import maxflow
        #from maxflow import fastmin
        # Create a graph with integer capacities.
        g = maxflow.Graph[int](2, 2)
        # Add two (non-terminal) nodes. Get the index to the first one.
        nodes = g.add_nodes(2)
        # Create two edges (forwards and backwards) with the given capacities.
        # The indices of the nodes are always consecutive.
        g.add_edge(nodes[0], nodes[1], 1, 2)
        # Set the capacities of the terminal edges...
        # ...for the first node.
        g.add_tedge(nodes[0], 2, 5)
        # ...for the second node.
        g.add_tedge(nodes[1], 9, 4)
        # NOTE(review): `g` is rebound here, so maxflow()/get_segment() run on
        # an EMPTY float graph, not the one built above -- looks like a bug,
        # but the branch is unreachable; confirm before enabling.
        g = maxflow.Graph[float](2, 2)
        g.maxflow()
        g.get_nx_graph()
        g.get_segment(nodes[0])