# -*- coding: utf-8 -*-
"""
smk_index
This module contains functions for the SelectiveMatchKernel's inverted index.

TODO::
    * Test suite with 1000k images
    * Extend for SMK with labels
    * Test get numbers and refine
    * External keypoint specific weighting
"""
from __future__ import absolute_import, division, print_function
#import six
import utool  # NOQA
import utool as ut
#import weakref
import numpy as np
import six  # NOQA
import pyflann
#import pandas as pd
from six.moves import zip, map, range  # NOQA
from vtool import clustering2 as clustertool
from ibeis.algo.hots import hstypes
from ibeis.algo.hots.smk import smk_scoring
from ibeis.algo.hots.smk import smk_residuals
(print, print_, printDBG, rrr, profile) = ut.inject(__name__, '[smk_index]')

USE_CACHE_WORDS = not ut.get_argflag('--nocache-words')
WITH_TOTALTIME = True


#@ut.memprof
@profile
def learn_visual_words(ibs, config2_=None, use_cache=USE_CACHE_WORDS, memtrack=None):
    """
    Computes and caches visual words

    Args:
        ibs (IBEISController): ibeis controller object
        config2_ (QueryParams): configuration with vocab hyper-parameters
            (nWords, vocab_taids, vocab_init_method, ...)
        use_cache (bool): turns on disk based caching (default = True)
        memtrack (None): (default = None)

    Returns:
        ndarray[uint8_t, ndim=2]: words - aggregate descriptor cluster centers

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, taids, daids, qaids, qreq_, nWords = smk_debug.testdata_dataframe()
        >>> use_cache = True
        >>> words = learn_visual_words(ibs, qreq_)
        >>> print(words.shape)
        (8000, 128)

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> import ibeis
        >>> ibs = ibeis.opendb('PZ_Master1')
        >>> config2_ = ibs.new_query_params(cfgdict=dict(nWords=128000))
        >>> use_cache = True
        >>> words = learn_visual_words(ibs, config2_)
        >>> print(words.shape)
        (128000, 128)

    Auto:
        from ibeis.algo.hots.smk import smk_index
        import utool as ut
        argdoc = ut.make_default_docstr(smk_index.learn_visual_words)
        print(argdoc)
    """
    #if memtrack is None:
    #    memtrack = ut.MemoryTracker('[learn_visual_words]')
    #config2_ = qreq_.get_external_data_config2()
    nWords = config2_.nWords
    # TODO: Incorporate taids (vocab training ids) into qreq
    if config2_.vocab_taids == 'all':
        taids = ibs.get_valid_aids(species=ibs.get_primary_database_species())  # exemplar
    else:
        taids = config2_.vocab_taids
    initmethod = config2_.vocab_init_method
    max_iters = config2_.vocab_nIters
    flann_params = config2_.vocab_flann_params
    train_vecs_list = ibs.get_annot_vecs(taids, eager=True, config2_=config2_)
    #memtrack.track_obj(train_vecs_list[0], 'train_vecs_list[0]')
    #memtrack.report('loaded trainvecs')
    train_vecs = np.vstack(train_vecs_list)
    #memtrack.track_obj(train_vecs, 'train_vecs')
    #memtrack.report('stacked trainvecs')
    del train_vecs_list
    print('[smk_index] Train Vocab(nWords=%d) using %d annots and %d descriptors' %
          (nWords, len(taids), len(train_vecs)))
    kwds = dict(max_iters=max_iters, use_cache=use_cache, initmethod=initmethod,
                appname='smk', flann_params=flann_params)
    words = clustertool.cached_akmeans(train_vecs, nWords, **kwds)
    del train_vecs
    del kwds
    #memtrack.report('returning words')
    #del train_vecs_list
    return words


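# Illustrative sketch (not part of the original module): learn_visual_words
# above delegates the clustering to clustertool.cached_akmeans. The
# hypothetical helper below shows the same idea (pick nWords centroids from
# stacked descriptors) with a few plain-numpy Lloyd iterations, which is
# enough to see what the returned "words" array is.
def _demo_learn_toy_words(nWords=8, nIters=5, rng_seed=0):
    """ toy k-means vocabulary on random 2D 'descriptors' """
    rng = np.random.RandomState(rng_seed)
    train_vecs = rng.rand(200, 2)
    # initialize words from randomly chosen training vectors
    words = train_vecs[rng.choice(len(train_vecs), nWords, replace=False)].copy()
    for _ in range(nIters):
        # assign each vector to its nearest word
        dists = ((train_vecs[:, None, :] - words[None, :, :]) ** 2).sum(axis=2)
        assign = dists.argmin(axis=1)
        # move each word to the mean of its assigned vectors
        for wx in range(nWords):
            members = train_vecs[assign == wx]
            if len(members) > 0:
                words[wx] = members.mean(axis=0)
    return words  # shape (nWords, 2); the real vocab is (nWords, 128) uint8

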
@profile
def assign_to_words_(wordflann, words, idx2_vec, nAssign, massign_alpha,
                     massign_sigma, massign_equal_weights):
    """
    Assigns descriptor-vectors to nearest word.

    Args:
        wordflann (FLANN): nearest neighbor index over words
        words (ndarray): vocabulary words
        idx2_vec (ndarray): descriptors to assign
        nAssign (int): number of words to assign each descriptor to
        massign_alpha (float): multiple-assignment ratio threshold
        massign_sigma (float): multiple-assignment gaussian variance
        massign_equal_weights (bool): assign equal weight to all multiassigned words

    Returns:
        tuple: inverted index, multi-assigned weights, and forward index
            formatted as::

                * wx2_idxs - word index -> vector indexes
                * wx2_maws - word index -> multi-assignment weights
                * idx2_wxs - vector index -> assigned word indexes

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, qreq_ = smk_debug.testdata_raw_internals0()
        >>> words = invindex.words
        >>> wordflann = invindex.wordflann
        >>> idx2_vec = invindex.idx2_dvec
        >>> nAssign = qreq_.qparams.nAssign
        >>> massign_alpha = qreq_.qparams.massign_alpha
        >>> massign_sigma = qreq_.qparams.massign_sigma
        >>> massign_equal_weights = qreq_.qparams.massign_equal_weights
        >>> _dbargs = (wordflann, words, idx2_vec, nAssign, massign_alpha, massign_sigma, massign_equal_weights)
        >>> wx2_idxs, wx2_maws, idx2_wxs = assign_to_words_(*_dbargs)
    """
    if ut.VERBOSE:
        print('[smk_index.assign] +--- Start Assign vecs to words.')
        print('[smk_index.assign] * nAssign=%r' % nAssign)
    if not ut.QUIET:
        print('[smk_index.assign] assign_to_words_. len(idx2_vec) = %r' % len(idx2_vec))
    # Assign each vector to the nearest visual words
    assert nAssign > 0, 'cannot assign to 0 neighbors'
    try:
        _idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec, nAssign)
    except pyflann.FLANNException as ex:
        ut.printex(ex, 'probably misread the cached flann_fpath=%r' %
                   (wordflann.flann_fpath,))
        raise
    _idx2_wx.shape = (idx2_vec.shape[0], nAssign)
    _idx2_wdist.shape = (idx2_vec.shape[0], nAssign)
    if nAssign > 1:
        idx2_wxs, idx2_maws = compute_multiassign_weights_(
            _idx2_wx, _idx2_wdist, massign_alpha, massign_sigma,
            massign_equal_weights)
    else:
        idx2_wxs = _idx2_wx.tolist()
        idx2_maws = [[1.0]] * len(idx2_wxs)
    # Invert mapping -- Group by word indexes
    jagged_idxs = ([idx] * len(wxs) for idx, wxs in enumerate(idx2_wxs))
    wx_keys, groupxs = clustertool.jagged_group(idx2_wxs)
    idxs_list = clustertool.apply_jagged_grouping(jagged_idxs, groupxs)
    maws_list = clustertool.apply_jagged_grouping(idx2_maws, groupxs)
    wx2_idxs = dict(zip(wx_keys, idxs_list))
    wx2_maws = dict(zip(wx_keys, maws_list))
    if ut.VERBOSE:
        print('[smk_index.assign] L___ End Assign vecs to words.')
    return wx2_idxs, wx2_maws, idx2_wxs


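# Illustrative sketch (not part of the original module): after each descriptor
# is assigned to one or more words, assign_to_words_ inverts that mapping so
# the index can be walked word-by-word. The hypothetical helper below builds
# the same wx2_idxs / wx2_maws structure with plain dicts instead of
# vtool.clustering2's jagged grouping.
def _demo_invert_assignments():
    """ toy inversion of descriptor->word assignments """
    # descriptor index -> assigned word indexes and multi-assign weights
    idx2_wxs  = [[0, 2], [2], [1, 2]]
    idx2_maws = [[0.7, 0.3], [1.0], [0.6, 0.4]]
    wx2_idxs = {}
    wx2_maws = {}
    for idx, (wxs, maws) in enumerate(zip(idx2_wxs, idx2_maws)):
        for wx, maw in zip(wxs, maws):
            wx2_idxs.setdefault(wx, []).append(idx)
            wx2_maws.setdefault(wx, []).append(maw)
    # wx2_idxs == {0: [0], 1: [2], 2: [0, 1, 2]}
    return wx2_idxs, wx2_maws

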
@profile
def compute_multiassign_weights_(_idx2_wx, _idx2_wdist, massign_alpha,
                                 massign_sigma, massign_equal_weights):
    r"""
    Multi Assignment Filtering from Improving Bag of Features

    Args:
        _idx2_wx (ndarray): assigned word indexes per descriptor
        _idx2_wdist (ndarray): squared distances to the assigned words
        massign_alpha (float): ratio threshold on the distance to the nearest word
        massign_sigma (float): gaussian variance for soft weighting
        massign_equal_weights (bool): Turns off soft weighting. Gives all
            assigned vectors weight 1

    Returns:
        tuple: (idx2_wxs, idx2_maws)

    References:
        (Improving Bag of Features)
        http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
        (Lost in Quantization)
        http://www.robots.ox.ac.uk/~vgg/publications/papers/philbin08.ps.gz
        (A Context Dissimilarity Measure for Accurate and Efficient Image Search)
        https://lear.inrialpes.fr/pubs/2007/JHS07/jegou_cdm.pdf

    Notes:
        sigma values from \cite{philbin_lost08}:
        (70 ** 2) ~= 5000,
        (80 ** 2) ~= 6400,
        (86 ** 2) ~= 7500,

    Auto:
        from ibeis.algo.hots.smk import smk_index
        import utool as ut; print(ut.make_default_docstr(smk_index.compute_multiassign_weights_))
    """
    if not ut.QUIET:
        print('[smk_index.assign] compute_multiassign_weights_')
    # Valid word assignments are beyond fraction of distance to the nearest word
    massign_thresh = _idx2_wdist.T[0:1].T.copy()
    # HACK: If the nearest word has distance 0 then this threshold is too hard
    # so we should use the distance to the second nearest word.
    flag_too_close = (massign_thresh == 0)
    massign_thresh[flag_too_close] = _idx2_wdist.T[1:2].T[flag_too_close]
    # Compute the threshold fraction
    np.add(.001, massign_thresh, out=massign_thresh)
    np.multiply(massign_alpha, massign_thresh, out=massign_thresh)
    invalid = np.greater_equal(_idx2_wdist, massign_thresh)
    if ut.VERBOSE:
        _ = (invalid.size - invalid.sum(), invalid.size)
        print('[smk_index.assign] + massign_alpha = %r' % (massign_alpha,))
        print('[smk_index.assign] + massign_sigma = %r' % (massign_sigma,))
        print('[smk_index.assign] + massign_equal_weights = %r' % (massign_equal_weights,))
        print('[smk_index.assign] * Marked %d/%d assignments as invalid' % _)
    if massign_equal_weights:
        # Performance hack from jegou paper: just give everyone equal weight
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        idx2_wxs = list(map(ut.filter_Nones, masked_wxs.tolist()))
        #ut.embed()
        if ut.DEBUG2:
            assert all([isinstance(wxs, list) for wxs in idx2_wxs])
        idx2_maws = [np.ones(len(wxs), dtype=np.float32) for wxs in idx2_wxs]
    else:
        # More natural weighting scheme
        # Weighting as in Lost in Quantization
        gauss_numer = -_idx2_wdist.astype(np.float64)
        gauss_denom = 2 * (massign_sigma ** 2)
        gauss_exp = np.divide(gauss_numer, gauss_denom)
        unnorm_maw = np.exp(gauss_exp)
        # Mask invalid multiassignment weights
        masked_unorm_maw = np.ma.masked_array(unnorm_maw, mask=invalid)
        # Normalize multiassignment weights from 0 to 1
        masked_norm = masked_unorm_maw.sum(axis=1)[:, np.newaxis]
        masked_maw = np.divide(masked_unorm_maw, masked_norm)
        masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
        # Remove masked weights and word indexes
        idx2_wxs = list(map(ut.filter_Nones, masked_wxs.tolist()))
        idx2_maws = list(map(ut.filter_Nones, masked_maw.tolist()))
        #with ut.EmbedOnException():
        if ut.DEBUG2:
            checksum = [sum(maws) for maws in idx2_maws]
            for x in np.where([not ut.almost_eq(val, 1) for val in checksum])[0]:
                print(checksum[x])
                print(_idx2_wx[x])
                print(masked_wxs[x])
                print(masked_maw[x])
                print(massign_thresh[x])
                print(_idx2_wdist[x])
            #all([ut.almost_eq(x, 1) for x in checksum])
            assert all([ut.almost_eq(val, 1) for val in checksum]), 'weights did not break evenly'
    return idx2_wxs, idx2_maws


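# Illustrative sketch (not part of the original module): the soft weighting
# above keeps an assignment only if its distance is under
# massign_alpha * (distance to the nearest word), then weights the survivors
# with exp(-dist / (2 * sigma ** 2)) normalized to sum to one. The
# hypothetical helper below runs that arithmetic on one descriptor's distances.
def _demo_multiassign_weights(massign_alpha=1.2, massign_sigma=80.0):
    """ toy soft multi-assignment weights for a single descriptor """
    # squared distances to the 3 nearest words
    wdist = np.array([5000.0, 5500.0, 9000.0])
    thresh = massign_alpha * (wdist[0] + .001)
    valid = wdist < thresh
    unnorm = np.exp(-wdist[valid] / (2 * massign_sigma ** 2))
    maws = unnorm / unnorm.sum()
    # the third word fails the ratio test; the first two share the weight
    return valid, maws

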
#@ut.cached_func('smk_idf', appname='smk', key_argx=[1, 2, 3], key_kwds=['daid2_label'])
@profile
def compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids, daid2_label=None,
                      vocab_weighting='idf', verbose=False):
    """
    Computes the inverse-document-frequency weighting for each word

    Args:
        wx_series (ndarray): word indexes to compute weights for
        wx2_idxs (dict): maps word index to descriptor indexes
        idx2_aid (ndarray): maps descriptor index to annotation id
        daids (list): database annotation ids
        daid2_label (dict): maps annotation id to label (needed for negentropy weighting)
        vocab_weighting (str): either 'idf' or 'negentropy'

    Returns:
        wx2_idf

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> wx2_idf = compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids)
        >>> result = str(len(wx2_idf))
        >>> print(result)
        8000

    Ignore:
        #>>> wx2_idxs = invindex.wx2_idxs

    Auto:
        from ibeis.algo.hots.smk import smk_index
        import utool as ut; print(ut.make_default_docstr(smk_index.compute_word_idf_))
    """
    if not ut.QUIET:
        print('[smk_index.idf] +--- Start Compute IDF')
    if ut.VERBOSE or verbose:
        print('[smk_index.idf] Word IDFs: ')
    idxs_list, aids_list = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
    # TODO: Integrate different idf measures
    if vocab_weighting == 'idf':
        idf_list = compute_idf_orig(aids_list, daids)
    elif vocab_weighting == 'negentropy':
        assert daid2_label is not None
        idf_list = compute_idf_label1(aids_list, daid2_label)
    else:
        raise AssertionError('unknown option vocab_weighting=%r' % vocab_weighting)
    if ut.VERBOSE or verbose:
        print('[smk_index.idf] L___ End Compute IDF')
    wx2_idf = dict(zip(wx_series, idf_list))
    return wx2_idf


@profile
def helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series):
    """ helper function """
    # idxs for each word
    idxs_list = [wx2_idxs[wx].astype(hstypes.INDEX_TYPE)
                 if wx in wx2_idxs else
                 np.empty(0, dtype=hstypes.INDEX_TYPE)
                 for wx in wx_series]
    # aids for each word
    aids_list = [idx2_aid.take(idxs)
                 if len(idxs) > 0 else
                 np.empty(0, dtype=hstypes.INDEX_TYPE)
                 for idxs in idxs_list]
    return idxs_list, aids_list


@profile
def compute_idf_orig(aids_list, daids):
    """
    The standard tried and true idf measure
    """
    nTotalDocs = len(daids)
    # idf denominator
    nDocsWithWord_list = np.array([len(set(aids)) for aids in aids_list])
    # Typically for IDF, 1 is added to the denominator to prevent divide by 0
    # compute idf half of sccw-idf weighting
    idf_list = np.log(np.divide(nTotalDocs, np.add(nDocsWithWord_list, 1),
                                dtype=hstypes.FLOAT_TYPE),
                      dtype=hstypes.FLOAT_TYPE)
    return idf_list


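# Illustrative sketch (not part of the original module): the idf weighting
# above reduces to idf_w = log(nTotalDocs / (nDocsWithWord_w + 1)). The
# hypothetical helper below reproduces that arithmetic with plain numpy so the
# behavior can be checked on toy counts without an inverted index.
def _demo_idf_orig_formula():
    """ toy check of the idf formula used by compute_idf_orig """
    nTotalDocs = 10
    # number of database annotations containing each of 4 words
    nDocsWithWord = np.array([0, 1, 5, 9])
    idf = np.log(nTotalDocs / (nDocsWithWord + 1.0))
    # rare words get large weights, ubiquitous words get weights near zero
    return idf  # approx [2.30, 1.61, 0.51, 0.0]

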
@profile
def compute_negentropy_names(aids_list, daid2_label):
    r"""
    One of our idf extensions

    Word weighting based on the negative entropy over all names of p(n_i | word)

    Args:
        aids_list (list of aids):
        daid2_label (dict from daid to label):

    Returns:
        negentropy_list (ndarray[float32]): idf-like weighting for each word
            based on the negative entropy

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _

    Math::
        p(n_i | \word) = \sum_{\lbl \in L_i} p(\lbl | \word)

        p(\lbl | \word) = \frac{p(\word | \lbl) p(\lbl)}{p(\word)}

        p(\word) = \sum_{\lbl' \in L} p(\word | \lbl') p(\lbl')

        p(\word | \lbl) = NumAnnotOfLabelWithWord / NumAnnotWithLabel
                        = \frac{\sum_{\X \in \DB_\lbl} b(\word, \X)}{\card{\DB_\lbl}}

        h(n_i | word) = -\sum_{i=1}^N p(n_i | \word) \log p(n_i | \word)

        word_weight = log(N) - h(n | word)

    CommandLine:
        python dev.py -t smk2 --allgt --db GZ_ALL
        python dev.py -t smk5 --allgt --db GZ_ALL

    Auto:
        python -c "import utool as ut; ut.print_auto_docstr('ibeis.algo.hots.smk.smk_index', 'compute_negentropy_names')"
    """
    nWords = len(aids_list)
    # --- LABEL MEMBERS w.r.t daids ---
    # compute mapping from label to daids
    # Translate tuples into scalars for efficiency
    label_list = list(daid2_label.values())
    lblindex_list = np.array(ut.tuples_to_unique_scalars(label_list))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]
    # --- DAID MEMBERS w.r.t. words ---
    # compute mapping from daid to word indexes
    # finds all the words that belong to an annotation
    daid2_wxs = ut.ddict(list)
    for wx, _daids in enumerate(aids_list):
        for daid in _daids:
            daid2_wxs[daid].append(wx)
    # --- \Pr(\word \given \lbl) for each label ---
    # Compute the number of annotations in a label with the word vs
    # the number of annotations in the label
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    # Get num times word appears for each label
    probWordGivenLabel_list = []
    for lblindex, _daids in lblindex2_daids:
        nAnnotOfLabelWithWord = np.zeros(nWords, dtype=np.int32)
        for daid in _daids:
            wxs = np.unique(daid2_wxs[daid])
            nAnnotOfLabelWithWord[wxs] += 1
        probWordGivenLabel = nAnnotOfLabelWithWord.astype(np.float64) / len(_daids)
        probWordGivenLabel_list.append(probWordGivenLabel)
    # (nLabels, nWords)
    probWordGivenLabel_arr = np.array(probWordGivenLabel_list)
    # --- \Pr(\lbl \given \word) ---
    # compute partition function that approximates probability of a word
    # (1, nWords)
    probWord = probWordGivenLabel_arr.sum(axis=0)
    probWord.shape = (1, probWord.size)
    # (nLabels, nWords)
    probLabelGivenWord_arr = (probWordGivenLabel_arr / probWord)
    # --- \Pr(\name \given \lbl) ---
    # get names for each unique label
    nid_list = np.array([label_list[xs[0]][0] for xs in groupxs])
    unique_nids, groupxs_ = clustertool.group_indices(nid_list)
    # (nNames, nWords)
    # add a little wiggle room
    eps = 1E-9
    # http://stackoverflow.com/questions/872544/precision-of-floating-point
    #epsilon = 2^(E-52)  % For a 64-bit float (double precision)
    #epsilon = 2^(E-23)  % For a 32-bit float (single precision)
    #epsilon = 2^(E-10)  % For a 16-bit float (half precision)
    probNameGivenWord = eps + (1.0 - eps) * np.array(
        [probLabelGivenWord_arr.take(xs, axis=0).sum(axis=0) for xs in groupxs_])
    logProbNameGivenWord = np.log(probNameGivenWord)
    wordNameEntropy = -(probNameGivenWord * logProbNameGivenWord).sum(0)
    # Compute negative entropy for weights
    nNames = len(nid_list)
    negentropy_list = np.log(nNames) - wordNameEntropy
    return negentropy_list


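# Illustrative sketch (not part of the original module): the negentropy
# weighting above assigns word_weight = log(N) - h(n | word), so a word seen
# under only one name gets the maximum weight log(N) while a word spread
# evenly over all names gets a weight near zero. The hypothetical helper below
# checks that on a toy p(name | word) table.
def _demo_negentropy_weighting():
    """ toy check of word_weight = log(N) - h(n | word) """
    # p(name | word) for 3 names and 2 words (columns sum to 1)
    probNameGivenWord = np.array([[1.0, 1.0 / 3.0],
                                  [0.0, 1.0 / 3.0],
                                  [0.0, 1.0 / 3.0]])
    eps = 1E-9
    probNameGivenWord = eps + (1.0 - eps) * probNameGivenWord
    entropy = -(probNameGivenWord * np.log(probNameGivenWord)).sum(axis=0)
    nNames = probNameGivenWord.shape[0]
    word_weight = np.log(nNames) - entropy
    # first word is concentrated on one name -> weight ~ log(3)
    # second word is uniform over names -> weight ~ 0
    return word_weight

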
@profile
def compute_idf_label1(aids_list, daid2_label):
    """
    One of our idf extensions

    Example:
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> wx_series = np.arange(len(invindex.words))
        >>> idx2_aid = invindex.idx2_daid
        >>> daid2_label = invindex.daid2_label
        >>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
        >>> idxs_list, aids_list = _
        >>> idf_list = compute_idf_label1(aids_list, daid2_label)
    """
    nWords = len(aids_list)
    # Computes our novel label idf weight
    lblindex_list = np.array(ut.tuples_to_unique_scalars(list(daid2_label.values())))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]
    daid2_wxs = ut.ddict(list)
    for wx, daids in enumerate(aids_list):
        for daid in daids:
            daid2_wxs[daid].append(wx)
    lblindex2_daids = list(zip(unique_lblindexes, daids_list))
    nLabels = len(unique_lblindexes)
    pcntLblsWithWord = np.zeros(nWords, np.float64)
    # Get num times word appears for each label
    for lblindex, daids in lblindex2_daids:
        nWordsWithLabel = np.zeros(nWords)
        for daid in daids:
            wxs = daid2_wxs[daid]
            nWordsWithLabel[wxs] += 1
        pcntLblsWithWord += (1 - nWordsWithLabel.astype(np.float64) / len(daids))
    # Labels for each word
    idf_list = np.log(np.divide(nLabels, np.add(pcntLblsWithWord, 1),
                                dtype=hstypes.FLOAT_TYPE),
                      dtype=hstypes.FLOAT_TYPE)
    return idf_list


#@ut.cached_func('smk_rvecs_', appname='smk')
@profile
def compute_residuals_(words, wx2_idxs, wx2_maws, idx2_vec, idx2_aid, idx2_fx,
                       aggregate, verbose=False):
    """
    Computes residual vectors based on word assignments
    returns mapping from word index to a set of residual vectors

    Args:
        words (ndarray): visual word centroids
        wx2_idxs (dict): maps word index to descriptor indexes
        wx2_maws (dict): maps word index to multi-assign weights
        idx2_vec (ndarray): maps descriptor index to raw descriptor
        idx2_aid (ndarray): maps descriptor index to annotation id
        idx2_fx (ndarray): maps descriptor index to feature index
        aggregate (bool): aggregate residuals per word and annotation
        verbose (bool):

    Returns:
        tuple: (wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags) formatted as::

            * wx2_rvecs - [ ... [ rvec_i1, ..., rvec_Mi ]_i ... ]
            * wx2_aids  - [ ... [  aid_i1, ...,  aid_Mi ]_i ... ]
            * wx2_fxs   - [ ... [[fxs]_i1, ..., [fxs]_Mi ]_i ... ]

        For every word::

            * list of aggvecs
            * For every aggvec:
                * one parent aid, if aggregate is False: assert isunique(aids)
                * list of parent fxs, if aggregate is True: assert len(fxs) == 1

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
        >>> words = invindex.words
        >>> idx2_aid = invindex.idx2_daid
        >>> idx2_fx = invindex.idx2_dfx
        >>> idx2_vec = invindex.idx2_dvec
        >>> aggregate = ibs.cfg.query_cfg.smk_cfg.aggregate
        >>> wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags = compute_residuals_(words, wx2_idxs, wx2_maws, idx2_vec, idx2_aid, idx2_fx, aggregate)
    """
    if not ut.QUIET:
        print('[smk_index.rvec] +--- Start Compute Residuals')
    wx_sublist = np.array(list(wx2_idxs.keys()))
    # Build lists w.r.t. words
    idxs_list = [wx2_idxs[wx].astype(hstypes.INDEX_TYPE) for wx in wx_sublist]
    aids_list = [idx2_aid.take(idxs) for idxs in idxs_list]
    if ut.DEBUG2:
        #assert np.all(np.diff(wx_sublist) == 1), 'not dense'
        assert all([len(a) == len(b) for a, b in zip(idxs_list, aids_list)]), 'bad alignment'
        assert idx2_vec.shape[0] == idx2_fx.shape[0]
        assert idx2_vec.shape[0] == idx2_aid.shape[0]
    # Prealloc output
    if ut.VERBOSE or verbose:
        lbl = '[smk_index.rvec] agg rvecs' if aggregate else '[smk_index.rvec] nonagg rvecs'
        print(lbl)
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2_idxs(wx2_idxs, len(words))
    # Compute Residuals
    rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)
    if ut.VERBOSE:
        print('Computed size(rvecs_list) = %r' % ut.get_object_size_str(rvecs_list))
        print('Computed size(flags_list) = %r' % ut.get_object_size_str(flags_list))
    if aggregate:
        maws_list = [wx2_maws[wx] for wx in wx_sublist]
        # Aggregate Residuals
        tup = smk_residuals.compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list)
        (aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list) = tup
        # Pack into common query structure
        aggfxs_list = [[idx2_fx.take(idxs) for idxs in aggidxs] for aggidxs in aggidxs_list]
        wx2_aggvecs = dict(zip(wx_sublist, aggvecs_list))
        wx2_aggaids = dict(zip(wx_sublist, aggaids_list))
        wx2_aggfxs = dict(zip(wx_sublist, aggfxs_list))
        wx2_aggmaws = dict(zip(wx_sublist, aggmaws_list))
        wx2_aggflags = dict(zip(wx_sublist, aggflags_list))
        (wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags) = (
            wx2_aggvecs, wx2_aggaids, wx2_aggfxs, wx2_aggmaws, wx2_aggflags)
    else:
        # Hack non-aggregate residuals to have the same structure as aggregate
        # residuals for compatibility: i.e. each rvec gets a list of fxs that
        # contributed to it, and for SMK this is a list of size 1
        fxs_list = [[idx2_fx[idx:idx + 1] for idx in idxs] for idxs in idxs_list]
        wx2_rvecs = dict(zip(wx_sublist, rvecs_list))
        wx2_aids = dict(zip(wx_sublist, aids_list))
        wx2_fxs = dict(zip(wx_sublist, fxs_list))
        wx2_flags = dict(zip(wx_sublist, flags_list))
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(words, wx2_rvecs, wx2_aids, wx2_fxs)
    if ut.VERBOSE or verbose:
        print('[smk_index.rvec] L___ End Compute Residuals')
    return wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags


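# Illustrative sketch (not part of the original module): a residual vector is
# the L2-normalized difference between a word (cluster center) and a
# descriptor assigned to it; aggregation sums one annotation's residuals
# inside one word and renormalizes. The hypothetical helper below mirrors that
# idea with plain numpy (the real computation lives in smk_residuals and uses
# compressed int8 storage).
def _demo_residual_aggregation():
    """ toy residual computation and aggregation for a single word """
    word = np.array([10.0, 0.0])
    descriptors = np.array([[12.0, 1.0],
                            [11.0, -1.0]])
    # per-descriptor residuals, L2 normalized
    rvecs = word[None, :] - descriptors
    rvecs = rvecs / np.linalg.norm(rvecs, axis=1, keepdims=True)
    # aggregate residual for the annotation: sum then renormalize
    aggvec = rvecs.sum(axis=0)
    aggvec = aggvec / np.linalg.norm(aggvec)
    return rvecs, aggvec

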
#@ut.cached_func('sccw', appname='smk', key_argx=[1, 2])
@profile
def compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf,
                       wx2_dmaws, smk_alpha, smk_thresh, verbose=False):
    """
    Computes the sccw normalization scalar for the database annotations.
    This is gamma from the SMK paper.
    sccw is a self consistency criterion weight --- a scalar which ensures
    the score of K(X, X) = 1

    Args:
        idx2_daid (ndarray): maps descriptor index to database annotation id
        wx2_drvecs (dict): maps word index to database residual vectors
        wx2_dflags (dict): maps word index to residual validity flags
        wx2_aids (dict): maps word index to annotation ids
        wx2_idf (dict): maps word index to idf weight
        wx2_dmaws (dict): maps word index to multi-assign weights
        smk_alpha (float): selectivity power
        smk_thresh (float): selectivity threshold

    Returns:
        daid2_sccw

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_index import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_index
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> #tup = smk_debug.testdata_compute_data_sccw(db='testdb1')
        >>> tup = smk_debug.testdata_compute_data_sccw(db='PZ_MTEST')
        >>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_drvecs, wx2_aids, qparams = tup
        >>> wx2_dflags = invindex.wx2_dflags
        >>> ws2_idxs = invindex.wx2_idxs
        >>> wx2_dmaws = invindex.wx2_dmaws
        >>> idx2_daid = invindex.idx2_daid
        >>> daids = invindex.daids
        >>> smk_alpha = qparams.smk_alpha
        >>> smk_thresh = qparams.smk_thresh
        >>> wx2_idf = wx2_idf
        >>> verbose = True
        >>> invindex.invindex_dbgstr()
        >>> invindex.report_memory()
        >>> invindex.report_memsize()
        >>> daid2_sccw = smk_index.compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf, wx2_dmaws, smk_alpha, smk_thresh, verbose)
    """
    #for wx in wx_sublist:
    #    print(len(wx2_dmaws
    verbose_ = ut.VERBOSE or verbose
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
    if not ut.QUIET:
        print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
    if verbose_:
        print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))

    # Group by daids first and then by word index
    # Get list of aids and rvecs w.r.t. words (ie one item per word)
    wx_sublist = np.array(list(wx2_drvecs.keys()))
    aids_perword = [wx2_aids[wx] for wx in wx_sublist]

    # wx_list1: Lays out word indexes for each annotation
    # tx_list1: Temporary within annotation subindex + wx uniquely identifies
    # item in wx2_drvecs, wx2_dflags, and wx2_dmaws

    # Flatten out indexes to perform grouping
    flat_aids = np.hstack(aids_perword)
    count = len(flat_aids)
    txs_perword = [np.arange(aids.size) for aids in aids_perword]
    flat_txs = np.hstack(txs_perword)
    # fromiter is faster for flat_wxs because is not a list of numpy arrays
    wxs_perword = ([wx] * len(aids) for wx, aids in zip(wx_sublist, aids_perword))
    flat_wxs = np.fromiter(ut.iflatten(wxs_perword), hstypes.INDEX_TYPE, count)

    # Group flat indexes by annotation id
    unique_aids, annot_groupxs = clustertool.group_indices(flat_aids)

    # Wxs and Txs grouped by annotation id
    wxs_perannot = clustertool.apply_grouping_iter(flat_wxs, annot_groupxs)
    txs_perannot = clustertool.apply_grouping_iter(flat_txs, annot_groupxs)

    # Group by word inside each annotation group
    wxsubgrouping_perannot = [clustertool.group_indices(wxs)
                              for wxs in wxs_perannot]
    word_groupxs_perannot = (groupxs for wxs, groupxs in wxsubgrouping_perannot)
    txs_perword_perannot = [clustertool.apply_grouping(txs, groupxs)
                            for txs, groupxs in
                            zip(txs_perannot, word_groupxs_perannot)]
    wxs_perword_perannot = [wxs for wxs, groupxs in wxsubgrouping_perannot]

    # Group relevant data for sccw measure by word for each annotation grouping
    def _vector_subgroup_by_wx(wx2_arr, wxs_perword_perannot, txs_perword_perannot):
        return [[wx2_arr[wx].take(txs, axis=0)
                 for wx, txs in zip(wx_perword_, txs_perword_)]
                for wx_perword_, txs_perword_ in
                zip(wxs_perword_perannot, txs_perword_perannot)]

    def _scalar_subgroup_by_wx(wx2_scalar, wxs_perword_perannot):
        return [[wx2_scalar[wx] for wx in wxs] for wxs in wxs_perword_perannot]

    subgrouped_drvecs = _vector_subgroup_by_wx(wx2_drvecs, wxs_perword_perannot, txs_perword_perannot)
    subgrouped_dmaws = _vector_subgroup_by_wx(wx2_dmaws, wxs_perword_perannot, txs_perword_perannot)
    # If we aren't using dmaws replace it with an infinite None iterator
    #subgrouped_dmaws = iter(lambda: None, 1)
    subgrouped_dflags = _vector_subgroup_by_wx(wx2_dflags, wxs_perword_perannot, txs_perword_perannot)
    #subgrouped_dflags = iter(lambda: None, 1)
    subgrouped_idfs = _scalar_subgroup_by_wx(wx2_idf, wxs_perword_perannot)

    if verbose_:
        progiter = ut.ProgressIter(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
                                   total=len(unique_aids), freq=10,
                                   with_time=WITH_TOTALTIME)
    else:
        progiter = ut.identity

    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.check_data_smksumm(subgrouped_idfs, subgrouped_drvecs)

    sccw_list = [
        smk_scoring.sccw_summation(rvecs_list, flags_list, idf_list, maws_list,
                                   smk_alpha, smk_thresh)
        for rvecs_list, flags_list, maws_list, idf_list in
        progiter(zip(subgrouped_drvecs, subgrouped_dflags, subgrouped_dmaws,
                     subgrouped_idfs))
    ]
    daid2_sccw = dict(zip(unique_aids, sccw_list))

    if verbose_:
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')
    return daid2_sccw


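# Illustrative sketch (not part of the original module): the sccw (gamma)
# scalar is the normalizer that makes an annotation's self-score equal one,
# i.e. sccw = 1 / sqrt(sum_w idf_w * SelfMatch_w). The hypothetical helper
# below demonstrates that normalization on toy per-word self-similarity
# scores rather than calling smk_scoring.sccw_summation.
def _demo_sccw_normalization():
    """ toy self-consistency-criterion weight """
    # per-word selectivity-weighted self similarities and idf weights
    self_scores = np.array([0.9, 0.4, 0.7])
    idf_weights = np.array([1.5, 2.0, 0.5])
    sccw = 1.0 / np.sqrt((idf_weights * self_scores).sum())
    # applying sccw once per annotation makes the self-score K(X, X) == 1
    assert np.isclose((sccw ** 2) * (idf_weights * self_scores).sum(), 1.0)
    return sccw

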
@profile
def OLD_compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_aids, wx2_idf, wx2_dmaws,
                           smk_alpha, smk_thresh, verbose=False):
    """
    Deprecated predecessor of compute_data_sccw_
    """
    if ut.DEBUG2:
        from ibeis.algo.hots.smk import smk_debug
        smk_debug.rrr()
        smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)

    with ut.Timer('timer_orig1'):
        wx_sublist = np.array(list(wx2_drvecs.keys()))
        if not ut.QUIET:
            print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
        if ut.VERBOSE or verbose:
            print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))
        # Get list of aids and rvecs w.r.t. words
        aids_list = [wx2_aids[wx] for wx in wx_sublist]
        rvecs_list1 = [wx2_drvecs[wx] for wx in wx_sublist]
        maws_list = [wx2_dmaws[wx] for wx in wx_sublist]
        if ut.DEBUG2:
            from ibeis.algo.hots.smk import smk_debug
            smk_debug.assert_single_assigned_maws(maws_list)
        # Group by daids first and then by word index
        daid2_wx2_drvecs = clustertool.double_group(wx_sublist, aids_list, rvecs_list1)
        # For every daid, compute its sccw using pregrouped rvecs
        # Summation over words for each aid
        if ut.VERBOSE or verbose:
            print('[smk_index.sccw] SCCW Sum (over daid): ')
        # Get lists w.r.t daids
        aid_list = list(daid2_wx2_drvecs.keys())
        # list of mappings from words to rvecs foreach daid
        # [wx2_aidrvecs_1, ..., wx2_aidrvecs_nDaids,]
        _wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
        _aidwxs_iter = (list(wx2_aidrvecs.keys()) for wx2_aidrvecs in _wx2_aidrvecs_list)
        aidrvecs_list = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in _wx2_aidrvecs_list]
        aididf_list = [[wx2_idf[wx] for wx in aidwxs] for aidwxs in _aidwxs_iter]

    with ut.Timer('timer_orig2'):
        if ut.DEBUG2:
            from ibeis.algo.hots.smk import smk_debug
            smk_debug.check_data_smksumm(aididf_list, aidrvecs_list)
        # TODO: implement database side soft-assign
        sccw_list = [smk_scoring.sccw_summation(rvecs_list, None, idf_list, None, smk_alpha, smk_thresh)
                     for idf_list, rvecs_list in zip(aididf_list, aidrvecs_list)]
        daid2_sccw = dict(zip(aid_list, sccw_list))

    if ut.VERBOSE or verbose:
        print('[smk_index.sccw] L___ End Compute Data SCCW\n')
    return daid2_sccw