# Source code for ibeis.algo.hots.smk.smk_residuals

# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import utool as ut
import numpy as np
import vtool as vt
from vtool import clustering2 as clustertool
from ibeis.algo.hots import hstypes
(print, rrr, profile) = ut.inject2(__name__, '[smk_residuals]')


#@ut.cached_func('nonagg_rvecs', appname='smk_cachedir', key_argx=[1, 3, 4])
@profile
def compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list):
    """
    Driver function for nonagg residual computation

    For every word of interest, gathers the stacked vectors assigned to that
    word and computes their normalized, compressed residuals with respect to
    the word (see get_norm_residuals). Residual rows that come out as all
    zeros are flagged.

    Args:
        words (ndarray): array of words, one word per row
        idx2_vec (ndarray): stacked vectors; rows are gathered with
            ``idx2_vec.take(idxs, axis=0)``
        wx_sublist (list): indexes (into ``words``) of the words of interest
        idxs_list (list): list of idxs (rows of ``idx2_vec``) grouped by
            wx_sublist

    Returns:
        tuple : (rvecs_list, flags_list)
            rvecs_list - one array of residual vectors per entry of wx_sublist
            flags_list - one boolean array per entry of wx_sublist; True
                where the corresponding residual row is all zeros

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-compute_nonagg_rvecs:0
        python -m ibeis.algo.hots.smk.smk_residuals --test-compute_nonagg_rvecs:1

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> from ibeis.algo.hots.smk import smk_residuals
        >>> words, wx_sublist, aids_list, idxs_list, idx2_vec, maws_list = smk_debug.testdata_nonagg_rvec()
        >>> rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)
        >>> print('Computed size(rvecs_list) = %r' % ut.get_object_size_str(rvecs_list))
        >>> print('Computed size(flags_list) = %r' % ut.get_object_size_str(flags_list))

    Example2:
        >>> # ENABLE_DOCTEST
        >>> # The case where vecs == words
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> rng = np.random.RandomState(0)
        >>> vecs = (hstypes.VEC_MAX * rng.rand(4, 128)).astype(hstypes.VEC_TYPE)
        >>> word = vecs[1]
        >>> words = word.reshape(1, 128)
        >>> idx2_vec = vecs
        >>> idxs_list = [np.arange(len(vecs), dtype=np.int32)]
        >>> wx_sublist = [0]
        >>> rvecs_list, flags_list = compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)
        >>> rvecs = rvecs_list[0]
        >>> flags = flags_list[0]
        >>> maws  = (np.ones(rvecs.shape[0])).astype(hstypes.FLOAT_TYPE)
        >>> maws_list = np.array([maws])
        >>> aids_list = np.array([np.arange(len(vecs))])

    Timeit:
        %timeit [~np.any(vecs, axis=1) for vecs in vecs_list]
        %timeit [vecs.sum(axis=1) == 0 for vecs in vecs_list]

    """
    # Pick out corresponding lists of words and assigned vectors.
    # The slice wx:wx + 1 keeps each word 2D (a single row) so it broadcasts
    # against the rows of vecs.
    words_list = [words[wx:wx + 1] for wx in wx_sublist]
    vecs_list  = [idx2_vec.take(idxs, axis=0) for idxs in idxs_list]
    # Compute nonaggregated normalized residuals
    rvecs_list = [get_norm_residuals(vecs, word)
                  for vecs, word in zip(vecs_list, words_list)]
    # Flag the rvecs which are all zeros (one boolean per residual row)
    flags_list = [~np.any(rvecs, axis=1) for rvecs in rvecs_list]
    return rvecs_list, flags_list


@profile
def compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list):
    """
    Driver function for agg residual computation

    Sums and normalizes all rvecs that belong to the same word and the same
    annotation id

    Args:
        rvecs_list (list): residual vectors grouped by word
        idxs_list (list): stacked descriptor indexes grouped by word
        aids_list (list): annotation rowid for each stacked descriptor index
        maws_list (list): multi assign weights

    Returns:
        tuple : (aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list)
            aggvecs_list - per word, the stacked aggregated residuals (one
                row per annotation group)
            aggaids_list - per word, the first item returned by
                ``clustertool.group_indices`` for that word's aids
            aggidxs_list - per word, a list with the descriptor indexes that
                were collapsed into each aggregated residual
            aggmaws_list - per word, the product of the multi assign weights
                of each group
            aggflags_list - per word, a boolean array that is True where the
                aggregated residual row is all zeros

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-compute_agg_rvecs

    Example:
        >>> # SLOW_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> from ibeis.algo.hots.smk import smk_residuals
        >>> words, wx_sublist, aids_list, idxs_list, idx2_vec, maws_list = smk_debug.testdata_nonagg_rvec()
        >>> rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)
        >>> tup = compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list)
        >>> aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list = tup
        >>> ut.assert_eq(len(wx_sublist), len(rvecs_list))

    """
    #assert len(idxs_list) == len(rvecs_list)
    # group members of each word by aid, we will collapse these groups
    # (each result is used as a pair: [0] -> aggregated aids, [1] -> groups
    # of positions within the word; presumably unique aids and their member
    # positions - confirm against vtool.clustering2.group_indices)
    grouptup_list = [clustertool.group_indices(aids) for aids in aids_list]
    # Agg aids
    aggaids_list = [tup[0] for tup in grouptup_list]
    groupxs_list = [tup[1] for tup in grouptup_list]
    # Aggregate vecs that belong to the same aid, for each word
    # (weighted aggregation with multi-assign-weights)
    # NOTE(review): the empty-word placeholder is FLOAT_TYPE with VEC_DIM
    # columns, while non-empty results carry whatever dtype aggregate_rvecs
    # returns - confirm downstream code does not depend on a uniform dtype.
    aggvecs_list = [
        np.vstack([aggregate_rvecs(rvecs.take(xs, axis=0), maws.take(xs)) for xs in groupxs])
        if len(groupxs) > 0 else
        np.empty((0, hstypes.VEC_DIM), dtype=hstypes.FLOAT_TYPE)
        for rvecs, maws, groupxs in zip(rvecs_list, maws_list, groupxs_list)]
    # Agg idxs: remember which stacked descriptor indexes went into each group
    aggidxs_list = [[idxs.take(xs) for xs in groupxs]
                    for idxs, groupxs in zip(idxs_list, groupxs_list)]
    # Agg maws: one weight per group, the product of its members' weights
    aggmaws_list = [np.array([maws.take(xs).prod() for xs in groupxs])
                    for maws, groupxs in zip(maws_list, groupxs_list)]
    # Need to recompute flags for consistency
    # flag is true when aggvec is all zeros
    aggflags_list = [~np.any(aggvecs, axis=1) for aggvecs in aggvecs_list]
    return aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list


@profile
def compress_normvec_float16(arr_float):
    """
    CURRENTLY THIS IS NOT USED. WE ARE WORKING WITH INT8 INSTEAD

    compresses 8 or 4 bytes of information into 2 bytes
    Assumes RVEC_TYPE is float16

    Args:
        arr_float (ndarray): float array to compress (the doctest passes
            row-normalized residual vectors)

    Returns:
        ndarray[dtype=np.float16]: a new array holding ``arr_float`` cast to
            half precision

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-compress_normvec_float16

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> rng = np.random.RandomState(0)
        >>> arr_float = smk_debug.get_test_float_norm_rvecs(2, 5, rng=rng)
        >>> vt.normalize_rows(arr_float, out=arr_float)
        >>> arr_float16 = compress_normvec_float16(arr_float)
        >>> result = ut.numpy_str(arr_float16, precision=4)
        >>> print(result)
        np.array([[ 0.4941,  0.1121,  0.2742,  0.6279,  0.5234],
                  [-0.6812,  0.6621, -0.1055, -0.0719,  0.2861]], dtype=np.float16)
    """
    # A plain dtype cast; NaN entries (if any) stay NaN in float16
    return arr_float.astype(np.float16)


@profile
def compress_normvec_uint8(arr_float):
    """
    compresses 8 or 4 bytes of information into 1 byte
    Assumes RVEC_TYPE is int8 (despite the function name, the output is
    signed int8, not uint8)

    Takes normalized float vectors in range -1 to 1 with l2norm=1 and
    compresses them into 1 byte per component. Components are scaled by 255
    (about twice the int8 maximum) and then clipped to [-127, 127]. This
    takes advantage of the fact that a component of a unit vector rarely has
    a magnitude above ~0.5, so doubling the scale halves the quantization
    step for the common case. Components above ~0.5 saturate, which means
    there is a slight (but hopefully negligible) information loss. It will be
    negligible when nDims=128, when it is lower, you may want to use a
    different function.

    NaN components (e.g. from normalizing an all-zero residual) are mapped to
    0 explicitly, instead of relying on the result of a float-NaN to integer
    cast.

    Args:
        arr_float (ndarray): normalized residual vector of type float in range -1 to 1 (with l2 norm of 1)

    Returns:
        (ndarray): residual vector of type int8 in range -127 to 127

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-compress_normvec_uint8

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> from ibeis.algo.hots.smk import smk_debug
        >>> rng = np.random.RandomState(0)
        >>> arr_float = smk_debug.get_test_float_norm_rvecs(2, 5, rng=rng)
        >>> vt.normalize_rows(arr_float, out=arr_float)
        >>> arr_int8 = compress_normvec_uint8(arr_float)
        >>> result = arr_int8
        >>> print(result)
        [[ 126   29   70  127  127]
         [-127  127  -27  -18   73]]
    """
    # Trick / hack: scale by roughly 2 * max (pseudo max) and clip, because
    # most components will be less than half of the full range. This reduces
    # quantization error.
    # TODO: not sure if rounding or floor is the correct operation
    arr_clipped = np.clip(np.round(arr_float * 255.0), -127, 127)
    # Zero-fill NaNs before the integer cast: casting NaN to an integer type
    # is not well defined, and callers detect null residuals as all-zero rows.
    # (infinities were already saturated by the clip above)
    return np.nan_to_num(arr_clipped).astype(np.int8)


# Choose appropriate compression function based on the RVEC_TYPE and bind it
# to the module-level name `compress_normvec`, which the residual functions
# in this module call.
# (per the original author's note, RVEC_TYPE is currently np.int8)
# This runs at import time: an unsupported RVEC_TYPE makes the import fail.
if hstypes.RVEC_TYPE == np.float16:
    compress_normvec = compress_normvec_float16
elif hstypes.RVEC_TYPE == np.int8:
    compress_normvec = compress_normvec_uint8
else:
    raise AssertionError('unsupported RVEC_TYPE = %r' % hstypes.RVEC_TYPE)


@profile
def aggregate_rvecs(rvecs, maws):
    r"""
    helper for compute_agg_rvecs

    Collapses a group of residual vectors into a single residual: each row
    is scaled by its multi assign weight, the rows are summed, and the sum is
    normalized with ``vt.normalize_rows`` and compressed with
    ``compress_normvec``.

    Args:
        rvecs (ndarray): residual vectors (one per row)
        maws (ndarray): multi assign weights (one per row of rvecs)

    Returns:
        rvecs_agg : aggregated residual vectors (a single row). When rvecs
            has exactly one row, the input array itself is returned
            unchanged and the weight is not applied.

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-aggregate_rvecs
        ./run_tests.py --exclude-doctest-patterns pipeline neighbor score coverage automated_helpers name automatch chip_match multi_index automated special_query scoring automated nn_weights distinctive match_chips4 query_request devcases hstypes params ibsfuncs smk_core, smk_debug control

    Example:
        >>> # ENABLE_DOCTEST
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> rng = np.random.RandomState(0)
        >>> rvecs = (hstypes.RVEC_MAX * rng.rand(4, 128)).astype(hstypes.RVEC_TYPE)
        >>> maws  = (rng.rand(rvecs.shape[0])).astype(hstypes.FLOAT_TYPE)
        >>> rvecs_agg = aggregate_rvecs(rvecs, maws)
        >>> result = ut.numpy_str2(rvecs_agg, linewidth=70)
        >>> print(result)
        np.array([[28, 27, 32, 16, 16, 16, 12, 31, 27, 29, 19, 27, 21, 24, 15,
                   21, 17, 37, 13, 40, 38, 33, 17, 30, 13, 23,  9, 25, 19, 15,
                   20, 17, 19, 18, 13, 25, 37, 29, 21, 16, 20, 21, 34, 11, 28,
                   19, 17, 12, 14, 24, 21, 11, 27, 11, 24, 10, 23, 20, 28, 12,
                   16, 14, 30, 22, 18, 26, 21, 20, 18,  9, 29, 20, 25, 19, 23,
                   20,  7, 13, 22, 22, 15, 20, 22, 16, 27, 10, 16, 20, 25, 25,
                   26, 28, 22, 38, 24, 16, 14, 19, 24, 14, 22, 19, 19, 33, 21,
                   22, 18, 22, 25, 25, 22, 23, 32, 16, 25, 15, 29, 21, 25, 20,
                   22, 31, 29, 24, 24, 25, 20, 14]], dtype=np.int8)

    """
    if rvecs.shape[0] == 1:
        # Nothing to aggregate: hand back the single residual as-is
        # (same object, no copy, weight ignored)
        return rvecs
    # Prealloc sum output (do not assign the result of sum)
    arr_float = np.empty((1, rvecs.shape[1]), dtype=hstypes.FLOAT_TYPE)
    # Take the weighted sum of the multi-assigned vectors, written directly
    # into the single preallocated row
    (maws[:, np.newaxis] * rvecs.astype(hstypes.FLOAT_TYPE)).sum(axis=0, out=arr_float[0])
    # Jegou uses mean instead. Sum should be fine because we normalize
    vt.normalize_rows(arr_float, out=arr_float)
    # Converts the normalized vector to the compact residual type
    rvecs_agg = compress_normvec(arr_float)
    return rvecs_agg


@profile
def get_norm_residuals(vecs, word):
    """
    computes normalized residuals of vectors with respect to a word

    The residual is ``word - vecs`` (computed in FLOAT_TYPE), normalized per
    row with ``vt.normalize_rows`` and then compressed with
    ``compress_normvec``.

    Args:
        vecs (ndarray): vectors assigned to the word (one per row)
        word (ndarray): the word; must broadcast against ``vecs`` (the
            doctests pass shapes (1, 128) and (128,))

    Returns:
        ndarray : rvecs_n, the compressed normalized residuals (one row per
            row of vecs). No flag array is returned; callers that need to
            detect null residuals check for all-zero rows themselves.

    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals --test-get_norm_residuals

    Example:
        >>> # ENABLE_DOCTEST
        >>> # The case where vecs != words
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> rng = np.random.RandomState(0)
        >>> vecs = (hstypes.VEC_MAX * rng.rand(4, 128)).astype(hstypes.VEC_TYPE)
        >>> word = (hstypes.VEC_MAX * rng.rand(1, 128)).astype(hstypes.VEC_TYPE)
        >>> rvecs_n = get_norm_residuals(vecs, word)
        >>> result = ut.numpy_str2(rvecs_n)
        >>> print(result)

    Example:
        >>> # ENABLE_DOCTEST
        >>> # The case where vecs == words
        >>> from ibeis.algo.hots.smk.smk_residuals import *  # NOQA
        >>> rng = np.random.RandomState(0)
        >>> vecs = (hstypes.VEC_MAX * rng.rand(4, 128)).astype(hstypes.VEC_TYPE)
        >>> word = vecs[1]
        >>> rvecs_n = get_norm_residuals(vecs, word)
        >>> result = ut.numpy_str2(rvecs_n)
        >>> print(result)

    IGNORE
        rvecs_agg8 = compress_normvec_uint8(arr_float)
        rvecs_agg16 = compress_normvec_float16(arr_float)
        ut.print_object_size(rvecs_agg16, 'rvecs_agg16: ')
        ut.print_object_size(rvecs_agg8,  'rvecs_agg8:  ')
        ut.print_object_size(rvec_flag,   'rvec_flag:   ')

        %timeit np.isnan(_rvec_sums)
        %timeit  _rvec_sums == 0
        %timeit  np.equal(rvec_sums, 0)
        %timeit  rvec_sums == 0
        %timeit  np.logical_or(np.isnan(_rvec_sums), _rvec_sums == 0)
    """
    # Compute residuals of assigned vectors.
    # Both operands are cast to FLOAT_TYPE before subtracting so the
    # difference is computed in floating point rather than in the storage
    # dtype of the descriptors.
    arr_float = np.subtract(word.astype(hstypes.FLOAT_TYPE), vecs.astype(hstypes.FLOAT_TYPE))
    # (reinterpreting with .view instead of .astype would be faster, but
    # does not work with the norm computation)
    vt.normalize_rows(arr_float, out=arr_float)
    # Converts normvec to a smaller type like float16 or int8
    rvecs_n = compress_normvec(arr_float)
    # IF FLOAT16 WE NEED TO FILL NANS
    # (but we should use int8, and in that case null residuals are expected
    # to come out of the compression as all-zero rows)
    # rvecs_n = np.nan_to_num(rvecs_n)
    return rvecs_n


if __name__ == '__main__':
    """
    CommandLine:
        python -m ibeis.algo.hots.smk.smk_residuals
        python -m ibeis.algo.hots.smk.smk_residuals --allexamples
        python -m ibeis.algo.hots.smk.smk_residuals --allexamples --noface --nosrc
    """
    # Script entry point: only runs when the module is executed directly
    # (see the CommandLine examples above), never on import.
    import multiprocessing
    # Needed so frozen Windows executables can spawn child processes;
    # has no effect elsewhere.
    multiprocessing.freeze_support()  # for win32
    import utool as ut  # NOQA
    # Hands control to utool's doctest runner (presumably it collects the
    # doctests of the calling module and honors the command line flags
    # shown above - confirm against utool's documentation).
    ut.doctest_funcs()