# -*- coding: utf-8 -*-
"""
smk_index
This module contains functions for the SelectiveMatchKernel's inverted index.

TODO::
    * Test suite with 1000k images
    * Extend for SMK with labels
    * Test get numbers and refine
    * External keypoint-specific weighting
"""
from __future__ import absolute_import, division, print_function
#import six
import utool # NOQA
import utool as ut
#import weakref
import numpy as np
import six # NOQA
import pyflann
#import pandas as pd
from six.moves import zip, map, range # NOQA
from vtool import clustering2 as clustertool
from ibeis.algo.hots import hstypes
from ibeis.algo.hots.smk import smk_scoring
from ibeis.algo.hots.smk import smk_residuals
(print, print_, printDBG, rrr, profile) = ut.inject(__name__, '[smk_index]')
USE_CACHE_WORDS = not ut.get_argflag('--nocache-words')
WITH_TOTALTIME = True
#@ut.memprof
@profile
def learn_visual_words(ibs, config2_=None, use_cache=USE_CACHE_WORDS, memtrack=None):
"""
Computes and caches visual words
    Args:
        ibs (IBEISController): ibeis controller object
        config2_ (QueryParams): query config object with hyper-parameters
        use_cache (bool): turns on disk based caching (default = True)
        memtrack (None): memory tracking object for debugging (default = None)

    Returns:
        ndarray[uint8_t, ndim=2]: words - aggregate descriptor cluster centers
Example:
>>> # SLOW_DOCTEST
>>> from ibeis.algo.hots.smk.smk_index import * # NOQA
>>> from ibeis.algo.hots.smk import smk_debug
>>> ibs, annots_df, taids, daids, qaids, qreq_, nWords = smk_debug.testdata_dataframe()
>>> use_cache = True
        >>> words = learn_visual_words(ibs, qreq_.qparams, use_cache)
>>> print(words.shape)
(8000, 128)
Example:
>>> # SLOW_DOCTEST
>>> from ibeis.algo.hots.smk.smk_index import * # NOQA
>>> import ibeis
>>> ibs = ibeis.opendb('PZ_Master1')
>>> config2_ = ibs.new_query_params(cfgdict=dict(nWords=128000))
>>> use_cache = True
>>> words = learn_visual_words(ibs, config2_)
>>> print(words.shape)
        (128000, 128)
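    Notes:
        Vocabulary learning boils down to approximate k-means: assign each
        training descriptor to its nearest word with a FLANN index, then move
        each word to the mean of its assigned descriptors. A minimal sketch of
        one such iteration (illustrative only; the cached implementation lives
        in vtool.clustering2.cached_akmeans)::

            import numpy as np
            import pyflann
            rng = np.random.RandomState(0)
            train_vecs = rng.randint(0, 255, (1000, 128)).astype(np.float32)
            words = train_vecs[rng.choice(len(train_vecs), 64, replace=False)]
            flann = pyflann.FLANN()
            # assign each descriptor to its (approximate) nearest word
            wx, _ = flann.nn(words, train_vecs, 1, algorithm='kdtree', trees=8)
            # recompute each word as the mean of its assigned descriptors
            for k in range(len(words)):
                assigned = train_vecs[wx == k]
                if len(assigned) > 0:
                    words[k] = assigned.mean(axis=0)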
Auto:
from ibeis.algo.hots.smk import smk_index
import utool as ut
argdoc = ut.make_default_docstr(smk_index.learn_visual_words)
print(argdoc)
"""
#if memtrack is None:
# memtrack = ut.MemoryTracker('[learn_visual_words]')
#config2_ = qreq_.get_external_data_config2()
nWords = config2_.nWords
    # TODO: Incorporate taids (vocab training ids) into qreq
if config2_.vocab_taids == 'all':
taids = ibs.get_valid_aids(species=ibs.get_primary_database_species()) # exemplar
else:
taids = config2_.vocab_taids
initmethod = config2_.vocab_init_method
max_iters = config2_.vocab_nIters
flann_params = config2_.vocab_flann_params
train_vecs_list = ibs.get_annot_vecs(taids, eager=True, config2_=config2_)
#memtrack.track_obj(train_vecs_list[0], 'train_vecs_list[0]')
#memtrack.report('loaded trainvecs')
train_vecs = np.vstack(train_vecs_list)
#memtrack.track_obj(train_vecs, 'train_vecs')
#memtrack.report('stacked trainvecs')
del train_vecs_list
print('[smk_index] Train Vocab(nWords=%d) using %d annots and %d descriptors' %
(nWords, len(taids), len(train_vecs)))
kwds = dict(max_iters=max_iters, use_cache=use_cache,
initmethod=initmethod, appname='smk',
flann_params=flann_params)
words = clustertool.cached_akmeans(train_vecs, nWords, **kwds)
del train_vecs
del kwds
#memtrack.report('returning words')
#del train_vecs_list
return words
@profile
def assign_to_words_(wordflann, words, idx2_vec, nAssign, massign_alpha,
massign_sigma, massign_equal_weights):
"""
Assigns descriptor-vectors to nearest word.
Args:
wordflann (FLANN): nearest neighbor index over words
words (ndarray): vocabulary words
idx2_vec (ndarray): descriptors to assign
nAssign (int): number of words to assign each descriptor to
massign_alpha (float): multiple-assignment ratio threshold
massign_sigma (float): multiple-assignment gaussian variance
massign_equal_weights (bool): assign equal weight to all multiassigned words
Returns:
tuple: inverted index, multi-assigned weights, and forward index
        formatted as::

            * wx2_idxs - word index -> vector indexes
            * wx2_maws - word index -> multi-assignment weights
            * idx2_wxs - vector index -> assigned word indexes
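    Notes:
        The inverted mapping is a group-by over word indexes. A minimal sketch
        of the same inversion with plain dicts (the real code uses
        vtool.clustering2 grouping for speed)::

            from collections import defaultdict
            idx2_wxs = [[0, 2], [2], [1]]   # toy forward index
            wx2_idxs = defaultdict(list)
            for idx, wxs in enumerate(idx2_wxs):
                for wx in wxs:
                    wx2_idxs[wx].append(idx)
            # wx2_idxs == {0: [0], 2: [0, 1], 1: [2]}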
Example:
>>> # SLOW_DOCTEST
>>> from ibeis.algo.hots.smk.smk_index import * # NOQA
>>> from ibeis.algo.hots.smk import smk_debug
>>> ibs, annots_df, daids, qaids, invindex, qreq_ = smk_debug.testdata_raw_internals0()
>>> words = invindex.words
>>> wordflann = invindex.wordflann
>>> idx2_vec = invindex.idx2_dvec
>>> nAssign = qreq_.qparams.nAssign
>>> massign_alpha = qreq_.qparams.massign_alpha
>>> massign_sigma = qreq_.qparams.massign_sigma
>>> massign_equal_weights = qreq_.qparams.massign_equal_weights
>>> _dbargs = (wordflann, words, idx2_vec, nAssign, massign_alpha, massign_sigma, massign_equal_weights)
>>> wx2_idxs, wx2_maws, idx2_wxs = assign_to_words_(*_dbargs)
"""
if ut.VERBOSE:
print('[smk_index.assign] +--- Start Assign vecs to words.')
print('[smk_index.assign] * nAssign=%r' % nAssign)
if not ut.QUIET:
print('[smk_index.assign] assign_to_words_. len(idx2_vec) = %r' % len(idx2_vec))
# Assign each vector to the nearest visual words
assert nAssign > 0, 'cannot assign to 0 neighbors'
try:
_idx2_wx, _idx2_wdist = wordflann.nn_index(idx2_vec, nAssign)
except pyflann.FLANNException as ex:
ut.printex(ex, 'probably misread the cached flann_fpath=%r' % (wordflann.flann_fpath,))
raise
_idx2_wx.shape = (idx2_vec.shape[0], nAssign)
_idx2_wdist.shape = (idx2_vec.shape[0], nAssign)
if nAssign > 1:
idx2_wxs, idx2_maws = compute_multiassign_weights_(
_idx2_wx, _idx2_wdist, massign_alpha, massign_sigma, massign_equal_weights)
else:
idx2_wxs = _idx2_wx.tolist()
idx2_maws = [[1.0]] * len(idx2_wxs)
# Invert mapping -- Group by word indexes
    jagged_idxs = ([idx] * len(wxs) for idx, wxs in enumerate(idx2_wxs))
wx_keys, groupxs = clustertool.jagged_group(idx2_wxs)
idxs_list = clustertool.apply_jagged_grouping(jagged_idxs, groupxs)
maws_list = clustertool.apply_jagged_grouping(idx2_maws, groupxs)
wx2_idxs = dict(zip(wx_keys, idxs_list))
wx2_maws = dict(zip(wx_keys, maws_list))
if ut.VERBOSE:
print('[smk_index.assign] L___ End Assign vecs to words.')
return wx2_idxs, wx2_maws, idx2_wxs
@profile
def compute_multiassign_weights_(_idx2_wx, _idx2_wdist, massign_alpha,
massign_sigma, massign_equal_weights):
"""
Multi Assignment Filtering from Improving Bag of Features
    Args:
        _idx2_wx (ndarray): nearest word indexes for each vector
        _idx2_wdist (ndarray): squared distances to those words
        massign_alpha (float): multiple-assignment ratio threshold
        massign_sigma (float): gaussian variance for soft weighting
        massign_equal_weights (bool): turns off soft weighting; gives all
            assigned vectors weight 1
Returns:
tuple : (idx2_wxs, idx2_maws)
References:
(Improving Bag of Features)
http://lear.inrialpes.fr/pubs/2010/JDS10a/jegou_improvingbof_preprint.pdf
(Lost in Quantization)
http://www.robots.ox.ac.uk/~vgg/publications/papers/philbin08.ps.gz
(A Context Dissimilarity Measure for Accurate and Efficient Image Search)
https://lear.inrialpes.fr/pubs/2007/JHS07/jegou_cdm.pdf
Notes:
sigma values from \cite{philbin_lost08}
(70 ** 2) ~= 5000,
(80 ** 2) ~= 6250,
(86 ** 2) ~= 7500,
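
        A minimal numpy sketch of the soft weighting (assumes the same
        alpha-thresholding and gaussian scheme as the body of this function;
        the squared distances are made up)::

            import numpy as np
            wdists = np.array([[100., 115., 300.]])   # toy squared distances
            alpha, sigma = 1.2, 80.0
            thresh = alpha * (wdists[:, 0:1] + .001)  # fraction of nearest dist
            invalid = wdists >= thresh                # third assignment dropped
            unnorm = np.exp(-wdists / (2 * sigma ** 2))
            unnorm[invalid] = 0
            maws = unnorm / unnorm.sum(axis=1, keepdims=True)
            # maws ~= [[0.5003, 0.4997, 0.0]]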
Auto:
from ibeis.algo.hots.smk import smk_index
import utool as ut; print(ut.make_default_docstr(smk_index.compute_multiassign_weights_))
"""
if not ut.QUIET:
print('[smk_index.assign] compute_multiassign_weights_')
    # Valid word assignments are within massign_alpha times the distance to the nearest word
massign_thresh = _idx2_wdist.T[0:1].T.copy()
# HACK: If the nearest word has distance 0 then this threshold is too hard
# so we should use the distance to the second nearest word.
flag_too_close = (massign_thresh == 0)
massign_thresh[flag_too_close] = _idx2_wdist.T[1:2].T[flag_too_close]
# Compute the threshold fraction
np.add(.001, massign_thresh, out=massign_thresh)
np.multiply(massign_alpha, massign_thresh, out=massign_thresh)
invalid = np.greater_equal(_idx2_wdist, massign_thresh)
if ut.VERBOSE:
_ = (invalid.size - invalid.sum(), invalid.size)
print('[smk_index.assign] + massign_alpha = %r' % (massign_alpha,))
print('[smk_index.assign] + massign_sigma = %r' % (massign_sigma,))
print('[smk_index.assign] + massign_equal_weights = %r' % (massign_equal_weights,))
print('[smk_index.assign] * Marked %d/%d assignments as invalid' % _)
if massign_equal_weights:
# Performance hack from jegou paper: just give everyone equal weight
masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
idx2_wxs = list(map(ut.filter_Nones, masked_wxs.tolist()))
#ut.embed()
if ut.DEBUG2:
assert all([isinstance(wxs, list) for wxs in idx2_wxs])
idx2_maws = [np.ones(len(wxs), dtype=np.float32) for wxs in idx2_wxs]
else:
# More natural weighting scheme
# Weighting as in Lost in Quantization
gauss_numer = -_idx2_wdist.astype(np.float64)
gauss_denom = 2 * (massign_sigma ** 2)
gauss_exp = np.divide(gauss_numer, gauss_denom)
unnorm_maw = np.exp(gauss_exp)
# Mask invalid multiassignment weights
masked_unorm_maw = np.ma.masked_array(unnorm_maw, mask=invalid)
# Normalize multiassignment weights from 0 to 1
masked_norm = masked_unorm_maw.sum(axis=1)[:, np.newaxis]
masked_maw = np.divide(masked_unorm_maw, masked_norm)
masked_wxs = np.ma.masked_array(_idx2_wx, mask=invalid)
# Remove masked weights and word indexes
idx2_wxs = list(map(ut.filter_Nones, masked_wxs.tolist()))
idx2_maws = list(map(ut.filter_Nones, masked_maw.tolist()))
#with ut.EmbedOnException():
if ut.DEBUG2:
checksum = [sum(maws) for maws in idx2_maws]
for x in np.where([not ut.almost_eq(val, 1) for val in checksum])[0]:
print(checksum[x])
print(_idx2_wx[x])
print(masked_wxs[x])
print(masked_maw[x])
print(massign_thresh[x])
print(_idx2_wdist[x])
#all([ut.almost_eq(x, 1) for x in checksum])
            assert all([ut.almost_eq(val, 1) for val in checksum]), 'multi-assign weights do not sum to 1'
return idx2_wxs, idx2_maws
#@ut.cached_func('smk_idf', appname='smk', key_argx=[1, 2, 3], key_kwds=['daid2_label'])
@profile
def compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids, daid2_label=None,
vocab_weighting='idf', verbose=False):
"""
Computes the inverse-document-frequency weighting for each word
    Args:
        wx_series (ndarray): word indexes to compute idf weights for
        wx2_idxs (dict): maps word index to vector indexes
        idx2_aid (ndarray): maps vector index to annotation id
        daids (list): database annotation ids
        daid2_label (dict): maps annotation id to label (default = None)
        vocab_weighting (str): 'idf' or 'negentropy' (default = 'idf')
Returns:
wx2_idf
Example:
>>> # SLOW_DOCTEST
>>> from ibeis.algo.hots.smk.smk_index import * # NOQA
>>> from ibeis.algo.hots.smk import smk_debug
>>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
>>> wx_series = np.arange(len(invindex.words))
>>> idx2_aid = invindex.idx2_daid
>>> daid2_label = invindex.daid2_label
>>> wx2_idf = compute_word_idf_(wx_series, wx2_idxs, idx2_aid, daids)
>>> result = str(len(wx2_idf))
>>> print(result)
8000
Ignore:
#>>> wx2_idxs = invindex.wx2_idxs
Auto:
from ibeis.algo.hots.smk import smk_index
import utool as ut; print(ut.make_default_docstr(smk_index.compute_word_idf_))
"""
if not ut.QUIET:
print('[smk_index.idf] +--- Start Compute IDF')
if ut.VERBOSE or verbose:
print('[smk_index.idf] Word IDFs: ')
idxs_list, aids_list = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
# TODO: Integrate different idf measures
if vocab_weighting == 'idf':
idf_list = compute_idf_orig(aids_list, daids)
elif vocab_weighting == 'negentropy':
assert daid2_label is not None
idf_list = compute_idf_label1(aids_list, daid2_label)
else:
raise AssertionError('unknown option vocab_weighting=%r' % vocab_weighting)
if ut.VERBOSE or verbose:
print('[smk_index.idf] L___ End Compute IDF')
wx2_idf = dict(zip(wx_series, idf_list))
return wx2_idf
@profile
def helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series):
""" helper function """
# idxs for each word
idxs_list = [wx2_idxs[wx].astype(hstypes.INDEX_TYPE)
if wx in wx2_idxs
else np.empty(0, dtype=hstypes.INDEX_TYPE)
for wx in wx_series]
# aids for each word
aids_list = [idx2_aid.take(idxs)
if len(idxs) > 0
else np.empty(0, dtype=hstypes.INDEX_TYPE)
for idxs in idxs_list]
return idxs_list, aids_list
@profile
def compute_idf_orig(aids_list, daids):
"""
The standard tried and true idf measure
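
    A toy numeric check of the formula idf = log(nTotalDocs / (nDocsWithWord + 1))
    (the counts are made up)::

        import numpy as np
        nTotalDocs = 100
        nDocsWithWord = np.array([0, 9, 99])
        idf = np.log(nTotalDocs / (nDocsWithWord + 1.0))
        # idf ~= [4.605, 2.303, 0.0]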
"""
nTotalDocs = len(daids)
# idf denominator
nDocsWithWord_list = np.array([len(set(aids)) for aids in aids_list])
# Typically for IDF, 1 is added to the denominator to prevent divide by 0
# compute idf half of sccw-idf weighting
idf_list = np.log(np.divide(nTotalDocs, np.add(nDocsWithWord_list, 1),
dtype=hstypes.FLOAT_TYPE), dtype=hstypes.FLOAT_TYPE)
return idf_list
@profile
def compute_negentropy_names(aids_list, daid2_label):
r"""
One of our idf extensions
Word weighting based on the negative entropy over all names of p(n_i | word)
Args:
aids_list (list of aids):
daid2_label (dict from daid to label):
Returns:
        negentropy_list (ndarray[float64]): idf-like weighting for each word based on the negative entropy
Example:
>>> # SLOW_DOCTEST
>>> from ibeis.algo.hots.smk.smk_index import * # NOQA
>>> from ibeis.algo.hots.smk import smk_debug
>>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
>>> wx_series = np.arange(len(invindex.words))
>>> idx2_aid = invindex.idx2_daid
>>> daid2_label = invindex.daid2_label
>>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
>>> idxs_list, aids_list = _
Math::
p(n_i | \word) = \sum_{\lbl \in L_i} p(\lbl | \word)
p(\lbl | \word) = \frac{p(\word | \lbl) p(\lbl)}{p(\word)}
p(\word) = \sum_{\lbl' \in L} p(\word | \lbl') p(\lbl')
p(\word | \lbl) = NumAnnotOfLabelWithWord / NumAnnotWithLabel =
\frac{\sum_{\X \in \DB_\lbl} b(\word, \X)}{\card{\DB_\lbl}}
h(n_i | word) = -\sum_{i=1}^N p(n_i | \word) \log p(n_i | \word)
word_weight = log(N) - h(n | word)
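    Notes:
        A toy numeric check of the weighting (assumes the normalized per-name
        probabilities are already in hand): a word spread evenly over all
        names is maximally entropic and gets weight ~0, while a word unique to
        one name gets the maximum weight log(N)::

            import numpy as np
            # rows are words, columns are names (N = 4)
            probNameGivenWord = np.array([[0.25, 0.25, 0.25, 0.25],
                                          [1.00, 0.00, 0.00, 0.00]])
            eps = 1E-9
            p = eps + (1.0 - eps) * probNameGivenWord
            entropy = -(p * np.log(p)).sum(axis=1)
            weight = np.log(4) - entropy
            # weight ~= [0.0, 1.386]  (log(4) ~= 1.386)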
CommandLine:
python dev.py -t smk2 --allgt --db GZ_ALL
python dev.py -t smk5 --allgt --db GZ_ALL
Auto:
python -c "import utool as ut; ut.print_auto_docstr('ibeis.algo.hots.smk.smk_index', 'compute_negentropy_names')"
"""
nWords = len(aids_list)
# --- LABEL MEMBERS w.r.t daids ---
# compute mapping from label to daids
# Translate tuples into scalars for efficiency
label_list = list(daid2_label.values())
lblindex_list = np.array(ut.tuples_to_unique_scalars(label_list))
#daid2_lblindex = dict(zip(daid_list, lblindex_list))
unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    # list() is needed so np.array builds a proper array on python3
    daid_list = np.array(list(daid2_label.keys()))
daids_list = [daid_list.take(xs) for xs in groupxs]
# --- DAID MEMBERS w.r.t. words ---
# compute mapping from daid to word indexes
# finds all the words that belong to an annotation
daid2_wxs = ut.ddict(list)
for wx, _daids in enumerate(aids_list):
for daid in _daids:
daid2_wxs[daid].append(wx)
# --- \Pr(\word \given \lbl) for each label ---
# Compute the number of annotations in a label with the word vs
# the number of annotations in the label
lblindex2_daids = list(zip(unique_lblindexes, daids_list))
# Get num times word appears for each label
probWordGivenLabel_list = []
for lblindex, _daids in lblindex2_daids:
nAnnotOfLabelWithWord = np.zeros(nWords, dtype=np.int32)
for daid in _daids:
wxs = np.unique(daid2_wxs[daid])
nAnnotOfLabelWithWord[wxs] += 1
probWordGivenLabel = nAnnotOfLabelWithWord.astype(np.float64) / len(_daids)
probWordGivenLabel_list.append(probWordGivenLabel)
# (nLabels, nWords)
probWordGivenLabel_arr = np.array(probWordGivenLabel_list)
# --- \Pr(\lbl \given \word) ---
# compute partition function that approximates probability of a word
# (1, nWords)
probWord = probWordGivenLabel_arr.sum(axis=0)
probWord.shape = (1, probWord.size)
# (nLabels, nWords)
probLabelGivenWord_arr = (probWordGivenLabel_arr / probWord)
# --- \Pr(\name \given \lbl) ---
# get names for each unique label
nid_list = np.array([label_list[xs[0]][0] for xs in groupxs])
unique_nids, groupxs_ = clustertool.group_indices(nid_list)
# (nNames, nWords)
# add a little wiggle room
eps = 1E-9
# http://stackoverflow.com/questions/872544/precision-of-floating-point
#epsilon = 2^(E-52) % For a 64-bit float (double precision)
#epsilon = 2^(E-23) % For a 32-bit float (single precision)
#epsilon = 2^(E-10) % For a 16-bit float (half precision)
probNameGivenWord = eps + (1.0 - eps) * np.array([probLabelGivenWord_arr.take(xs, axis=0).sum(axis=0) for xs in groupxs_])
logProbNameGivenWord = np.log(probNameGivenWord)
wordNameEntropy = -(probNameGivenWord * logProbNameGivenWord).sum(0)
# Compute negative entropy for weights
nNames = len(nid_list)
negentropy_list = np.log(nNames) - wordNameEntropy
return negentropy_list
@profile
def compute_idf_label1(aids_list, daid2_label):
"""
One of our idf extensions
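
    A toy numeric sketch that mirrors the formula in the body (assumes two
    labels and two words; note that a word missing from a label *lowers* its
    weight under this scheme)::

        import numpy as np
        nLabels = 2
        # fraction of each label's annots that contain each word
        fracWithWord = np.array([[1.0, 0.0],    # label 1: words a, b
                                 [1.0, 1.0]])   # label 2: words a, b
        pcntLblsWithWord = (1 - fracWithWord).sum(axis=0)
        idf = np.log(nLabels / (pcntLblsWithWord + 1.0))
        # word a -> log(2/1) ~= 0.693;  word b -> log(2/2) == 0.0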
Example:
>>> from ibeis.algo.hots.smk.smk_index import * # NOQA
>>> from ibeis.algo.hots.smk import smk_debug
>>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
>>> wx_series = np.arange(len(invindex.words))
>>> idx2_aid = invindex.idx2_daid
>>> daid2_label = invindex.daid2_label
>>> _ = helper_idf_wordgroup(wx2_idxs, idx2_aid, wx_series)
>>> idxs_list, aids_list = _
        >>> idf_list = compute_idf_label1(aids_list, daid2_label)
"""
nWords = len(aids_list)
# Computes our novel label idf weight
    lblindex_list = np.array(ut.tuples_to_unique_scalars(list(daid2_label.values())))
    #daid2_lblindex = dict(zip(daid_list, lblindex_list))
    unique_lblindexes, groupxs = clustertool.group_indices(lblindex_list)
    # list() is needed so np.array builds a proper array on python3
    daid_list = np.array(list(daid2_label.keys()))
    daids_list = [daid_list.take(xs) for xs in groupxs]
daid2_wxs = ut.ddict(list)
for wx, daids in enumerate(aids_list):
for daid in daids:
daid2_wxs[daid].append(wx)
lblindex2_daids = list(zip(unique_lblindexes, daids_list))
nLabels = len(unique_lblindexes)
pcntLblsWithWord = np.zeros(nWords, np.float64)
    # Get num times word appears for each label
for lblindex, daids in lblindex2_daids:
nWordsWithLabel = np.zeros(nWords)
for daid in daids:
wxs = daid2_wxs[daid]
nWordsWithLabel[wxs] += 1
pcntLblsWithWord += (1 - nWordsWithLabel.astype(np.float64) / len(daids))
# Labels for each word
idf_list = np.log(np.divide(nLabels, np.add(pcntLblsWithWord, 1),
dtype=hstypes.FLOAT_TYPE),
dtype=hstypes.FLOAT_TYPE)
return idf_list
#@ut.cached_func('smk_rvecs_', appname='smk')
@profile
def compute_residuals_(words, wx2_idxs, wx2_maws, idx2_vec, idx2_aid,
idx2_fx, aggregate, verbose=False):
"""
Computes residual vectors based on word assignments
returns mapping from word index to a set of residual vectors
Args:
        words (ndarray): visual vocabulary words
        wx2_idxs (dict): maps word index to vector indexes
        wx2_maws (dict): maps word index to multi-assign weights
        idx2_vec (ndarray): maps vector index to descriptor vector
        idx2_aid (ndarray): maps vector index to annotation id
        idx2_fx (ndarray): maps vector index to feature index
        aggregate (bool): whether to aggregate residuals per word and annot
        verbose (bool):

    Returns:
        tuple : (wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags) formatted as::

            * wx2_rvecs - [ ... [ rvec_i1, ..., rvec_Mi ]_i ... ]
            * wx2_aids  - [ ... [  aid_i1, ...,  aid_Mi ]_i ... ]
            * wx2_fxs   - [ ... [[fxs]_i1, ..., [fxs]_Mi ]_i ... ]

        For every word::

            * list of aggvecs
            * For every aggvec:
                * one parent aid (if aggregate is True: assert isunique(aids))
                * list of parent fxs (if aggregate is False: assert len(fxs) == 1)
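    Notes:
        A minimal sketch of the per-descriptor residual (assumes the
        convention in smk_residuals: residual = word - descriptor,
        L2-normalized, with zero residuals flagged rather than divided)::

            import numpy as np
            word = np.array([10., 0.])
            vecs = np.array([[8., 2.], [10., 0.]])    # toy descriptors
            rvecs = word[None, :] - vecs
            norms = np.linalg.norm(rvecs, axis=1)
            flags = norms == 0                        # flag zero residuals
            rvecs[~flags] /= norms[~flags, None]
            # rvecs ~= [[0.707, -0.707], [0., 0.]];  flags == [False, True]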
Example:
>>> # SLOW_DOCTEST
>>> from ibeis.algo.hots.smk.smk_index import * # NOQA
>>> from ibeis.algo.hots.smk import smk_debug
>>> ibs, annots_df, daids, qaids, invindex, wx2_idxs, qparams = smk_debug.testdata_raw_internals1()
>>> words = invindex.words
>>> idx2_aid = invindex.idx2_daid
>>> idx2_fx = invindex.idx2_dfx
>>> idx2_vec = invindex.idx2_dvec
>>> aggregate = ibs.cfg.query_cfg.smk_cfg.aggregate
>>> wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags = compute_residuals_(words, wx2_idxs, wx2_maws, idx2_vec, idx2_aid, idx2_fx, aggregate)
"""
if not ut.QUIET:
print('[smk_index.rvec] +--- Start Compute Residuals')
    wx_sublist = np.array(list(wx2_idxs.keys()))
# Build lists w.r.t. words
idxs_list = [wx2_idxs[wx].astype(hstypes.INDEX_TYPE) for wx in wx_sublist]
aids_list = [idx2_aid.take(idxs) for idxs in idxs_list]
if ut.DEBUG2:
#assert np.all(np.diff(wx_sublist) == 1), 'not dense'
assert all([len(a) == len(b) for a, b in zip(idxs_list, aids_list)]), 'bad alignment'
assert idx2_vec.shape[0] == idx2_fx.shape[0]
assert idx2_vec.shape[0] == idx2_aid.shape[0]
# Prealloc output
if ut.VERBOSE or verbose:
lbl = '[smk_index.rvec] agg rvecs' if aggregate else '[smk_index.rvec] nonagg rvecs'
print(lbl)
if ut.DEBUG2:
from ibeis.algo.hots.smk import smk_debug
smk_debug.check_wx2_idxs(wx2_idxs, len(words))
# Compute Residuals
rvecs_list, flags_list = smk_residuals.compute_nonagg_rvecs(words, idx2_vec, wx_sublist, idxs_list)
if ut.VERBOSE:
print('Computed size(rvecs_list) = %r' % ut.get_object_size_str(rvecs_list))
print('Computed size(flags_list) = %r' % ut.get_object_size_str(flags_list))
if aggregate:
maws_list = [wx2_maws[wx] for wx in wx_sublist]
# Aggregate Residuals
tup = smk_residuals.compute_agg_rvecs(rvecs_list, idxs_list, aids_list, maws_list)
(aggvecs_list, aggaids_list, aggidxs_list, aggmaws_list, aggflags_list) = tup
# Pack into common query structure
aggfxs_list = [[idx2_fx.take(idxs) for idxs in aggidxs] for aggidxs in aggidxs_list]
wx2_aggvecs = dict(zip(wx_sublist, aggvecs_list))
wx2_aggaids = dict(zip(wx_sublist, aggaids_list))
wx2_aggfxs = dict(zip(wx_sublist, aggfxs_list))
wx2_aggmaws = dict(zip(wx_sublist, aggmaws_list))
wx2_aggflags = dict(zip(wx_sublist, aggflags_list))
(wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags) = (
wx2_aggvecs, wx2_aggaids, wx2_aggfxs, wx2_aggmaws, wx2_aggflags)
else:
# Hack non-aggregate residuals to have the same structure as aggregate
        # residuals for compatibility: i.e. each rvec gets a list of fxs that
# contributed to it, and for SMK this is a list of size 1
fxs_list = [[idx2_fx[idx:idx + 1] for idx in idxs] for idxs in idxs_list]
wx2_rvecs = dict(zip(wx_sublist, rvecs_list))
wx2_aids = dict(zip(wx_sublist, aids_list))
wx2_fxs = dict(zip(wx_sublist, fxs_list))
wx2_flags = dict(zip(wx_sublist, flags_list))
if ut.DEBUG2:
from ibeis.algo.hots.smk import smk_debug
smk_debug.check_wx2(words, wx2_rvecs, wx2_aids, wx2_fxs)
if ut.VERBOSE or verbose:
print('[smk_index.rvec] L___ End Compute Residuals')
return wx2_rvecs, wx2_aids, wx2_fxs, wx2_maws, wx2_flags
#@ut.cached_func('sccw', appname='smk', key_argx=[1, 2])
@profile
def compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf,
wx2_dmaws, smk_alpha, smk_thresh, verbose=False):
"""
Computes sccw normalization scalar for the database annotations.
This is gamma from the SMK paper.
    sccw is a self consistency criterion weight: a scalar which ensures
    the score of K(X, X) = 1
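
    A minimal sketch of the normalizer (assumes the unnormalized per-word
    self-similarity scores are already computed): gamma(X) is the inverse
    square root of the total self score, so that gamma(X)**2 * K(X, X) == 1::

        import numpy as np
        selfscore_perword = np.array([2.5, 0.5, 1.0])  # toy per-word scores
        sccw = 1.0 / np.sqrt(selfscore_perword.sum())
        # sccw == 0.5;  0.5 ** 2 * 4.0 == 1.0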
Args:
        idx2_daid (ndarray): maps vector index to database annotation id
        wx2_drvecs (dict): maps word index to database residual vectors
        wx2_dflags (dict): maps word index to zero-residual flags
        wx2_aids (dict): maps word index to annotation ids
        wx2_idf (dict): maps word index to idf weight
        wx2_dmaws (dict): maps word index to multi-assign weights
        smk_alpha (float): selectivity power
        smk_thresh (float): selectivity threshold
Returns:
daid2_sccw
Example:
>>> # SLOW_DOCTEST
>>> from ibeis.algo.hots.smk.smk_index import * # NOQA
>>> from ibeis.algo.hots.smk import smk_index
>>> from ibeis.algo.hots.smk import smk_debug
>>> #tup = smk_debug.testdata_compute_data_sccw(db='testdb1')
>>> tup = smk_debug.testdata_compute_data_sccw(db='PZ_MTEST')
>>> ibs, annots_df, invindex, wx2_idxs, wx2_idf, wx2_drvecs, wx2_aids, qparams = tup
>>> wx2_dflags = invindex.wx2_dflags
        >>> wx2_idxs = invindex.wx2_idxs
>>> wx2_dmaws = invindex.wx2_dmaws
>>> idx2_daid = invindex.idx2_daid
>>> daids = invindex.daids
>>> smk_alpha = qparams.smk_alpha
>>> smk_thresh = qparams.smk_thresh
>>> verbose = True
>>> invindex.invindex_dbgstr()
>>> invindex.report_memory()
>>> invindex.report_memsize()
>>> daid2_sccw = smk_index.compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_dflags, wx2_aids, wx2_idf, wx2_dmaws, smk_alpha, smk_thresh, verbose)
"""
verbose_ = ut.VERBOSE or verbose
if ut.DEBUG2:
from ibeis.algo.hots.smk import smk_debug
smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
if not ut.QUIET:
print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
if verbose_:
print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))
# Group by daids first and then by word index
# Get list of aids and rvecs w.r.t. words (ie one item per word)
wx_sublist = np.array(list(wx2_drvecs.keys()))
aids_perword = [wx2_aids[wx] for wx in wx_sublist]
    # flat_wxs: lays out word indexes for each annotation
    # flat_txs: temporary within-word subindex; (wx, tx) uniquely identifies
    # an item in wx2_drvecs, wx2_dflags, and wx2_dmaws
# Flatten out indexes to perform grouping
flat_aids = np.hstack(aids_perword)
count = len(flat_aids)
txs_perword = [np.arange(aids.size) for aids in aids_perword]
flat_txs = np.hstack(txs_perword)
    # fromiter is faster for flat_wxs because it is not a list of numpy arrays
wxs_perword = ([wx] * len(aids) for wx, aids in zip(wx_sublist, aids_perword))
flat_wxs = np.fromiter(ut.iflatten(wxs_perword), hstypes.INDEX_TYPE, count)
# Group flat indexes by annotation id
unique_aids, annot_groupxs = clustertool.group_indices(flat_aids)
# Wxs and Txs grouped by annotation id
wxs_perannot = clustertool.apply_grouping_iter(flat_wxs, annot_groupxs)
txs_perannot = clustertool.apply_grouping_iter(flat_txs, annot_groupxs)
# Group by word inside each annotation group
wxsubgrouping_perannot = [clustertool.group_indices(wxs)
for wxs in wxs_perannot]
word_groupxs_perannot = (groupxs for wxs, groupxs in wxsubgrouping_perannot)
txs_perword_perannot = [clustertool.apply_grouping(txs, groupxs)
for txs, groupxs in
zip(txs_perannot, word_groupxs_perannot)]
wxs_perword_perannot = [wxs for wxs, groupxs in wxsubgrouping_perannot]
    # Group relevant data for the sccw measure by word for each annotation grouping
def _vector_subgroup_by_wx(wx2_arr, wxs_perword_perannot, txs_perword_perannot):
return [[wx2_arr[wx].take(txs, axis=0)
for wx, txs in zip(wx_perword_, txs_perword_)]
for wx_perword_, txs_perword_ in
zip(wxs_perword_perannot, txs_perword_perannot)]
def _scalar_subgroup_by_wx(wx2_scalar, wxs_perword_perannot):
return [[wx2_scalar[wx] for wx in wxs] for wxs in wxs_perword_perannot]
subgrouped_drvecs = _vector_subgroup_by_wx(wx2_drvecs, wxs_perword_perannot, txs_perword_perannot)
subgrouped_dmaws = _vector_subgroup_by_wx(wx2_dmaws, wxs_perword_perannot, txs_perword_perannot)
# If we aren't using dmaws replace it with an infinite None iterator
#subgrouped_dmaws = iter(lambda: None, 1)
subgrouped_dflags = _vector_subgroup_by_wx(wx2_dflags, wxs_perword_perannot, txs_perword_perannot)
#subgrouped_dflags = iter(lambda: None, 1)
subgrouped_idfs = _scalar_subgroup_by_wx(wx2_idf, wxs_perword_perannot)
if verbose_:
progiter = ut.ProgressIter(lbl='[smk_index.sccw] SCCW Sum (over daid): ',
total=len(unique_aids), freq=10, with_time=WITH_TOTALTIME)
else:
progiter = ut.identity
if ut.DEBUG2:
from ibeis.algo.hots.smk import smk_debug
smk_debug.check_data_smksumm(subgrouped_idfs, subgrouped_drvecs)
sccw_list = [
smk_scoring.sccw_summation(rvecs_list, flags_list, idf_list, maws_list, smk_alpha, smk_thresh)
for rvecs_list, flags_list, maws_list, idf_list in
progiter(zip(subgrouped_drvecs, subgrouped_dflags, subgrouped_dmaws, subgrouped_idfs))
]
daid2_sccw = dict(zip(unique_aids, sccw_list))
if verbose_:
print('[smk_index.sccw] L___ End Compute Data SCCW\n')
return daid2_sccw
@profile
def OLD_compute_data_sccw_(idx2_daid, wx2_drvecs, wx2_aids, wx2_idf, wx2_dmaws,
smk_alpha, smk_thresh, verbose=False):
"""
"""
if ut.DEBUG2:
from ibeis.algo.hots.smk import smk_debug
smk_debug.rrr()
smk_debug.check_wx2(wx2_rvecs=wx2_drvecs, wx2_aids=wx2_aids)
with ut.Timer('timer_orig1'):
        wx_sublist = np.array(list(wx2_drvecs.keys()))
if not ut.QUIET:
print('\n[smk_index.sccw] +--- Start Compute Data Self Consistency Weight')
if ut.VERBOSE or verbose:
print('[smk_index.sccw] Compute SCCW smk_alpha=%r, smk_thresh=%r: ' % (smk_alpha, smk_thresh))
# Get list of aids and rvecs w.r.t. words
aids_list = [wx2_aids[wx] for wx in wx_sublist]
rvecs_list1 = [wx2_drvecs[wx] for wx in wx_sublist]
maws_list = [wx2_dmaws[wx] for wx in wx_sublist]
if ut.DEBUG2:
from ibeis.algo.hots.smk import smk_debug
smk_debug.assert_single_assigned_maws(maws_list)
# Group by daids first and then by word index
daid2_wx2_drvecs = clustertool.double_group(wx_sublist, aids_list, rvecs_list1)
# For every daid, compute its sccw using pregrouped rvecs
# Summation over words for each aid
if ut.VERBOSE or verbose:
print('[smk_index.sccw] SCCW Sum (over daid): ')
# Get lists w.r.t daids
aid_list = list(daid2_wx2_drvecs.keys())
# list of mappings from words to rvecs foreach daid
# [wx2_aidrvecs_1, ..., wx2_aidrvecs_nDaids,]
_wx2_aidrvecs_list = list(daid2_wx2_drvecs.values())
_aidwxs_iter = (list(wx2_aidrvecs.keys()) for wx2_aidrvecs in _wx2_aidrvecs_list)
aidrvecs_list = [list(wx2_aidrvecs.values()) for wx2_aidrvecs in _wx2_aidrvecs_list]
aididf_list = [[wx2_idf[wx] for wx in aidwxs] for aidwxs in _aidwxs_iter]
with ut.Timer('timer_orig2'):
if ut.DEBUG2:
from ibeis.algo.hots.smk import smk_debug
smk_debug.check_data_smksumm(aididf_list, aidrvecs_list)
# TODO: implement database side soft-assign
sccw_list = [smk_scoring.sccw_summation(rvecs_list, None, idf_list, None, smk_alpha, smk_thresh)
for idf_list, rvecs_list in zip(aididf_list, aidrvecs_list)]
daid2_sccw = dict(zip(aid_list, sccw_list))
if ut.VERBOSE or verbose:
print('[smk_index.sccw] L___ End Compute Data SCCW\n')
return daid2_sccw