Source code for ibeis.dbio.ingest_database

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
"""
This module lists known raw databases and how to ingest them.

Specify arguments and run the following command to ingest a database

python -m ibeis --tf ingest_rawdata --db seaturtles  --imgdir "~/turtles/Turtles from Jill" --ingest-type=named_folders --species=turtles

# --- GET DATA ---
rsync -avhzP <user>@<host>:<remotedir>  <path-to-raw-imgs>
# --- RUN INGEST SCRIPT ---
python -m ibeis --tf ingest_rawdata --db <new-ibeis-db-name> --imgdir <path-to-raw-imgs> --ingest-type=named_folders --species=<optional> --fmtkey=<optional>
"""
from __future__ import absolute_import, division, print_function
from six.moves import zip, map, range
import ibeis
import os
from os.path import relpath, dirname, exists, join, realpath, basename
from ibeis.other import ibsfuncs
from ibeis import constants as const
import utool as ut
import vtool as vt
import parse


class Ingestable(object):
    """
    Temporary structure representing how to ingest a database.

    Args:
        dbname (str): name of the ibeis database
        img_dir (str): directory containing the raw images. When None it is
            looked up from ``dbname`` (see ``ensure_feasibility``).
        ingest_type (str): how names are derived from the raw data;
            ``ingest_rawdata`` handles 'named_folders', 'named_images' and
            'unknown'.
        fmtkey (str): format key / pattern used to parse names
        adjust_percent (float): forwarded by ``ingest_rawdata`` to
            ``ibs.use_images_as_annotations``
        postingest_func (callable): called as ``postingest_func(ibs)`` by
            ``ingest_rawdata`` once everything else was ingested
        zipfile (str): archive path; ``ingest_rawdata`` joins it onto the
            ibeis work directory and unarchives it
        species (str): species text set on every created annotation
        images_as_annots (bool): if True ``ingest_rawdata`` turns the
            ingested images into annotations
    """

    def __init__(self, dbname, img_dir=None, ingest_type=None, fmtkey=None,
                 adjust_percent=0.0, postingest_func=None, zipfile=None,
                 species=None, images_as_annots=True):
        self.dbname = dbname
        self.img_dir = img_dir
        self.ingest_type = ingest_type
        self.fmtkey = fmtkey
        self.zipfile = zipfile
        self.adjust_percent = adjust_percent
        self.postingest_func = postingest_func
        self.species = species
        self.images_as_annots = images_as_annots
        # Fail early if the image directory cannot be resolved
        self.ensure_feasibility()

    def __str__(self):
        return ut.dict_str(self.__dict__)

    def ensure_feasibility(self):
        """
        Resolves ``self.img_dir`` and verifies that the directory exists.

        When no ``img_dir`` was given it is looked up with
        ``ibeis.sysres.db_to_dbdir`` (the raw directory is passed as an extra
        work directory). The final path is normalized with ``ut.truepath``.

        Raises:
            AssertionError: if no image directory could be determined or the
                resolved directory does not exist.
        """
        rawdir = ibeis.sysres.get_rawdir()
        if self.img_dir is None:
            # Try to find data either the raw or work dir
            self.img_dir = ibeis.sysres.db_to_dbdir(
                self.dbname, extra_workdirs=[rawdir], allow_newdir=True)
        msg = 'Cannot find img_dir for dbname=%r, img_dir=%r' % (
            self.dbname, self.img_dir)
        # These checks are raised explicitly instead of using ``assert`` so
        # they are not stripped when python runs with -O. The exception type
        # is unchanged for existing callers.
        if self.img_dir is None:
            raise AssertionError(msg)
        self.img_dir = ut.truepath(self.img_dir)
        if not exists(self.img_dir):
            raise AssertionError(msg)
class Ingestable2(object):
    """
    Describes a set of image sources and ingests them into a database.

    Args:
        dbdir (str): stored on the instance as ``self.dbdir``
        imgpath_list (list): explicit image paths to ingest
        imgdir_list (list): directories that are searched recursively for
            images
        zipfile_list (list): archives that are unzipped and searched for
            images
        postingest_func (callable): called as ``postingest_func(ibs)`` at the
            end of ``execute``
        ingest_config (dict): configuration values; these take precedence
            over ``**kwargs``
        **kwargs: additional configuration values (images_as_annots,
            ingest_type, species, adjust_percent)
    """

    def __init__(self, dbdir, imgpath_list=None, imgdir_list=None,
                 zipfile_list=None, postingest_func=None, ingest_config=None,
                 **kwargs):
        self.dbdir = dbdir
        self.zipfile_list = zipfile_list
        self.imgdir_list = imgdir_list
        self.imgpath_list = imgpath_list
        self.postingest_func = postingest_func

        import dtool
        valid_species = ['____']

        class IngestConfig(dtool.Config):
            _param_info_list = [
                ut.ParamInfo(
                    'images_as_annots', True),
                ut.ParamInfo(
                    'ingest_type', 'unknown',
                    valid_values=['unknown', 'named_folders', 'named_images']),
                ut.ParamInfo(
                    'species', '____',
                    hideif=lambda cfg: not cfg['images_as_annots'],
                    valid_values=valid_species,
                ),
                ut.ParamInfo(
                    'adjust_percent', 0.0,
                    hideif=lambda cfg: not cfg['images_as_annots']),
            ]

        updatekw = kwargs.copy()
        # ``ingest_config`` defaults to None instead of a shared mutable dict
        if ingest_config is not None:
            updatekw.update(ingest_config)
        self.ingest_config = IngestConfig(**updatekw)

    def execute(self, ibs=None):
        """
        Adds all configured images (and optionally annotations) to ``ibs``.

        Names are resolved per image source:
            * images found in ``imgdir_list`` are named relative to the
              directory they were found in
            * images extracted from ``zipfile_list`` are named relative to
              the directory their archive was unzipped into
            * explicit ``imgpath_list`` entries have no base directory, so
              with 'named_folders' the immediate parent folder is the name

        Args:
            ibs (ibeis.IBEISController): controller of the target database

        Returns:
            list: gid_list - rowids of the images that were added (entries
                that ``ibs.add_images`` reported as None are dropped)

        Raises:
            AssertionError: if ``ibs`` is None
            NotImplementedError: if the configured ingest_type is unknown
        """
        print('[ingest_rawdata] Ingestable' + str(self))
        if ibs is None:
            raise AssertionError('execute requires an ibeis controller (ibs)')

        unzipped_file_base_dir = join(ibs.get_dbdir(), 'unzipped_files')
        ignore_list = ['_hsdb', '.hs_internals', '_ibeis_cache', '_ibsdb']

        def extract_from_zipfiles(zipfile_list):
            """ unzips every archive and returns (gpath, unzip_dir) pairs """
            ut.ensuredir(unzipped_file_base_dir)
            unzipped_dirs = []
            for zipfile in zipfile_list:
                # NOTE(review): the output directory is derived from the
                # archive location relative to the unzip base directory
                # (kept from the original implementation) - confirm that
                # this is the intended layout.
                unziped_file_relpath = dirname(relpath(
                    realpath(zipfile), realpath(unzipped_file_base_dir)))
                unzipped_file_dir = join(unzipped_file_base_dir,
                                         unziped_file_relpath)
                ut.ensuredir(unzipped_file_dir)
                ut.unzip_file(zipfile, output_dir=unzipped_file_dir,
                              overwrite=False)
                if unzipped_file_dir not in unzipped_dirs:
                    unzipped_dirs.append(unzipped_file_dir)
            # Collect the images of every archive, not only of the last one
            return [
                (gpath, unzipped_dir)
                for unzipped_dir in unzipped_dirs
                for gpath in ut.list_images(unzipped_dir, fullpath=True,
                                            recursive=True)
            ]

        def list_images(img_dir):
            """ lists images that are not in an internal cache """
            return ut.list_images(img_dir, fullpath=True, recursive=True,
                                  ignore_list=ignore_list)

        # Gather every image together with the directory its name is
        # relative to. A base directory of None marks an explicit path.
        gpath_list = []
        basedir_list = []
        if self.imgpath_list is not None:
            for gpath in self.imgpath_list:
                gpath_list.append(gpath)
                basedir_list.append(None)
        if self.imgdir_list is not None:
            for img_dir in self.imgdir_list:
                for gpath in list_images(img_dir):
                    gpath_list.append(gpath)
                    basedir_list.append(img_dir)
        if self.zipfile_list is not None:
            for gpath, unzipped_dir in extract_from_zipfiles(self.zipfile_list):
                gpath_list.append(gpath)
                basedir_list.append(unzipped_dir)
        # FIXME ensure python3 works with this
        gpath_list = ut.ensure_unicode_strlist(gpath_list)

        # Parse structure for image names
        ingest_type = self.ingest_config.ingest_type
        if ingest_type == 'named_folders':
            name_list = []
            for gpath, base_dir in zip(gpath_list, basedir_list):
                if base_dir is None:
                    name = normalize_name(basename(dirname(gpath)))
                else:
                    name = get_name_texts_from_parent_folder(
                        [gpath], base_dir, None)[0]
                name_list.append(name)
        elif ingest_type == 'named_images':
            # Only the file names are parsed, so no base directory is needed.
            # An explicit format key is passed because None cannot be parsed.
            name_list = get_name_texts_from_gnames(gpath_list, None,
                                                   FMT_KEYS.name_fmt)
        elif ingest_type == 'unknown':
            name_list = [const.UNKNOWN for _ in range(len(gpath_list))]
        else:
            raise NotImplementedError('unknwon ingest_type=%r' % (ingest_type,))

        # Add Images
        gpath_list = [gpath.replace('\\', '/') for gpath in gpath_list]
        gid_list_ = ibs.add_images(gpath_list)
        # <DEBUG>
        unique_gids = list(set(gid_list_))
        print("[ingest] Length gid list: %d" % len(gid_list_))
        print("[ingest] Length unique gid list: %d" % len(unique_gids))
        assert len(gid_list_) == len(gpath_list)
        for gid in gid_list_:
            if gid is None:
                print('[ingest] big fat warning')
        # </DEBUG>
        # Drop failed images together with their names so that both lists
        # stay aligned.
        valid_pairs = [(gid, name) for gid, name in zip(gid_list_, name_list)
                       if gid is not None]
        gid_list = [gid for gid, _name in valid_pairs]
        valid_names = [name for _gid, name in valid_pairs]
        unique_gids, unique_names, unique_notes = resolve_name_conflicts(
            gid_list, valid_names)

        # Add ANNOTATIONs with names and notes
        if self.ingest_config.images_as_annots:
            aid_list = ibs.use_images_as_annotations(
                unique_gids,
                adjust_percent=self.ingest_config.adjust_percent)
            ibs.set_annot_names(aid_list, unique_names)
            ibs.set_annot_notes(aid_list, unique_notes)
            species_text = self.ingest_config.species
            if species_text is not None:
                ibs.set_annot_species(aid_list, [species_text] * len(aid_list))

        if self.postingest_func is not None:
            self.postingest_func(ibs)
        return gid_list
def ingest_rawdata(ibs, ingestable, localize=False):
    """
    Ingests rawdata into an ibeis database.

    Args:
        ibs (ibeis.IBEISController):  ibeis controller object
        ingestable (Ingestable): description of the raw data
        localize (bool): if True ``ibs.localize_images()`` is called after
            the annotations were added (default = False)

    Returns:
        list: aid_list - rowids of the annotations created from the ingested
            images (empty if ``ingestable.images_as_annots`` is False)

    Raises:
        NotImplementedError: if ``ingestable.ingest_type`` is not one of
            'named_folders', 'named_images' or 'unknown'

    Notes:
        if ingest_type == 'named_folders':
            Converts folder structure where folders = name, to ibsdb
        if ingest_type == 'named_images':
            Converts imgname structure where imgnames = name_id.ext, to ibsdb

    CommandLine:
        python ibeis/dbio/ingest_database.py --db seals_drop2
        python -m ibeis.dbio.ingest_database --exec-ingest_rawdata
        python -m ibeis.dbio.ingest_database --exec-ingest_rawdata --db snow-leopards --imgdir /raid/raw_rsync/snow-leopards

        python -m ibeis --tf ingest_rawdata --db wd_peter2 --imgdir /raid/raw_rsync/african-dogs --ingest-type=named_folders --species=wild_dog --fmtkey='African Wild Dog: {name}' --force-delete
        python -m ibeis --tf ingest_rawdata --db <newdbname> --imgdir <path-to-images> --ingest-type=named_folders --species=humpback

    Example:
        >>> # SCRIPT
        >>> # General ingest script
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> import ibeis
        >>> dbname = ut.get_argval('--db', str, None)  # 'snow-leopards')
        >>> force_delete = ut.get_argflag(('--force_delete', '--force-delete'))
        >>> img_dir = ut.get_argval('--imgdir', type_=str, default=None)
        >>> ingest_type = ut.get_argval('--ingest-type', type_=str, default='unknown')
        >>> fmtkey = ut.get_argval('--fmtkey', type_=str, default=None)
        >>> species = ut.get_argval('--species', type_=str, default=None)
        >>> assert img_dir is not None, 'specify img dir'
        >>> assert dbname is not None, 'specify dbname'
        >>> ingestable = Ingestable(
        >>>     dbname, img_dir=img_dir, ingest_type=ingest_type,
        >>>     fmtkey=fmtkey, species=species, images_as_annots=ingest_type != 'unknown',
        >>>     adjust_percent=0.00)
        >>> from ibeis.control import IBEISControl
        >>> dbdir = ibeis.sysres.db_to_dbdir(dbname, allow_newdir=True, use_sync=False)
        >>> ut.ensuredir(dbdir, verbose=True)
        >>> if force_delete:
        >>>     ibsfuncs.delete_ibeis_database(dbdir)
        >>> ibs = IBEISControl.request_IBEISController(dbdir)
        >>> localize = False
        >>> aid_list = ingest_rawdata(ibs, ingestable, localize)
        >>> result = ('aid_list = %s' % (str(aid_list),))
        >>> print(result)
    """
    print('[ingest_rawdata] Ingestable' + str(ingestable))

    if ingestable.zipfile is not None:
        zipfile_fpath = ut.truepath(join(ibeis.sysres.get_workdir(),
                                         ingestable.zipfile))
        ingestable.img_dir = ut.unarchive_file(zipfile_fpath)

    img_dir = realpath(ingestable.img_dir)
    ingest_type = ingestable.ingest_type
    fmtkey = ingestable.fmtkey
    adjust_percent = ingestable.adjust_percent
    species_text = ingestable.species
    postingest_func = ingestable.postingest_func
    print('[ingest] ingesting rawdata: img_dir=%r, injest_type=%r' % (img_dir, ingest_type))
    # Get images in the image directory

    unzipped_file_base_dir = join(ibs.get_dbdir(), 'unzipped_files')

    def extract_zipfile_images(ibs, ingestable):
        """ unzips every archive below img_dir and lists the unzipped images """
        zipfile_list = ut.glob(ingestable.img_dir, '*.zip', recursive=True)
        if len(zipfile_list) == 0:
            return []
        print('Found zipfile_list = %r' % (zipfile_list,))
        ut.ensuredir(unzipped_file_base_dir)
        for zipfile in zipfile_list:
            # Mirror the archive's location below img_dir in the unzip dir
            unziped_file_relpath = dirname(relpath(
                realpath(zipfile), realpath(ingestable.img_dir)))
            unzipped_file_dir = join(unzipped_file_base_dir,
                                     unziped_file_relpath)
            ut.ensuredir(unzipped_file_dir)
            ut.unzip_file(zipfile, output_dir=unzipped_file_dir,
                          overwrite=False)
        # List the whole unzip directory so the images of every archive are
        # found (not only the last one). Names are later computed relative
        # to this same base directory.
        return ut.list_images(unzipped_file_base_dir, fullpath=True,
                              recursive=True)

    def list_images(img_dir):
        """ lists images that are not in an internal cache """
        ignore_list = ['_hsdb', '.hs_internals', '_ibeis_cache', '_ibsdb']
        return ut.list_images(img_dir, fullpath=True, recursive=True,
                              ignore_list=ignore_list)

    # FIXME ensure python3 works with this
    gpath_list1 = ut.ensure_unicode_strlist(list_images(img_dir))
    gpath_list2 = ut.ensure_unicode_strlist(extract_zipfile_images(ibs, ingestable))
    gpath_list = gpath_list1 + gpath_list2

    # Parse structure for image names
    if ingest_type == 'named_folders':
        name_list1 = get_name_texts_from_parent_folder(gpath_list1, img_dir, fmtkey)
        name_list2 = get_name_texts_from_parent_folder(gpath_list2, unzipped_file_base_dir, fmtkey)
        name_list = name_list1 + name_list2
    elif ingest_type == 'named_images':
        # A missing format key cannot be parsed; fall back to the basic
        # "<name><id>.<ext>" format.
        gname_fmtkey = FMT_KEYS.name_fmt if fmtkey is None else fmtkey
        name_list = get_name_texts_from_gnames(gpath_list, img_dir, gname_fmtkey)
    elif ingest_type == 'unknown':
        name_list = [const.UNKNOWN for _ in range(len(gpath_list))]
    else:
        raise NotImplementedError('unknwon ingest_type=%r' % (ingest_type,))

    # Find names likely to be the same?
    RECTIFY_NAMES_HUERISTIC = True
    if RECTIFY_NAMES_HUERISTIC:
        names = sorted(list(set(name_list)))
        splitchars = [' ', '/']

        def multisplit(str_, splitchars):
            parts = [str_]
            for char in splitchars:
                parts = ut.flatten([part.split(char) for part in parts])
            return parts

        # Names sharing the text before the first space / slash are merged
        # into that shared prefix.
        groupids = [multisplit(n1, splitchars)[0] for n1 in names]
        grouped_names = ut.group_items(names, groupids)
        fixed_names = {
            newkey: key
            for key, val in grouped_names.items()
            if len(val) > 1
            for newkey in val
        }
        name_list = [fixed_names.get(name, name) for name in name_list]

    # Add Images
    gpath_list = [gpath.replace('\\', '/') for gpath in gpath_list]
    gid_list_ = ibs.add_images(gpath_list)
    # <DEBUG>
    unique_gids = list(set(gid_list_))
    print("[ingest] Length gid list: %d" % len(gid_list_))
    print("[ingest] Length unique gid list: %d" % len(unique_gids))
    assert len(gid_list_) == len(gpath_list)
    for gid in gid_list_:
        if gid is None:
            print('[ingest] big fat warning')
    # </DEBUG>
    # Drop failed images together with their names so both lists stay aligned
    valid_pairs = [(gid, name) for gid, name in zip(gid_list_, name_list)
                   if gid is not None]
    gid_list = [gid for gid, _name in valid_pairs]
    valid_names = [name for _gid, name in valid_pairs]
    unique_gids, unique_names, unique_notes = resolve_name_conflicts(
        gid_list, valid_names)

    # Add ANNOTATIONs with names and notes
    aid_list = []
    if ingestable.images_as_annots:
        aid_list = ibs.use_images_as_annotations(unique_gids,
                                                 adjust_percent=adjust_percent)
        ibs.set_annot_names(aid_list, unique_names)
        ibs.set_annot_notes(aid_list, unique_notes)
        if species_text is not None:
            ibs.set_annot_species(aid_list, [species_text] * len(aid_list))
    if localize:
        ibs.localize_images()

    TURTLE_HURISTIC = 'turtles' in img_dir
    if TURTLE_HURISTIC:
        # python -m ibeis --tf ingest_rawdata --db seaturtles --imgdir "~/turtles/Turtles from Jill" --ingest-type=named_folders --species=turtles
        turtle_aids = ibs.get_valid_aids()
        parent_gids = ibs.get_annot_gids(turtle_aids)
        annot_orig_uris = ibs.get_image_uris_original(parent_gids)

        def parse_turtle_uri(uri):
            """ guesses the view and the occurrence from an image uri """
            from os.path import splitext
            info = {'view': None}
            uril = uri.lower()
            stem = splitext(uril)[0]

            def findany(text, possible):
                return any([x in text for x in possible])

            # Later matches intentionally override earlier ones
            if findany(uril, ['right']) or stem.endswith('rs'):
                info['view'] = 'right'
            if findany(uril, ['left']) or stem.endswith('ls'):
                info['view'] = 'left'
            if findany(uril, ['carapace', 'whole', 'carpace']) or stem.endswith('wb'):
                info['view'] = 'up'
            occurrence_id = basename(dirname(uri))
            info['occurrence'] = 'occurrence' + occurrence_id
            return info

        turtle_info_list = [parse_turtle_uri(uri) for uri in annot_orig_uris]
        occur_text_list = [info['occurrence'] for info in turtle_info_list]
        # The view tag is only added when a view could be recognized; a uri
        # without view information no longer raises a KeyError.
        turtle_tag_list = [
            [info['occurrence']] + ([] if info['view'] is None else [info['view']])
            for info in turtle_info_list
        ]
        # TODO: mark viewpoints using euler angles / quaternions
        ibs.set_image_imagesettext(parent_gids, occur_text_list)
        ibs.append_annot_case_tags(turtle_aids, turtle_tag_list)

    if postingest_func is not None:
        postingest_func(ibs)
    return aid_list
def normalize_name(name):
    """
    Replaces any accepted spelling of an unknown name by the standard key.

    Args:
        name (str): name text

    Returns:
        str: ``const.INDIVIDUAL_KEY`` if ``name`` is listed in
            ``const.ACCEPTED_UNKNOWN_NAMES``, otherwise ``name`` unchanged
    """
    # NOTE(review): the previous docstring said unknown names are mapped to
    # the standard '____' - confirm that const.INDIVIDUAL_KEY has that value.
    is_unknown = name in const.ACCEPTED_UNKNOWN_NAMES
    return const.INDIVIDUAL_KEY if is_unknown else name
def get_name_texts_from_parent_folder(gpath_list, img_dir, fmtkey=None):
    """
    Derives one name per image from the folder the image is stored in.

    Args:
        gpath_list (list): list of image paths
        img_dir (str): directory the folder names are relative to
        fmtkey (str): optional ``parse`` pattern with a ``{name}`` field,
            e.g. 'African Wild Dog: {name}'. Folder names that do not match
            the pattern are used as they are. (default = None)

    Returns:
        list: name_list - normalized folder name of each image
    """
    folder_names = [dirname(relpath(gpath, img_dir)) for gpath in gpath_list]
    if fmtkey is not None:
        import parse
        extracted_names = []
        for folder_name in folder_names:
            parsed = parse.parse(fmtkey, folder_name)
            # keep the raw folder name when the pattern does not match
            extracted_names.append(
                folder_name if parsed is None else parsed['name'])
        folder_names = extracted_names
    return [normalize_name(folder_name) for folder_name in folder_names]
class FMT_KEYS(object):
    """
    Keys naming the known image file name formats.

    ``get_name_texts_from_gnames`` uses these strings as keys of its
    ``INGEST_FORMATS`` table to pick the regexes that split a file name
    into its name / id / extension fields.
    """
    # alphabetic name, optional digits as id
    name_fmt = '{name:*}[id:d].{ext}'
    # letters and two digits as name, two more digits as id
    snails_fmt = '{name:*dd}{id:dd}.{ext}'
    # 'G' and digits as name, underscore, digits as id
    giraffe1_fmt = '{name:*}_{id:d}.{ext}'
    # 'Phs' and digits as name, capital letters as id
    seal2_fmt = '{name:Phsd*}{id:[A-Z]}.{ext}'
    # optional 'e_' prefix, alphanumeric name, a view and an optional id
    elephant_fmt = '{prefix?}{name}_{view}_{id?}.{ext}'
def get_name_texts_from_gnames(gpath_list, img_dir, fmtkey='{name:*}[aid:d].{ext}'):
    """
    Parses the name of each image out of its file name.

    Args:
        gpath_list (list): list of image paths
        img_dir (str): path to image directory (not used by the parsing)
        fmtkey (str): either one of the ``FMT_KEYS`` values or a custom
            regex with a named ``name`` group. None and the legacy default
            '{name:*}[aid:d].{ext}' both select ``FMT_KEYS.name_fmt``.

    Returns:
        list: name_list - normalized name parsed from each image file name

    Raises:
        ValueError: if any file name does not match the format

    CommandLine:
        python -m ibeis.dbio.ingest_database --test-get_name_texts_from_gnames

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> gpath_list = ['e_f0273_f.jpg', 'f0001_f.jpg', 'f0259_l_3.jpg', 'f0259_f_1.jpg',  'f0259_f (1).jpg', 'f0058_u16_f.jpg']
        >>> img_dir = ''
        >>> fmtkey = FMT_KEYS.elephant_fmt
        >>> result = get_name_texts_from_gnames(gpath_list, img_dir, fmtkey)
        >>> print(result)
    """
    # These define regexes that attempt to parse the insane and contradicting
    # naming schemes of the image sets that we get.
    INGEST_FORMATS = {
        FMT_KEYS.name_fmt: ut.named_field_regex([
            ('name', r'[a-zA-Z]+'),  # all alpha characters
            ('id',   r'\d*'),        # first numbers (if existant)
            ( None,  r'\.'),
            ('ext',  r'\w+'),
        ]),

        FMT_KEYS.snails_fmt: ut.named_field_regex([
            ('name', r'[a-zA-Z]+\d\d'),  # species and 2 numbers
            ('id',   r'\d\d'),           # 2 more numbers
            ( None,  r'\.'),
            ('ext',  r'\w+'),
        ]),

        FMT_KEYS.giraffe1_fmt: ut.named_field_regex([
            ('name',  r'G\d+'),
            ('under', r'_'),
            ('id',    r'\d+'),
            ( None,   r'\.'),
            ('ext',   r'\w+'),
        ]),

        FMT_KEYS.seal2_fmt: ut.named_field_regex([
            ('name', r'Phs\d+'),  # Phs and then numbers
            ('id',   r'[A-Z]+'),  # 1 or more letters
            ( None,  r'\.'),
            ('ext',  r'\w+'),
        ]),

        # this one defines multiple possible regex types. yay standards
        FMT_KEYS.elephant_fmt: [
            ut.named_field_regex([
                ('prefix', r'(e_)?'),
                ('name',   r'[a-zA-Z0-9]+'),
                ('view',   r'_[rflo]'),
                ('id',     r'([ _][^.]+)?'),
                ( None,    r'\.'),
                ('ext',    r'\w+'),
            ]),
            ut.named_field_regex([
                ('prefix', r'(e_)?'),
                ('name',   r'[a-zA-Z0-9]+'),
                ('id',     r'([ _][^.]+)?'),
                ('view',   r'_[rflo]'),
                ( None,    r'\.'),
                ('ext',    r'\w+'),
            ])],
    }
    # The signature default is spelled with "aid" and therefore never matched
    # a key of the table (it was silently used as a raw regex). It is treated
    # as an alias of the basic name format, as is a missing format key.
    LEGACY_DEFAULT_FMTKEY = '{name:*}[aid:d].{ext}'
    if fmtkey is None or fmtkey == LEGACY_DEFAULT_FMTKEY:
        fmtkey = FMT_KEYS.name_fmt
    # Unknown keys are interpreted as a custom regex
    regex_list = INGEST_FORMATS.get(fmtkey, fmtkey)
    if not isinstance(regex_list, list):
        regex_list = [regex_list]
    gname_list = ut.fpaths_to_fnames(gpath_list)

    def parse_format(regex_list, gname):
        """ returns the parse of the first regex matching gname (or None) """
        for regex in regex_list:
            result = ut.regex_parse(regex, gname)
            if result is not None:
                return result
        return None

    parsed_list = [parse_format(regex_list, gname) for gname in gname_list]

    # Report every file that failed before giving up
    failed_gpaths = [gpath for gpath, parsed in zip(gpath_list, parsed_list)
                     if parsed is None]
    for gpath in failed_gpaths:
        print('FAILED TO PARSE: %r' % gpath)
    if failed_gpaths:
        msg = ('FAILED REGEX: %r' % regex_list)
        raise ValueError(msg)

    return [normalize_name(parsed['name']) for parsed in parsed_list]
def resolve_name_conflicts(gid_list, name_list):
    """
    Chooses a single name for every image that was given several names.

    The first name seen for an image wins. If the image has more than one
    distinct name, '____' is not considered, and the remaining names are
    recorded in a note of the form ``aliases([...])``.

    Args:
        gid_list (list): image rowids (may contain duplicates)
        name_list (list): name of each entry in ``gid_list``

    Returns:
        tuple: (unique_gids, unique_names, unique_notes) - the distinct gids
            in order of first appearance with their chosen name and note
    """
    # Conflict map; presumably gid -> list of every name given to that gid
    gid_to_names = ut.build_conflict_dict(gid_list, name_list)
    unique_gids = ut.unique_ordered(gid_list)

    unique_names, unique_notes = [], []
    for gid in unique_gids:
        candidates = ut.unique_ordered(gid_to_names[gid])
        # the unknown name only loses against a real name
        if len(candidates) > 1 and '____' in candidates:
            candidates.remove('____')
        note = ''
        if len(candidates) > 1:
            note = 'aliases([' + ', '.join(map(repr, candidates[1:])) + '])'
        unique_names.append(candidates[0])
        unique_notes.append(note)

    return unique_gids, unique_names, unique_notes


#
#
### <STANDARD DATABASES> ###
# Registry mapping a database name to the function building its Ingestable
STANDARD_INGEST_FUNCS = {}


def __standard(dbname):
    """
    Decorates a function as a standard ingestable database.

    Args:
        dbname (str): key the function is registered under in
            ``STANDARD_INGEST_FUNCS``

    Returns:
        callable: decorator that registers the function and returns it
            unchanged
    """
    def __registerdb(func):
        STANDARD_INGEST_FUNCS[dbname] = func
        return func
    return __registerdb


@__standard('testdb1')
def ingest_testdb1(dbname):
    """
    Builds the Ingestable of the standard test database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: ingests the vtool test images as 'named_images' and
            adjusts the resulting database with a post-ingest function

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> import utool as ut
        >>> from vtool.tests import grabdata
        >>> import ibeis
        >>> grabdata.ensure_testdata()
        >>> # DELETE TESTDB1
        >>> TESTDB1 = ut.unixjoin(ibeis.sysres.get_workdir(), 'testdb1')
        >>> ut.delete(TESTDB1, ignore_errors=False)
        >>> result = ingest_testdb1('testdb1')
    """
    from vtool.tests import grabdata   # TODO: remove and use utool appdir

    def postingest_tesdb1_func(ibs):
        """ tweaks times, names, species, gps and tags of the test data """
        import numpy as np
        from ibeis import constants as const
        print('postingest_tesdb1_func')
        # Adjust data as we see fit
        gid_list = np.array(ibs.get_valid_gids())
        # Set image unixtimes
        # NOTE(review): the times derived from the even and odd positions
        # are concatenated, not interleaved, before being assigned to
        # gid_list. Kept as is because the test data depends on it.
        unixtimes_even = (gid_list[0::2] + 100).tolist()
        unixtimes_odd = (gid_list[1::2] + 9001).tolist()
        unixtime_list = unixtimes_even + unixtimes_odd
        ibs.set_image_unixtime(gid_list, unixtime_list)
        # Unname first aid in every name
        aid_list = ibs.get_valid_aids()
        nid_list = ibs.get_annot_name_rowids(aid_list)
        nid_list = [(nid if nid > 0 else None) for nid in nid_list]
        unique_flag = ut.flag_unique_items(nid_list)
        unique_nids = ut.compress(nid_list, unique_flag)
        none_nids = [nid is not None for nid in nid_list]
        flagged_nids = [nid for nid in unique_nids if nid_list.count(nid) > 1]
        plural_flag = [nid in flagged_nids for nid in nid_list]
        flag_list = list(map(all, zip(plural_flag, unique_flag, none_nids)))
        flagged_aids = ut.compress(aid_list, flag_list)
        if ut.VERYVERBOSE:
            def print2(*args):
                print('[post_testdb1] ' + ', '.join(args))
            print2('aid_list=%r' % aid_list)
            print2('nid_list=%r' % nid_list)
            print2('unique_flag=%r' % unique_flag)
            print2('plural_flag=%r' % plural_flag)
            print2('unique_nids=%r' % unique_nids)
            print2('none_nids=%r' % none_nids)
            print2('flag_list=%r' % flag_list)
            print2('flagged_nids=%r' % flagged_nids)
            print2('flagged_aids=%r' % flagged_aids)
        # Unname, some annotations for testing
        ibs.delete_annot_nids(flagged_aids)
        # Add all annotations with names as exemplars
        unflagged_aids = ut.get_dirty_items(aid_list, flag_list)
        exemplar_flags = [True] * len(unflagged_aids)
        ibs.set_annot_exemplar_flags(unflagged_aids, exemplar_flags)
        # Set some test species labels
        species_text_list = ibs.get_annot_species_texts(aid_list)
        for ix in range(0, 6):
            species_text_list[ix] = const.TEST_SPECIES.ZEB_PLAIN
        # These are actually plains zebras.
        for ix in range(8, 10):
            species_text_list[ix] = const.TEST_SPECIES.ZEB_GREVY
        for ix in range(10, 12):
            species_text_list[ix] = const.TEST_SPECIES.BEAR_POLAR

        ibs.set_annot_species(aid_list, species_text_list)
        ibs.set_annot_notes(aid_list[8:10], ['this is actually a plains zebra'] * 2)
        ibs.set_annot_notes(aid_list[0:1], ['aid 1 and 2 are correct matches'])
        ibs.set_annot_notes(aid_list[6:7], ['very simple image to debug feature detector'])
        ibs.set_annot_notes(aid_list[7:8], ['standard test image'])

        # Set some randomish gps flags that are within nnp
        unixtime_list = ibs.get_image_unixtime(gid_list)
        valid_lat_min = -1.4446
        valid_lat_max = -1.3271
        valid_lon_min = 36.7619
        valid_lon_max = 36.9622
        valid_lat_range = valid_lat_max - valid_lat_min
        valid_lon_range = valid_lon_max - valid_lon_min
        # seeded with the image times so the coordinates are reproducible
        randstate = np.random.RandomState(unixtime_list)
        new_gps_list = randstate.rand(len(gid_list), 2)
        new_gps_list[:, 0] = (new_gps_list[:, 0] * valid_lat_range) + valid_lat_min
        new_gps_list[:, 1] = (new_gps_list[:, 1] * valid_lon_range) + valid_lon_min
        new_gps_list[8, :] = [-1, -1]
        ibs.set_image_gps(gid_list, new_gps_list)

        # TODO: add a nan timestamp
        ibs.append_annot_case_tags([2], ['error:bbox'])
        ibs.append_annot_case_tags([4], ['quality:washedout'])
        ibs.append_annot_case_tags([4], ['lighting'])

        # Mark the first two annotations of every name as a matching pair
        aidgroups = ibs.group_annots_by_name(
            ibs.filter_annots_general(min_pername=2, verbose=True))[0]
        aid1_list = ut.take_column(aidgroups, 0)
        aid2_list = ut.take_column(aidgroups, 1)
        annotmatch_rowids = ibs.add_annotmatch(aid1_list, aid2_list)

        ibs.set_annotmatch_truth(annotmatch_rowids, [True] * len(annotmatch_rowids))
        ibs.set_annotmatch_prop('photobomb', annotmatch_rowids, [True] * len(annotmatch_rowids))
        return None

    return Ingestable(dbname, ingest_type='named_images',
                      fmtkey=FMT_KEYS.name_fmt,
                      img_dir=grabdata.get_testdata_dir(),
                      adjust_percent=0.00,
                      postingest_func=postingest_tesdb1_func)
@__standard('humpbacks')
def ingest_humpbacks(dbname):
    """
    Builds the Ingestable of the humpbacks database.

    The original humpbacks data is ROI cropped images in the named folder
    format. A source comment notes that the zipfile is only on Zach's
    machine.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'named_folders' ingest for species 'whale_humpback'
    """
    ingest_kw = dict(
        ingest_type='named_folders',
        adjust_percent=0.00,
        species='whale_humpback',
        fmtkey='name',
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('polar_bears')
def ingest_polar_bears(dbname):
    """
    Builds the Ingestable of the polar_bears database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'named_folders' ingest without a species
    """
    ingest_kw = dict(
        ingest_type='named_folders',
        adjust_percent=0.00,
        fmtkey='name',
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('wd_peter_blinston')
def ingest_wilddog_peter(dbname):
    """
    Builds the Ingestable of the wd_peter_blinston database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'unknown' (unnamed) ingest of the african-dogs images

    CommandLine:
        python -m ibeis.dbio.ingest_database --exec-injest_main --db wd_peter_blinston
    """
    ingest_kw = dict(
        ingest_type='unknown',
        img_dir='/raid/raw_rsync/african-dogs',
        adjust_percent=0.01,
        species=const.Species.WILDDOG,
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('lynx')
def ingest_lynx(dbname):
    """
    Builds the Ingestable of the lynx database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'named_folders' ingest for species 'lynx'

    CommandLine:
        python -m ibeis.dbio.ingest_database --exec-injest_main --db lynx
    """
    ingest_kw = dict(
        ingest_type='named_folders',
        img_dir='/raid/raw_rsync/iberian-lynx/CARPETAS CATALOGO INDIVIDUOS/',
        adjust_percent=0.01,
        species='lynx',
        fmtkey='name',
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('WS_ALL')
def ingest_whale_sharks(dbname):
    """
    Builds the Ingestable of the WS_ALL database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'named_folders' ingest for species 'whale_shark'

    CommandLine:
        python -m ibeis.dbio.ingest_database --exec-injest_main --db WS_ALL
    """
    ingest_kw = dict(
        ingest_type='named_folders',
        img_dir='named-left-sharkimages',
        adjust_percent=0.01,
        species='whale_shark',
        fmtkey='name',
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('snails_drop1')
def ingest_snails_drop1(dbname):
    """
    Builds the Ingestable of the snails_drop1 database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'named_images' ingest using the snails file name format
    """
    ingest_kw = dict(
        ingest_type='named_images',
        fmtkey=FMT_KEYS.snails_fmt,
        species='snail',
        adjust_percent=.20,
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('seals_drop2')
def ingest_seals_drop2(dbname):
    """
    Builds the Ingestable of the seals_drop2 database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'named_images' ingest of a zipfile using the seal2 file
            name format
    """
    ingest_kw = dict(
        zipfile='../raw/hiby_Phs_photos.zip',
        ingest_type='named_images',
        fmtkey=FMT_KEYS.seal2_fmt,
        adjust_percent=.20,
        species='seal_saimma_ringed',
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('JAG_Kieryn')
def ingest_JAG_Kieryn(dbname):
    """
    Builds the Ingestable of the JAG_Kieryn database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'unknown' (unnamed) ingest for species 'jaguar'
    """
    ingest_kw = dict(
        ingest_type='unknown',
        species='jaguar',
        adjust_percent=0.00,
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('Giraffes')
def ingest_Giraffes1(dbname):
    """
    Builds the Ingestable of the Giraffes database.

    Args:
        dbname (str): database name

    Returns:
        Ingestable: 'named_images' ingest using the giraffe1 file name format
    """
    ingest_kw = dict(
        ingest_type='named_images',
        fmtkey=FMT_KEYS.giraffe1_fmt,
        species='giraffe_reticulated',
        adjust_percent=0.00,
    )
    return Ingestable(dbname, **ingest_kw)
@__standard('Elephants_drop1')
def ingest_Elephants_drop1(dbname):
    """
    Build the Ingestable describing the Elephants_drop1 database.

    Args:
        dbname (str): name of the database to ingest

    Returns:
        Ingestable: named-images ingest description that also names a
            zipfile of raw photos
    """
    raw_zipfile = '../raw_unprocessed/ID photo front_Elephants_4-29-2015-PeterGranli.zip'  # NOQA
    ingest_kw = {
        'zipfile': raw_zipfile,
        'ingest_type': 'named_images',
        'fmtkey': FMT_KEYS.elephant_fmt,
        'species': 'elephant_savanna',
        'adjust_percent': 0.00,
    }
    return Ingestable(dbname, **ingest_kw)
def get_standard_ingestable(dbname):
    """
    Look up and build the Ingestable registered for a standard database.

    Args:
        dbname (str): key into STANDARD_INGEST_FUNCS

    Returns:
        the object returned by the registered ingest function when it is
        called with ``dbname``

    Raises:
        AssertionError: if ``dbname`` is not in STANDARD_INGEST_FUNCS
    """
    # Guard clause: reject names without a registered ingest function
    if dbname not in STANDARD_INGEST_FUNCS:
        raise AssertionError('Unknown dbname=%r' % (dbname,))
    ingest_func = STANDARD_INGEST_FUNCS[dbname]
    return ingest_func(dbname)
def ingest_standard_database(dbname, force_delete=False):
    """
    Ingest one of the known standard databases by name.

    Builds the Ingestable registered for ``dbname``, ensures the target
    database directory exists (optionally deleting the ibeis database that
    is already there), requests a controller on it and ingests the raw data.

    Args:
        dbname (str): database name
        force_delete (bool): if True, delete the existing ibeis database in
            the target directory before ingesting

    Returns:
        the controller returned by IBEISControl.request_IBEISController for
        the ingested database (callers and the example below bind this
        value, so it is returned instead of None)

    Raises:
        AssertionError: if ``dbname`` is not a known standard database

    Example:
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> dbname = 'testdb1'
        >>> force_delete = False
        >>> result = ingest_standard_database(dbname, force_delete)
        >>> print(result)
    """
    from ibeis.control import IBEISControl
    print('[ingest] Ingest Standard Database: dbname=%r' % (dbname,))
    ingestable = get_standard_ingestable(dbname)
    dbdir = ibeis.sysres.db_to_dbdir(ingestable.dbname, allow_newdir=True,
                                     use_sync=False)
    ut.ensuredir(dbdir, verbose=True)
    if force_delete:
        ibsfuncs.delete_ibeis_database(dbdir)
    ibs = IBEISControl.request_IBEISController(dbdir)
    ingest_rawdata(ibs, ingestable)
    return ibs
### </STANDARD DATABASES> ###
#
#
def ingest_oxford_style_db(dbdir, dryrun=False):
    """
    Ingest either oxford or paris

    Parses the Oxford-style groundtruth files found under ``dbdir`` and
    sorts the annotations they describe into queries, database annotations
    and distractors (images that no groundtruth file mentions). Unless
    ``dryrun`` is set, the images and annotations are then added to an
    IBEIS database opened at ``dbdir``.

    Args:
        dbdir (str): directory holding the image and groundtruth subfolders
        dryrun (bool): if True, only parse the groundtruth and print
            statistics; nothing is added to a database

    CommandLine:
        python -m ibeis.dbio.ingest_database --exec-ingest_oxford_style_db --show

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> dbdir = '/raid/work/Oxford'
        >>> dryrun = True
        >>> ingest_oxford_style_db(dbdir, dryrun)
        >>> ut.quit_if_noshow()
        >>> import plottool as pt
        >>> ut.show_if_requested()

    Ignore:
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> import ibeis
        >>> dbdir = '/raid/work/Oxford'
        >>> dbdir = '/raid/work/Paris'
        >>> #
        >>> ibeis.dbio.convert_db.ingest_oxford_style_db(dbdir)
    """
    print('Loading Oxford Style Images from: ' + dbdir)

    def _parse_oxsty_gtfname(gt_fname):
        """ parse gtfname for: (name, num, quality) """
        # num is an id, not a number of annots
        gt_format = '{}_{:d}_{:D}.txt'
        name, num, quality = parse.parse(gt_format, gt_fname)
        return (name, num, quality)

    def _read_oxsty_gtfile(gt_fpath, name, quality, img_dpath, ignore_list):
        """ read the (gname, bbox) pairs listed in one groundtruth file """
        oxsty_annot_info_list = []
        # read the individual ground truth file
        with open(gt_fpath, 'r') as file_:
            line_list = file_.read().splitlines()
        for line in line_list:
            if line == '':
                continue
            fields = line.split(' ')
            gname = fields[0].replace('oxc1_', '') + '.jpg'
            # >:( Because PARIS just cant keep paths consistent
            if gname.find('paris_') >= 0:
                paris_hack = gname[6:gname.rfind('_')]
                gname = join(paris_hack, gname)
            if gname in ignore_list:
                continue
            if len(fields) > 1:  # if has bbox
                bbox = [int(round(float(x))) for x in fields[1:]]
            else:
                # Get annotation width / height
                gpath = join(img_dpath, gname)
                # Only height and width are needed; slicing also avoids an
                # unpacking error if the shape has no channel entry.
                h, w = vt.imread(gpath, orient='auto').shape[0:2]
                bbox = [0, 0, w, h]
            oxsty_annot_info = (gname, bbox)
            oxsty_annot_info_list.append(oxsty_annot_info)
        return oxsty_annot_info_list

    gt_dpath = ut.existing_subpath(dbdir, ['oxford_style_gt',
                                           'gt_files_170407',
                                           'oxford_groundtruth'])
    img_dpath = ut.existing_subpath(dbdir, ['oxbuild_images', 'images'])
    corrupted_file_fpath = join(gt_dpath, 'corrupted_files.txt')
    ignore_list = []
    # Check for corrupted files (Looking at your Paris Buildings Dataset)
    if ut.checkpath(corrupted_file_fpath):
        ignore_list = ut.read_from(corrupted_file_fpath).splitlines()
    gname_list = ut.list_images(img_dpath, ignore_list=ignore_list,
                                recursive=True, full=False)
    # just in case utool broke
    for ignore in ignore_list:
        assert ignore not in gname_list
    # Read the Oxford Style Groundtruth files
    print('Loading Oxford Style Names and Annots')
    gt_fname_list = os.listdir(gt_dpath)
    num_gt_files = len(gt_fname_list)
    query_annots = []
    gname2_annots_raw = ut.ddict(list)
    name_set = set([])
    print(' * num_gt_files = %d ' % num_gt_files)
    #
    # Iterate over each groundtruth file
    for gt_fname in ut.ProgIter(gt_fname_list, 'parsed oxsty gtfile: '):
        if gt_fname == 'corrupted_files.txt':
            continue
        #Get name, quality, and num from fname
        (name, num, quality) = _parse_oxsty_gtfname(gt_fname)
        gt_fpath = join(gt_dpath, gt_fname)
        name_set.add(name)
        oxsty_annot_info_sublist = _read_oxsty_gtfile(
            gt_fpath, name, quality, img_dpath, ignore_list)
        if quality == 'query':
            for (gname, bbox) in oxsty_annot_info_sublist:
                query_annots.append((gname, bbox, name, num))
        else:
            for (gname, bbox) in oxsty_annot_info_sublist:
                gname2_annots_raw[gname].append((name, bbox, quality))
    print(' * num_query images = %d ' % len(query_annots))
    #
    # Remove duplicates img.jpg : (*1.txt, *2.txt, ...) -> (*.txt)
    gname2_annots = ut.ddict(list)
    multinamed_gname_list = []
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only)
    for gname, val in gname2_annots_raw.items():
        val_repr = list(map(repr, val))
        unique_reprs = set(val_repr)
        unique_indexes = [val_repr.index(urep) for urep in unique_reprs]
        for ux in unique_indexes:
            gname2_annots[gname].append(val[ux])
        if len(gname2_annots[gname]) > 1:
            multinamed_gname_list.append(gname)
    # print some statistics
    query_gname_list = [tup[0] for tup in query_annots]
    gname_with_groundtruth_list = list(gname2_annots.keys())
    gname_with_groundtruth_set = set(gname_with_groundtruth_list)
    gname_set = set(gname_list)
    query_gname_set = set(query_gname_list)
    gname_without_groundtruth_list = list(gname_set - gname_with_groundtruth_set)
    print(' * num_images = %d ' % len(gname_list))
    print(' * images with groundtruth = %d ' % len(gname_with_groundtruth_list))
    print(' * images without groundtruth = %d ' % len(gname_without_groundtruth_list))
    print(' * images with multi-groundtruth = %d ' % len(multinamed_gname_list))
    #make sure all queries have ground truth and there are no duplicate queries
    # NOTE(review): the groundtruth-coverage check below is kept disabled
    # assert len(query_gname_list) == len(query_gname_set.intersection(gname_with_groundtruth_list))
    assert len(query_gname_list) == len(set(query_gname_list))
    #=======================================================
    # Build IBEIS database

    if not dryrun:
        ibs = ibeis.opendb(dbdir, allow_newdir=True)
        ibs.cfg.other_cfg.auto_localize = False
        print('adding to table: ')
        # Add images to ibeis
        gpath_list = [join(img_dpath, gname).replace('\\', '/')
                      for gname in gname_list]
        gid_list = ibs.add_images(gpath_list)

        # Map each image name to its image id once, instead of scanning
        # gname_list with .index() for every annotation. setdefault keeps
        # the first occurrence of a name, which is what .index() returned.
        gname_to_gid = {}
        for gname, gid in zip(gname_list, gid_list):
            gname_to_gid.setdefault(gname, gid)

        # 1) Add Query Annotations
        qgname_list, qbbox_list, qname_list, qid_list = zip(*query_annots)
        # get image ids of queries
        qgid_list = [gname_to_gid[gname] for gname in qgname_list]
        qnote_list = ['query'] * len(qgid_list)

        # 2) Add nonquery database annots
        dgid_list = []
        dname_list = []
        dbbox_list = []
        dnote_list = []
        for gname, annots in gname2_annots.items():
            gid = gname_to_gid[gname]
            for name, bbox, quality in annots:
                dgid_list.append(gid)
                dbbox_list.append(bbox)
                dname_list.append(name)
                dnote_list.append(quality)

        # 3) Add distractors: TODO: 100k
        ugid_list = [gname_to_gid[gname]
                     for gname in gname_without_groundtruth_list]
        ubbox_list = [[0, 0, w, h] for (w, h) in ibs.get_image_sizes(ugid_list)]
        unote_list = ['distractor'] * len(ugid_list)

        # TODO Annotation consistency in terms of duplicate bounding boxes
        qaid_list = ibs.add_annots(qgid_list, bbox_list=qbbox_list,
                                   name_list=qname_list, notes_list=qnote_list)
        daid_list = ibs.add_annots(dgid_list, bbox_list=dbbox_list,
                                   name_list=dname_list, notes_list=dnote_list)
        uaid_list = ibs.add_annots(ugid_list, bbox_list=ubbox_list,
                                   notes_list=unote_list)
        print('Added %d query annototations' % len(qaid_list))
        print('Added %d database annototations' % len(daid_list))
        print('Added %d distractor annototations' % len(uaid_list))

    # Disabled post-processing step, kept for reference
    update = False
    if update:
        # TODO: integrate this into normal ingest pipeline
        'Oxford'
        ibs = ibeis.opendb(dbdir)
        aid_list = ibs.get_valid_aids()
        notes_list = ibs.get_annot_notes(aid_list)
        _dict = {
            'ok': ibs.const.QUAL_OK,
            'good': ibs.const.QUAL_GOOD,
            'junk': ibs.const.QUAL_JUNK,
            #'distractor': ibs.const.QUAL_JUNK
        }
        qual_text_list = [_dict.get(note, ibs.const.QUAL_UNKNOWN)
                          for note in notes_list]
        ibs.set_annot_quality_texts(aid_list, qual_text_list)
        ibs._overwrite_all_annot_species_to('building')

        tags_list = [[note] if note in ['query', 'distractor'] else []
                     for note in notes_list]
        from ibeis import tag_funcs
        tag_funcs.append_annot_case_tags(ibs, aid_list, tags_list)
        #ibs._set
        # tags_ = ibs.get_annot_case_tags(aid_list)
        # pass
        """
        python -m ibeis --tf filter_annots_general --db Oxford --has_any=[query]
        """
def ingest_serengeti_mamal_cameratrap(species):
    """
    Downloads data from Serengeti dryad server

    Fetches the Snapshot Serengeti metadata csv files, selects the capture
    events whose consensus species matches ``species``, downloads the images
    of those events that are not already on disk, and adds them to an IBEIS
    database named ``<code>_Serengeti`` in the ibeis work directory.

    References:
        http://datadryad.org/resource/doi:10.5061/dryad.5pt92

        Swanson AB, Kosmala M, Lintott CJ, Simpson RJ, Smith A, Packer C (2015)
        Snapshot Serengeti, high-frequency annotated camera trap images of 40
        mammalian species in an African savanna. Scientific Data 2: 150026.
        http://dx.doi.org/10.1038/sdata.2015.26

        Swanson AB, Kosmala M, Lintott CJ, Simpson RJ, Smith A, Packer C (2015)
        Data from: Snapshot Serengeti, high-frequency annotated camera trap
        images of 40 mammalian species in an African savanna. Dryad Digital
        Repository. http://dx.doi.org/10.5061/dryad.5pt92

    Args:
        species (str or None): 'zebra_plains', 'cheetah', or None to select
            the events of every species (database code 'ALL')

    Returns:
        the controller returned by ibeis.opendb for the new database

    Raises:
        NotImplementedError: if ``species`` is any other value

    CommandLine:
        python -m ibeis.dbio.ingest_database --test-ingest_serengeti_mamal_cameratrap --species zebra_plains
        python -m ibeis.dbio.ingest_database --test-ingest_serengeti_mamal_cameratrap --species cheetah

    Example:
        >>> # SCRIPT
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> import ibeis
        >>> species = ut.get_argval('--species', type_=str, default=ibeis.const.TEST_SPECIES.ZEB_PLAIN)
        >>> # species = ut.get_argval('--species', type_=str, default='cheetah')
        >>> result = ingest_serengeti_mamal_cameratrap(species)
        >>> print(result)
    """
    import ibeis
    if species is None:
        code = 'ALL'
    elif species == 'zebra_plains':
        code = 'PZ'
    elif species == 'cheetah':
        code = 'CHTH'
    else:
        raise NotImplementedError('unsupported species=%r' % (species,))

    # The consensus data labels plains zebras simply as 'zebra'
    if species == 'zebra_plains':
        serengeti_species = 'zebra'
    else:
        serengeti_species = species

    print('species = %r' % (species,))
    print('serengeti_sepcies = %r' % (serengeti_species,))

    dbname = code + '_Serengeti'
    print('dbname = %r' % (dbname,))
    dbdir = ut.ensuredir(join(ibeis.sysres.get_workdir(), dbname))
    print('dbdir = %r' % (dbdir,))
    image_dir = ut.ensuredir(join(dbdir, 'images'))

    base_url = 'http://datadryad.org/bitstream/handle/10255'
    all_images_url         = base_url + '/dryad.86392/all_images.csv'
    consensus_metadata_url = base_url + '/dryad.86348/consensus_data.csv'
    search_effort_url      = base_url + '/dryad.86347/search_effort.csv'
    gold_standard_url      = base_url + '/dryad.76010/gold_standard_data.csv'

    all_images_fpath         = ut.grab_file_url(all_images_url, download_dir=dbdir)
    consensus_metadata_fpath = ut.grab_file_url(consensus_metadata_url, download_dir=dbdir)
    search_effort_fpath      = ut.grab_file_url(search_effort_url, download_dir=dbdir)
    gold_standard_fpath      = ut.grab_file_url(gold_standard_url, download_dir=dbdir)

    print('all_images_fpath         = %r' % (all_images_fpath,))
    print('consensus_metadata_fpath = %r' % (consensus_metadata_fpath,))
    print('search_effort_fpath      = %r' % (search_effort_fpath,))
    print('gold_standard_fpath      = %r' % (gold_standard_fpath,))

    def read_csv(csv_fpath):
        """ returns (rows, header); every field is a stripped string """
        # NOTE(review): fields are split on every ',' so a quoted field
        # that itself contains a comma would be split in two.
        csv_text = ut.read_from(csv_fpath)
        csv_lines = csv_text.split('\n')
        print(ut.list_str(csv_lines[0:2]))
        csv_data = [[field.strip('"').strip('\r') for field in line.split(',')]
                    for line in csv_lines if len(line) > 0]
        csv_header = csv_data[0]
        csv_data = csv_data[1:]
        return csv_data, csv_header

    def download_image_urls(image_url_info_list):
        """ downloads missing images; returns the local path of every image """
        # Find ones that we already have
        print('Requested %d downloaded images' % (len(image_url_info_list)))
        full_gpath_list = [join(image_dir, basename(gpath))
                           for gpath in image_url_info_list]
        exists_list = [ut.checkpath(gpath) for gpath in full_gpath_list]
        image_url_info_list_ = ut.compress(image_url_info_list,
                                           ut.not_list(exists_list))
        print('Already have %d/%d downloaded images' % (
            len(image_url_info_list) - len(image_url_info_list_),
            len(image_url_info_list)))
        print('Need to download %d images' % (len(image_url_info_list_)))
        #import sys
        #sys.exit(0)
        # Download the rest
        imgurl_prefix = 'https://snapshotserengeti.s3.msi.umn.edu/'
        image_url_list = [imgurl_prefix + suffix
                          for suffix in image_url_info_list_]
        for img_url in ut.ProgressIter(image_url_list, lbl='Downloading image'):
            ut.grab_file_url(img_url, download_dir=image_dir)
        return full_gpath_list

    # Data contains information about which events have which animals.
    # The consensus metadata is used (event id in column 0, species in
    # column 7); gold_standard_data.csv is an unused alternative source
    # (event id in column 0, species in column 2).
    species_class_csv_data, species_class_header = read_csv(consensus_metadata_fpath)
    species_class_eventid_list = ut.get_list_column(species_class_csv_data, 0)
    species_class_species_list = ut.get_list_column(species_class_csv_data, 7)

    # Find the requested events
    serengeti_species_set = sorted(list(set(species_class_species_list)))
    print('serengeti_sepcies_hist = %s' %
          ut.dict_str(ut.dict_hist(species_class_species_list), key_order_metric='val'))
    #print('serengeti_sepcies_set = %s' % (ut.list_str(serengeti_sepcies_set),))

    if serengeti_species is None:
        # species=None is the 'ALL' request: keep every classified row.
        # Previously this case always failed the membership assertion below.
        species_class_chosen_idx_list = list(range(len(species_class_species_list)))
    else:
        assert serengeti_species in serengeti_species_set, 'not a known seregeti species'
        species_class_chosen_idx_list = ut.list_where(
            [serengeti_species == species_
             for species_ in species_class_species_list])
    chosen_eventid_list = ut.take(species_class_eventid_list,
                                  species_class_chosen_idx_list)
    if serengeti_species is None:
        # Without a species filter the same event id may appear in several
        # rows (presumably one per species seen -- TODO confirm); keep each
        # event once, in first-seen order, so its photos are not repeated.
        seen_eventids = set()
        unique_eventid_list = []
        for eventid in chosen_eventid_list:
            if eventid not in seen_eventids:
                seen_eventids.add(eventid)
                unique_eventid_list.append(eventid)
        chosen_eventid_list = unique_eventid_list

    print('Number of chosen species:')
    print(' * len(species_class_chosen_idx_list) = %r' % (len(species_class_chosen_idx_list),))
    print(' * len(chosen_eventid_list) = %r' % (len(chosen_eventid_list),))

    # Read info about which events have which images
    images_csv_data, image_csv_header = read_csv(all_images_fpath)
    capture_event_id_list = ut.get_list_column(images_csv_data, 0)
    image_url_info_list = ut.get_list_column(images_csv_data, 1)
    # Group photos by eventid
    eventid_to_photos = ut.group_items(image_url_info_list, capture_event_id_list)

    # Filter to only chosens
    unflat_chosen_url_infos = ut.dict_take(eventid_to_photos, chosen_eventid_list)
    chosen_url_infos = ut.flatten(unflat_chosen_url_infos)
    chosen_path_list = download_image_urls(chosen_url_infos)

    ibs = ibeis.opendb(dbdir=dbdir, allow_newdir=True)
    ibs.add_images(chosen_path_list, auto_localize=False)

    #if False:
    #    # remove non-zebra photos
    #    from os.path import basename
    #    base_gname_list = list(map(basename, zebra_url_infos))
    #    all_gname_list = ut.list_images(image_dir)
    #    nonzebra_gname_list = ut.setdiff_ordered(all_gname_list, base_gname_list)
    #    nonzebra_gpath_list = ut.fnames_to_fpaths(nonzebra_gname_list, image_dir)
    #    ut.remove_fpaths(nonzebra_gpath_list)
    return ibs
def injest_main():
    r"""
    Command line entry point: ingest the standard database named by --db.

    Prints a usage message listing the keys of STANDARD_INGEST_FUNCS, reads
    ``--db`` and the ``--force_delete`` / ``--force-delete`` flag from the
    command line, and passes them to ingest_standard_database.

    CommandLine:
        python -m ibeis.dbio.ingest_database --test-injest_main
        python -m ibeis.dbio.ingest_database --test-injest_main --db snow-leopards

    Example:
        >>> # DISABLE_DOCTEST
        >>> from ibeis.dbio.ingest_database import *  # NOQA
        >>> injest_main()
    """
    print('__main__ = ingest_database.py')
    # The usage text names the script under ibeis/dbio, where this module
    # lives (it previously pointed at a stale ibeis/ingest path).
    print(ut.unindent(
        '''
        usage:
        python ibeis/dbio/ingest_database.py --db [dbname]

        Valid dbnames:''') + ut.indentjoin(STANDARD_INGEST_FUNCS.keys(), '\n * '))
    dbname = ut.get_argval('--db', str, None)
    force_delete = ut.get_argflag(('--force_delete', '--force-delete'))
    ibs = ingest_standard_database(dbname, force_delete)  # NOQA
    print('finished db injest')
    #img_dir = join(ibeis.sysres.get_workdir(), 'polar_bears')
    #main_locals = ibeis.main(dbdir=img_dir, gui=False)
    #ibs = main_locals['ibs']
    #ingest_rawdata(ibs, img_dir)
if __name__ == '__main__':
    """
    CommandLine:
        python ibeis/dbio/ingest_database.py --db testdb1 --serial --verbose --very-verbose
        python ibeis/dbio/ingest_database.py --db testdb1 --serial --verbose --very-verbose --super-strict --superstrict  # NOQA
        python ibeis/dbio/ingest_database.py --db JAG_Kieryn --force-delete
        python ibeis/dbio/ingest_database.py --db polar_bears --force_delete
        python ibeis/dbio/ingest_database.py --db snails_drop1
        python ibeis/dbio/ingest_database.py --db testdb1
        python -m ibeis.dbio.ingest_database --test-injest_main --db Elephants_drop1
    """
    # freeze_support has to run straight after the __main__ guard, before
    # any other work, to have an effect in frozen win32 executables; it was
    # previously called only after the ingest / doctests had finished.
    import multiprocessing
    multiprocessing.freeze_support()  # win32
    ut.inject_colored_exceptions()
    if ut.doctest_was_requested():
        ut.doctest_funcs()
    else:
        injest_main()