Source code for ibeis.scripts.getshark

#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function
import getopt
import sys
from xml.dom.minidom import parseString
from os.path import split, splitext, join, exists, dirname
import utool as ut


def shark_misc():
    import ibeis
    ibs = ibeis.opendb('WS_ALL')
    aid_list = ibs.get_valid_aids()
    flag_list = ibs.get_annot_been_adjusted(aid_list)
    adjusted_aids = ut.compress(aid_list, flag_list)
    return adjusted_aids
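
# Example usage of ``shark_misc`` (a sketch; it assumes a local ibeis
# database named 'WS_ALL' has already been built on this machine):
#   >>> adjusted_aids = shark_misc()
#   >>> print('%d annotations have been adjusted' % len(adjusted_aids))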

def download_sharks(XMLdata, number):
    """
    cd ~/work/WS_ALL
    python -m ibeis.scripts.getshark

    >>> from ibeis.scripts.getshark import *  # NOQA
    >>> url = 'www.whaleshark.org/listImages.jsp'
    >>> XMLdata = ut.url_read(url)
    >>> number = None
    """
    # Prepare the output directory for writing, if it doesn't exist
    output_dir = 'sharkimages'
    ut.ensuredir(output_dir)
    dom = parseString(XMLdata)

    # Download files
    if number:
        maxCount = min(number, len(dom.getElementsByTagName('img')))
    else:
        maxCount = len(dom.getElementsByTagName('img'))
    parsed_info = dict(
        img_url_list=[],
        localid_list=[],
        nameid_list=[],
        orig_fname_list=[],
        new_fname_list=[],
    )
    print('Preparing to fetch %i files...' % maxCount)

    for shark in dom.getElementsByTagName('shark'):
        localCount = 0
        for imageset in shark.getElementsByTagName('imageset'):
            for img in imageset.getElementsByTagName('img'):
                localCount += 1

                img_url = img.getAttribute('href')
                orig_fname = split(img_url)[1]
                ext = splitext(orig_fname)[1].lower()
                nameid = shark.getAttribute('number')

                new_fname = '%s-%i%s' % (nameid, localCount, ext)

                parsed_info['img_url_list'].append(img_url)
                parsed_info['nameid_list'].append(nameid)
                parsed_info['localid_list'].append(localCount)
                parsed_info['orig_fname_list'].append(orig_fname)
                parsed_info['new_fname_list'].append(new_fname)

                print('Parsed %i / %i files.' % (len(parsed_info['orig_fname_list']), maxCount))
                if number is not None and len(parsed_info['orig_fname_list']) == number:
                    break
    parsed_info['new_fpath_list'] = [join(output_dir, _fname)
                                     for _fname in parsed_info['new_fname_list']]

    print('Filtering parsed images')

    # Filter based on image type (keep only jpegs)
    ext_flags = [_fname.endswith('.jpg') or _fname.endswith('.jpeg')
                 for _fname in parsed_info['new_fname_list']]
    parsed_info = {key: ut.compress(list_, ext_flags)
                   for key, list_ in parsed_info.items()}

    # Filter to only images matching the appropriate tags
    from ibeis import tag_funcs
    parsed_info['tags_list'] = parse_shark_tags(parsed_info['orig_fname_list'])
    tag_flags = tag_funcs.filterflags_general_tags(
        parsed_info['tags_list'],
        has_any=['view-left'],
        none_match=['qual.*', 'view-top', 'part-.*', 'cropped'],
    )
    parsed_info = {key: ut.compress(list_, tag_flags)
                   for key, list_ in parsed_info.items()}
    print('Tags in chosen images:')
    print(ut.dict_hist(ut.flatten(parsed_info['tags_list'])))

    # Download selected subset
    print('Downloading selected subset')
    _iter = list(zip(parsed_info['img_url_list'],
                     parsed_info['new_fpath_list']))
    _iter = ut.ProgressIter(_iter, lbl='downloading sharks')
    for img_url, new_fpath in _iter:
        if not exists(new_fpath):
            ut.download_url(img_url, new_fpath)

    # Remove corrupted or ill-formatted images
    print('Checking for corrupted images')
    import vtool as vt
    noncorrupt_flags = vt.filterflags_valid_images(parsed_info['new_fpath_list'])
    parsed_info = {
        key: ut.compress(list_, noncorrupt_flags)
        for key, list_ in parsed_info.items()
    }

    print('Removing small images')
    import numpy as np
    imgsize_list = np.array([vt.open_image_size(gpath)
                             for gpath in parsed_info['new_fpath_list']])
    sqrt_area_list = np.sqrt(np.prod(imgsize_list, axis=1))
    areq_flags_list = sqrt_area_list >= 750
    parsed_info = {key: ut.compress(list_, areq_flags_list)
                   for key, list_ in parsed_info.items()}

    # Keep only names that have at least two images
    grouped_idxs = ut.group_items(list(range(len(parsed_info['nameid_list']))),
                                  parsed_info['nameid_list'])
    keep_idxs = sorted(ut.flatten([idxs for key, idxs in grouped_idxs.items()
                                   if len(idxs) >= 2]))
    parsed_info = {key: ut.take(list_, keep_idxs)
                   for key, list_ in parsed_info.items()}

    print('Moving images to secondary directory')
    named_outputdir = 'named-left-sharkimages'
    # Build names
    parsed_info['namedir_fpath_list'] = [
        join(named_outputdir, _nameid, _fname)
        for _fname, _nameid in zip(parsed_info['new_fname_list'],
                                   parsed_info['nameid_list'])]
    # Create directories
    ut.ensuredir(named_outputdir)
    named_dirs = ut.unique_ordered(list(map(dirname, parsed_info['namedir_fpath_list'])))
    for dir_ in named_dirs:
        ut.ensuredir(dir_)
    # Copy
    ut.copy_files_to(src_fpath_list=parsed_info['new_fpath_list'],
                     dst_fpath_list=parsed_info['namedir_fpath_list'])
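
# ``download_sharks`` repeatedly filters every column of ``parsed_info`` in
# parallel with one list of boolean flags. A minimal self-contained sketch of
# that pattern (``itertools.compress`` plays the role of ``ut.compress``; the
# toy rows below are hypothetical):
def _demo_parallel_column_filter():
    from itertools import compress
    parsed_info = {
        'img_url_list': ['a.jpg', 'b.png', 'c.jpg'],
        'nameid_list': ['shark1', 'shark2', 'shark3'],
    }
    # One flag per row; applying the same flags to every column keeps the
    # columns aligned after filtering
    flags = [url.endswith('.jpg') for url in parsed_info['img_url_list']]
    return {key: list(compress(vals, flags))
            for key, vals in parsed_info.items()}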

def parse_shark_tags(orig_fname_list):
    import re

    invalid_tag_patterns = [
        re.escape('-'),
        re.escape('(') + '?\\d*' + re.escape(')') + '?',
        '\\d+-\\d+-\\d+', '\\d+,',
        '\\d+', 'vi*', 'i*v', 'i+',
        '\\d+th', '\\d+nd', '\\d+rd',
        'remant', 'timnfe',
        't', 'e',
        'sjl', 'disc', 'dec', 'road', 'easter', 'western', 'west', 'tn',
        '\\d*ap',
        'whaleshark\\d*', 'shark\\d*', 'whale\\d*',
        'whalesharking', 'sharking', 'whalesharks', 'whales',
        'picture', 'australien', 'australia',
        'nick', 'tim\\d*',
        'imageset',
        'holiday', 'visit', 'tour', 'trip', 'pec', 'sv',
        'a', 'b',
        'gender', 'sex',
        'img', 'image', 'pic', 'pics', 'leith', 'trips', 'kings', 'photo',
        'video', 'media', 'fix', 'feeding',
        'nrd', 'nd', 'gen', 'wa', 'nmp', 'bo', 'kd', 'ow', 'ne', 'dsc',
        'nwd', 'mg', 'w', 'mai', 'blue', 'stumpy',
        'oea', 'cbe', 'edc', 'knrt',
        'tiws2',
        'ando', 'adv', 'str', 'adventure',
        'camera', 'tag', 'id',
        'of', 'and',
        'tagged', 'from',
        'day', '\\d*april', '\\d*may', '\\d*july', '\\d*june',
        'ningaloo', 'ningblue\\d*', 'kooling',
    ]

    valid_tag_level_set = [
        ['view-left', 'left', 'lhs', 'l', 'leftside'],
        ['view-right', 'right', 'rhs', 'r', 'rightside'],
        ['view-back', 'back'],
        ['view-top', 'top'],
        ['sex-male', 'male', 'm', 'sexm'],
        ['sex-female', 'female', 'f'],
        ['sex-unknown', 'unknown', 'u'],
        ['part-tail', 'tail'],
        ['part-flank', 'side', 'flank'],
        ['part-head', 'head'],
        ['part-pectoral', 'pectoral', 'pec'],
        ['part-dorsal', 'dorsal', 'dorsals'],
        ['part-claspers', 'claspers', 'clasper'],
        ['part-fin', 'fin'],
        ['cropped', 'crop'],
        ['scar', 'scar2'],
        ['notch'],
        ['small'],
        ['bite'],
        ['cam-slr2', 'slr2'],
        #['cam-5m', '5m']
        ['5m'],
        ['7m'],
        ['4m'],
        ['copy'],
        ['qual-resize'],
        ['qual-stretched'],
    ]

    def apply_enum_regex(pat_list):
        enum_endings = [
            '[a-g]',
            '\\d*',
            'i*',
        ]
        expanded_pats = ut.flatten([
            [pat + end for end in enum_endings]
            for pat in pat_list
        ])
        return expanded_pats

    def apply_regex_endings(pat_list):
        return [p + '$' for p in pat_list]

    tag_alias_map = {}
    for level_set in valid_tag_level_set:
        main_key = level_set[0]
        for key in level_set:
            tag_alias_map[key] = main_key

    inverse_alias_map = {}
    for level_set in valid_tag_level_set:
        inverse_alias_map[level_set[0]] = level_set

    regex_alias_map = {
        'view-left': apply_regex_endings(apply_enum_regex(inverse_alias_map['view-left'])),
        'view-right': apply_regex_endings(apply_enum_regex(inverse_alias_map['view-right'])),
    }

    valid_tags = list(inverse_alias_map.keys())

    invalid_tag_patterns = apply_regex_endings(invalid_tag_patterns)

    def parse_all_fname_tags(fname):
        _tags = [splitext(fname)[0]]
        _tags = ut.flatten([t.split('_') for t in _tags])
        _tags = ut.flatten([t.split('.') for t in _tags])
        _tags = [t.lower() for t in _tags]
        _tags = [tag_alias_map.get(t, t) for t in _tags]
        for key, vals in regex_alias_map.items():
            pat = ut.regex_or(vals)
            _tags = [key if re.match(pat, t) else t for t in _tags]
        pat = ut.regex_or(invalid_tag_patterns)
        _tags = [t for t in _tags if not re.match(pat, t)]
        _tags = ut.unique_ordered(_tags)
        return _tags

    all_img_tag_list = list(map(parse_all_fname_tags, orig_fname_list))

    known_img_tag_list = [list(set(tags).intersection(set(valid_tags)))
                          for tags in all_img_tag_list]

    if False:
        # Help figure out which tags are important
        _parsed_tags = ut.flatten(all_img_tag_list)
        taghist = ut.dict_hist(_parsed_tags)
        taghist = {key: val for key, val in taghist.items() if val > 1}
        unknown_taghist = sorted([
            (val, key) for key, val in taghist.items()
            if key not in valid_tags
        ])[::-1]
        known_taghist = sorted([
            (val, key) for key, val in taghist.items()
            if key in valid_tags
        ])[::-1]
        print('Known')
        print(ut.list_str(known_taghist[0:100]))
        print('Unknown')
        print(ut.list_str(unknown_taghist[0:100]))
        print(ut.dict_str(
            ut.dict_hist(ut.flatten(known_img_tag_list)),
            key_order_metric='val'
        ))

    return known_img_tag_list
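
# A minimal sketch of the alias normalization used in ``parse_shark_tags``:
# each level set maps its synonyms onto its first (canonical) entry, and
# filename tokens are looked up in that table. The filename is hypothetical.
def _demo_tag_alias_map():
    valid_tag_level_set = [
        ['view-left', 'left', 'lhs', 'l'],
        ['sex-female', 'female', 'f'],
    ]
    tag_alias_map = {}
    for level_set in valid_tag_level_set:
        for key in level_set:
            tag_alias_map[key] = level_set[0]
    tokens = 'whaleshark_lhs_female.jpg'.rsplit('.', 1)[0].split('_')
    # -> ['whaleshark', 'view-left', 'sex-female']
    return [tag_alias_map.get(t, t) for t in tokens]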

def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], 'f:u:n:h')
    except getopt.GetoptError:
        usage()
        sys.exit(1)

    filename = None
    url = 'www.whaleshark.org/listImages.jsp'
    number = 0

    # Handle command-line arguments
    for opt, arg in opts:
        if opt == '-h':
            usage()
            sys.exit()
        elif opt == '-f':
            filename = arg
        elif opt == '-u':
            url = arg
        elif opt == '-n':
            try:
                number = int(arg)
            except ValueError:
                usage()
                sys.exit()

    # Open the XML file and extract its contents as a DOM object
    if filename:
        XMLdata = ut.readfrom(filename)
    else:
        XMLdata = ut.url_read(url)
        #with open('XMLData.xml', 'w') as file_:
        #    file_.write(XMLdata)
    print('Downloading')
    download_sharks(XMLdata, number)

def usage():
    print('Fetches a number of images from the ECOCEAN shark database.')
    print('Options:')
    print('  -f <FILENAME> - Reads XML data from a file, rather than a URL.')
    print('  -u <URL> - Reads XML data from the given URL.')
    print('  -n <NUMBER> - Number of images to read; if omitted, reads all of them.')
    print('  -h - Prints this help text.')
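
# Example invocations (a sketch; the filename and count are hypothetical):
#   python -m ibeis.scripts.getshark -n 100           # fetch the first 100 images
#   python -m ibeis.scripts.getshark -f listImages.xml  # parse a local XML listing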

if __name__ == '__main__':
    main()