workflows.embedding_selection
import logging
import os
import pickle
import re
import time
from pathlib import Path

import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.zoo as foz
from fiftyone import ViewField as F
from huggingface_hub import HfApi, hf_hub_download
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

from config.config import GLOBAL_SEED, HF_DO_UPLOAD, HF_ROOT, NUM_WORKERS
from utils.sample_field_operations import add_sample_field

# Names of the sample fields written by EmbeddingSelection ("field*") and the
# values stored in the selection field by each selection strategy ("value_*").
BRAIN_TAXONOMY = {
    "field": "embedding_selection",
    "value_compute_representativeness": "representativeness_center",
    "value_find_unique": "greedy_center",
    "value_compute_uniqueness": "deterministic_center",
    "value_find_unique_neighbour": "greedy_neighbour",
    "value_compute_uniqueness_neighbour": "deterministic_neighbour",
    "value_compute_representativeness_neighbour": "representativeness_neighbour",
    "field_model": "embedding_selection_model",
    "field_count": "embedding_selection_count",
}


class EmbeddingSelection:
    """Select dataset samples via embeddings computed with the FiftyOne Brain.

    Wraps embedding computation/loading, dimensionality reduction, similarity
    indexing and several selection strategies (representativeness, greedy
    uniqueness, deterministic uniqueness, nearest neighbours). Selected
    samples are marked through the sample fields named in ``BRAIN_TAXONOMY``
    and the duration of every step is written to TensorBoard.

    See https://docs.voxel51.com/brain.html.
    """

    def __init__(
        self,
        dataset,
        dataset_info,
        model_name,
        log_dir,
        embeddings_path="./output/embeddings/",
    ):
        """Set up the model, key names, output folder and selection fields.

        Args:
            dataset: FiftyOne dataset the selection operates on.
            dataset_info: Mapping that provides the dataset name under "name".
            model_name: Name of the model to load from the V51 model zoo.
            log_dir: Directory handed to the TensorBoard ``SummaryWriter``.
            embeddings_path: Root folder for pickled embeddings; a trailing
                slash is added if it is missing.
        """

        # TensorBoard step counter (x-axis of the duration scalar)
        self.steps = 0
        self.dataset = dataset
        # Snapshot of the brain runs present at construction time
        self.brains = dataset.list_brain_runs()
        self.dataset_name = dataset_info["name"]
        self.v51_model_zoo = foz.list_zoo_models()
        self.writer = SummaryWriter(log_dir=log_dir)

        # Model
        if model_name not in self.v51_model_zoo:
            logging.warning(f"Model {model_name} is not part of the V51 model zoo.")
        self.model = foz.load_zoo_model(model_name)

        # Keys
        self.model_name = model_name
        self.model_name_key = re.sub(r"[\W-]+", "_", model_name)
        self.embedding_key = "embedding_" + self.model_name_key
        self.similiarity_key = "simil_" + self.model_name_key
        self.uniqueness_key = "uniqueness_" + self.model_name_key

        # Storing variables
        self.embeddings_vis = {}  # Multiple methods per model
        self.representativeness = {}  # Multiple methods per model
        self.embeddings_model = None
        self.similarities = None

        # Generate folder to store all embedding-related results. The root is
        # used as a string prefix below, so make sure it ends with a separator.
        root = str(embeddings_path)
        if not root.endswith(("/", os.sep)):
            root += "/"
        self.embeddings_root = root + self.dataset_name + "/"
        Path(self.embeddings_root).mkdir(parents=True, exist_ok=True)

        self.hf_repo_name = (
            f"{HF_ROOT}/{self.dataset_name}_embedding_{self.model_name_key}"
        )

        # Add fields to dataset
        count_field = BRAIN_TAXONOMY["field_count"]
        model_field = BRAIN_TAXONOMY["field_model"]
        add_sample_field(self.dataset, BRAIN_TAXONOMY["field"], fo.StringField)
        add_sample_field(self.dataset, model_field, fo.StringField)
        # Float instead of Int for visualization style in UI, color gradient instead of color palette
        add_sample_field(self.dataset, count_field, fo.FloatField)

        # Init count for samples only once; a freshly added field holds None.
        # An empty dataset has no first sample, so there is nothing to init.
        num_samples = len(self.dataset)
        if num_samples > 0 and self.dataset.first()[count_field] is None:
            logging.info("Setting all selection counts to 0")
            # set_values needs one value per sample
            self.dataset.set_values(count_field, [0] * num_samples, progress=True)

        # Determine if model was already used for selection
        self.model_already_used = False
        if model_field in self.dataset.get_field_schema():
            used_models = set(self.dataset.values(model_field))
            self.model_already_used = self.model_name_key in used_models

    def __del__(self):
        """Decrement the step counter and close the TensorBoard writer.

        Tolerates partially constructed instances: ``__init__`` may have
        raised before ``steps`` or ``writer`` were assigned.
        """
        if hasattr(self, "steps"):
            self.steps -= 1  # +1 after every function, need to decrement for final step
        writer = getattr(self, "writer", None)
        if writer is not None:
            writer.close()

    def _log_duration(self, start_time):
        """Write the time elapsed since ``start_time`` to TensorBoard and advance the step."""
        duration = time.time() - start_time
        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
        self.steps += 1

    def _mark_selected(self, sample, value):
        """Count one more selection of ``sample``; the caller persists the sample.

        The selection value and model are only written the first time a sample
        is selected; later selections just increase the counter.
        """
        field = BRAIN_TAXONOMY["field"]
        field_count = BRAIN_TAXONOMY["field_count"]
        if sample[field] is None:
            sample[field] = value
            sample[BRAIN_TAXONOMY["field_model"]] = self.model_name_key
        # Samples added after __init__ may still hold None as count
        sample[field_count] = (sample[field_count] or 0) + 1

    def _load_existing_embeddings(self, embedding_file_name):
        """Load embeddings from the dataset field, the local pickle or Hugging Face, in that order."""
        logging.info(f"Attempting to load embeddings for model {self.model_name_key}.")
        if self.dataset.get_field(self.embedding_key) is not None:
            logging.info("Loading embeddings from V51.")
            self.embeddings_model = self.dataset.values(self.embedding_key)
            return

        if not os.path.exists(embedding_file_name):
            logging.info(
                f"Downloading embeddings {self.hf_repo_name} from Hugging Face to {self.embeddings_root}"
            )
            embedding_file_name = hf_hub_download(
                repo_id=self.hf_repo_name,
                filename=f"{self.model_name_key}.pkl",
                local_dir=self.embeddings_root,
            )

        logging.info("Loading embeddings from disk.")
        # NOTE(review): pickle executes arbitrary code on load; only use
        # embedding files and Hugging Face repos that are trusted.
        with open(embedding_file_name, "rb") as f:
            self.embeddings_model = pickle.load(f)
        self.dataset.set_values(self.embedding_key, self.embeddings_model)

    def _upload_embeddings(self, embedding_file_name):
        """Upload the pickled embeddings to the private Hugging Face model repo."""
        logging.info(f"Uploading embeddings to Hugging Face: {self.hf_repo_name}")
        api = HfApi()
        api.create_repo(
            self.hf_repo_name,
            private=True,
            repo_type="model",
            exist_ok=True,
        )
        api.upload_file(
            path_or_fileobj=embedding_file_name,
            path_in_repo=f"{self.model_name_key}.pkl",
            repo_id=self.hf_repo_name,
            repo_type="model",
        )

    def compute_embeddings(self, mode):
        """Load or compute embeddings, then one visualization per V51 dim. reduction method.

        Args:
            mode: "load" tries the dataset field, the local pickle and Hugging
                Face, and falls back to computing if all of them fail;
                "compute" always computes. Any other value is logged as an
                error and only the visualizations are processed.
        """
        start_time = time.time()

        # All V51 pre-defined methods except "manual", which needs given points
        dim_reduction_methods = [
            method
            for method in fob.brain_config.visualization_methods.keys()
            if method != "manual"
        ]
        embedding_file_name = self.embeddings_root + self.model_name_key + ".pkl"

        if not self.model.has_embeddings:
            logging.warning(f"Model {self.model_name} does not provide embeddings.")
            self._log_duration(start_time)
            return

        if mode not in ["load", "compute"]:
            logging.error(f"Mode {mode} is not supported.")

        load_models_successful = None
        if mode == "load":
            try:
                self._load_existing_embeddings(embedding_file_name)
                load_models_successful = True
            except Exception as e:
                # Deliberate best effort: any failure falls back to computing
                logging.warning(f"Failed to load or download embeddings: {str(e)}")
                load_models_successful = False

        if mode == "compute" or load_models_successful is False:
            logging.info(f"Computing embeddings for model {self.model_name_key}.")
            # Stores the embeddings in the dataset field, no write-back needed
            self.dataset.compute_embeddings(
                model=self.model, embeddings_field=self.embedding_key
            )
            self.embeddings_model = self.dataset.values(self.embedding_key)
            with open(embedding_file_name, "wb") as f:
                pickle.dump(self.embeddings_model, f)

            # Explicit comparison kept so that truthy non-bool config values
            # (e.g. the string "False") do not trigger an upload.
            if HF_DO_UPLOAD == True:  # noqa: E712
                self._upload_embeddings(embedding_file_name)

        for method in tqdm(dim_reduction_methods, "Dimensionality reductions"):
            method_key = self.model_name_key + "_" + re.sub(r"[\W-]+", "_", method)
            points_key = "points_" + method_key
            vis_file_name = self.embeddings_root + method_key + ".pkl"

            if method_key in self.brains:
                logging.info("Loading vis from V51.")
                self.embeddings_vis[method_key] = self.dataset.load_brain_results(
                    method_key
                )
                continue

            # Reuse points pickled by an earlier run, otherwise compute them
            points = None
            points_on_disk = os.path.exists(vis_file_name)
            if points_on_disk:
                logging.info("Loading vis from disk.")
                with open(vis_file_name, "rb") as f:
                    points = pickle.load(f)
            else:
                logging.info("Computing vis.")

            self.embeddings_vis[method_key] = fob.compute_visualization(
                self.dataset,
                method=method,
                points=points,
                embeddings=self.embedding_key,
                seed=GLOBAL_SEED,
                brain_key=method_key,
                num_workers=NUM_WORKERS,
            )
            self.dataset.set_values(
                points_key, self.embeddings_vis[method_key].current_points
            )

            if not points_on_disk:
                with open(vis_file_name, "wb") as f:
                    pickle.dump(self.embeddings_vis[method_key].current_points, f)

        self._log_duration(start_time)

    def compute_similarity(self):
        """Load the similarity index of this model from V51 or compute it from the embeddings."""
        start_time = time.time()
        if self.similiarity_key in self.brains:
            logging.info("Loading similarities from V51.")
            self.similarities = self.dataset.load_brain_results(self.similiarity_key)
        else:
            logging.info("Computing similarities.")
            self.similarities = fob.compute_similarity(
                self.dataset,
                embeddings=self.embeddings_model,
                brain_key=self.similiarity_key,
                num_workers=NUM_WORKERS,
            )
        self._log_duration(start_time)

    def compute_representativeness(self, threshold):
        """
        Computes the representativeness of frames in the dataset and marks
        every sample whose score is at least ``threshold``.

        The threshold is applied to the raw score, once per cluster-center
        method, so a sample can be counted once per method.

        References:
            - https://docs.voxel51.com/brain.html#image-representativeness
        """
        start_time = time.time()
        value = BRAIN_TAXONOMY["value_compute_representativeness"]
        methods_cluster_center = ["cluster-center", "cluster-center-downweight"]

        for method in tqdm(methods_cluster_center, desc="Representativeness"):
            method_key = re.sub(
                r"[\W-]+",
                "_",
                "representativeness_" + self.model_name + "_" + method,
            )

            # NOTE(review): existing results are loaded but the scores are
            # recomputed below regardless; confirm whether a skip was intended.
            if method_key in self.brains:
                self.representativeness[method_key] = self.dataset.load_brain_results(
                    method_key
                )

            logging.info("Computing representativeness.")
            fob.compute_representativeness(
                self.dataset,
                representativeness_field=method_key,
                method=method,
                embeddings=self.embeddings_model,
                num_workers=NUM_WORKERS,
                progress=True,
            )

            view = self.dataset.match(F(method_key) >= threshold)
            for sample in view.iter_samples(progress=True, autosave=True):
                self._mark_selected(sample, value)

        self._log_duration(start_time)

    def compute_unique_images_greedy(self, perct_unique):
        """
        Computes a subset of unique images from the dataset using a greedy algorithm.

        Requires ``compute_similarity`` to have been called, since it uses
        ``self.similarities``.

        Args:
            perct_unique: Fraction of the dataset to select; the resulting
                sample count is truncated to an integer.

        References:
            - https://docs.voxel51.com/user_guide/brain.html#cifar-10-example
        """
        start_time = time.time()
        # find_unique expects an integer number of samples
        num_of_unique = int(perct_unique * len(self.dataset))
        field = BRAIN_TAXONOMY["field"]
        value = BRAIN_TAXONOMY["value_find_unique"]

        # NOTE(review): this compares a field name against sample *tags*, so
        # the skip only triggers if a tag with that name exists; confirm intent.
        dataset_labels = self.dataset.count_sample_tags()
        center_view = self.dataset.match(F(field) == value)

        if field in dataset_labels and len(center_view) > 0:
            logging.info("No unique images.")
        else:
            self.similarities.find_unique(num_of_unique)
            for unique_id in tqdm(
                self.similarities.unique_ids, desc="Tagging unique images"
            ):
                sample = self.dataset[unique_id]
                self._mark_selected(sample, value)
                sample.save()

        self._log_duration(start_time)

    def compute_unique_images_deterministic(self, threshold):
        """
        Computes a deterministic uniqueness score for each sample in the dataset
        and marks every sample whose score is at least ``threshold``.

        References:
            - https://docs.voxel51.com/api/fiftyone.brain.html#fiftyone.brain.compute_uniqueness
        """
        start_time = time.time()
        value = BRAIN_TAXONOMY["value_compute_uniqueness"]

        fob.compute_uniqueness(
            self.dataset,
            embeddings=self.embeddings_model,
            uniqueness_field=self.uniqueness_key,
            num_workers=NUM_WORKERS,
        )

        # The threshold is applied to the raw score, not to a quantile
        view = self.dataset.match(F(self.uniqueness_key) >= threshold)
        for sample in view.iter_samples(progress=True, autosave=True):
            self._mark_selected(sample, value)

        self._log_duration(start_time)

    def compute_similar_images(self, dist_threshold, neighbour_count):
        """Mark the nearest neighbours of all previously selected center samples.

        Args:
            dist_threshold: Neighbours are only marked if their distance to
                the center sample is below this value.
            neighbour_count: Number of nearest neighbours queried per center.
        """
        start_time = time.time()
        field = BRAIN_TAXONOMY["field"]
        field_neighbour_distance = "distance"

        # (value of the center samples, value written to their neighbours)
        center_neighbour_values = [
            (
                BRAIN_TAXONOMY["value_find_unique"],
                BRAIN_TAXONOMY["value_find_unique_neighbour"],
            ),
            (
                BRAIN_TAXONOMY["value_compute_uniqueness"],
                BRAIN_TAXONOMY["value_compute_uniqueness_neighbour"],
            ),
            (
                BRAIN_TAXONOMY["value_compute_representativeness"],
                BRAIN_TAXONOMY["value_compute_representativeness_neighbour"],
            ),
        ]

        # NOTE(review): compares a field name against sample *tags*, so the
        # skip only triggers if a tag with that name exists; confirm intent.
        dataset_labels = self.dataset.count_sample_tags()
        already_tagged = field in dataset_labels and all(
            len(self.dataset.match(F(field) == neighbour_value)) > 0
            for _, neighbour_value in center_neighbour_values
        )

        if not already_tagged:
            for center_value, neighbour_value in tqdm(
                center_neighbour_values, desc="Tagging similar images"
            ):
                center_view = self.dataset.match(F(field) == center_value)
                for sample in center_view:
                    view = self.dataset.sort_by_similarity(
                        sample.id,
                        k=neighbour_count,
                        brain_key=self.similiarity_key,
                        dist_field=field_neighbour_distance,
                    )
                    for sample_neighbour in view:
                        distance = sample_neighbour[field_neighbour_distance]
                        if distance < dist_threshold:
                            self._mark_selected(sample_neighbour, neighbour_value)
                            sample_neighbour.save()

        self._log_duration(start_time)
class EmbeddingSelection:
    """Select dataset samples via embeddings computed with the FiftyOne Brain.

    Wraps embedding computation/loading, dimensionality reduction, similarity
    indexing and several selection strategies (representativeness, greedy
    uniqueness, deterministic uniqueness, nearest neighbours). Selected
    samples are marked through the sample fields named in ``BRAIN_TAXONOMY``
    and the duration of every step is written to TensorBoard.

    See https://docs.voxel51.com/brain.html.
    """

    def __init__(
        self,
        dataset,
        dataset_info,
        model_name,
        log_dir,
        embeddings_path="./output/embeddings/",
    ):
        """Set up the model, key names, output folder and selection fields.

        Args:
            dataset: FiftyOne dataset the selection operates on.
            dataset_info: Mapping that provides the dataset name under "name".
            model_name: Name of the model to load from the V51 model zoo.
            log_dir: Directory handed to the TensorBoard ``SummaryWriter``.
            embeddings_path: Root folder for pickled embeddings; a trailing
                slash is added if it is missing.
        """

        # TensorBoard step counter (x-axis of the duration scalar)
        self.steps = 0
        self.dataset = dataset
        # Snapshot of the brain runs present at construction time
        self.brains = dataset.list_brain_runs()
        self.dataset_name = dataset_info["name"]
        self.v51_model_zoo = foz.list_zoo_models()
        self.writer = SummaryWriter(log_dir=log_dir)

        # Model
        if model_name not in self.v51_model_zoo:
            logging.warning(f"Model {model_name} is not part of the V51 model zoo.")
        self.model = foz.load_zoo_model(model_name)

        # Keys
        self.model_name = model_name
        self.model_name_key = re.sub(r"[\W-]+", "_", model_name)
        self.embedding_key = "embedding_" + self.model_name_key
        self.similiarity_key = "simil_" + self.model_name_key
        self.uniqueness_key = "uniqueness_" + self.model_name_key

        # Storing variables
        self.embeddings_vis = {}  # Multiple methods per model
        self.representativeness = {}  # Multiple methods per model
        self.embeddings_model = None
        self.similarities = None

        # Generate folder to store all embedding-related results. The root is
        # used as a string prefix below, so make sure it ends with a separator.
        root = str(embeddings_path)
        if not root.endswith(("/", os.sep)):
            root += "/"
        self.embeddings_root = root + self.dataset_name + "/"
        Path(self.embeddings_root).mkdir(parents=True, exist_ok=True)

        self.hf_repo_name = (
            f"{HF_ROOT}/{self.dataset_name}_embedding_{self.model_name_key}"
        )

        # Add fields to dataset
        count_field = BRAIN_TAXONOMY["field_count"]
        model_field = BRAIN_TAXONOMY["field_model"]
        add_sample_field(self.dataset, BRAIN_TAXONOMY["field"], fo.StringField)
        add_sample_field(self.dataset, model_field, fo.StringField)
        # Float instead of Int for visualization style in UI, color gradient instead of color palette
        add_sample_field(self.dataset, count_field, fo.FloatField)

        # Init count for samples only once; a freshly added field holds None.
        # An empty dataset has no first sample, so there is nothing to init.
        num_samples = len(self.dataset)
        if num_samples > 0 and self.dataset.first()[count_field] is None:
            logging.info("Setting all selection counts to 0")
            # set_values needs one value per sample
            self.dataset.set_values(count_field, [0] * num_samples, progress=True)

        # Determine if model was already used for selection
        self.model_already_used = False
        if model_field in self.dataset.get_field_schema():
            used_models = set(self.dataset.values(model_field))
            self.model_already_used = self.model_name_key in used_models

    def __del__(self):
        """Decrement the step counter and close the TensorBoard writer.

        Tolerates partially constructed instances: ``__init__`` may have
        raised before ``steps`` or ``writer`` were assigned.
        """
        if hasattr(self, "steps"):
            self.steps -= 1  # +1 after every function, need to decrement for final step
        writer = getattr(self, "writer", None)
        if writer is not None:
            writer.close()

    def _log_duration(self, start_time):
        """Write the time elapsed since ``start_time`` to TensorBoard and advance the step."""
        duration = time.time() - start_time
        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
        self.steps += 1

    def _mark_selected(self, sample, value):
        """Count one more selection of ``sample``; the caller persists the sample.

        The selection value and model are only written the first time a sample
        is selected; later selections just increase the counter.
        """
        field = BRAIN_TAXONOMY["field"]
        field_count = BRAIN_TAXONOMY["field_count"]
        if sample[field] is None:
            sample[field] = value
            sample[BRAIN_TAXONOMY["field_model"]] = self.model_name_key
        # Samples added after __init__ may still hold None as count
        sample[field_count] = (sample[field_count] or 0) + 1

    def _load_existing_embeddings(self, embedding_file_name):
        """Load embeddings from the dataset field, the local pickle or Hugging Face, in that order."""
        logging.info(f"Attempting to load embeddings for model {self.model_name_key}.")
        if self.dataset.get_field(self.embedding_key) is not None:
            logging.info("Loading embeddings from V51.")
            self.embeddings_model = self.dataset.values(self.embedding_key)
            return

        if not os.path.exists(embedding_file_name):
            logging.info(
                f"Downloading embeddings {self.hf_repo_name} from Hugging Face to {self.embeddings_root}"
            )
            embedding_file_name = hf_hub_download(
                repo_id=self.hf_repo_name,
                filename=f"{self.model_name_key}.pkl",
                local_dir=self.embeddings_root,
            )

        logging.info("Loading embeddings from disk.")
        # NOTE(review): pickle executes arbitrary code on load; only use
        # embedding files and Hugging Face repos that are trusted.
        with open(embedding_file_name, "rb") as f:
            self.embeddings_model = pickle.load(f)
        self.dataset.set_values(self.embedding_key, self.embeddings_model)

    def _upload_embeddings(self, embedding_file_name):
        """Upload the pickled embeddings to the private Hugging Face model repo."""
        logging.info(f"Uploading embeddings to Hugging Face: {self.hf_repo_name}")
        api = HfApi()
        api.create_repo(
            self.hf_repo_name,
            private=True,
            repo_type="model",
            exist_ok=True,
        )
        api.upload_file(
            path_or_fileobj=embedding_file_name,
            path_in_repo=f"{self.model_name_key}.pkl",
            repo_id=self.hf_repo_name,
            repo_type="model",
        )

    def compute_embeddings(self, mode):
        """Load or compute embeddings, then one visualization per V51 dim. reduction method.

        Args:
            mode: "load" tries the dataset field, the local pickle and Hugging
                Face, and falls back to computing if all of them fail;
                "compute" always computes. Any other value is logged as an
                error and only the visualizations are processed.
        """
        start_time = time.time()

        # All V51 pre-defined methods except "manual", which needs given points
        dim_reduction_methods = [
            method
            for method in fob.brain_config.visualization_methods.keys()
            if method != "manual"
        ]
        embedding_file_name = self.embeddings_root + self.model_name_key + ".pkl"

        if not self.model.has_embeddings:
            logging.warning(f"Model {self.model_name} does not provide embeddings.")
            self._log_duration(start_time)
            return

        if mode not in ["load", "compute"]:
            logging.error(f"Mode {mode} is not supported.")

        load_models_successful = None
        if mode == "load":
            try:
                self._load_existing_embeddings(embedding_file_name)
                load_models_successful = True
            except Exception as e:
                # Deliberate best effort: any failure falls back to computing
                logging.warning(f"Failed to load or download embeddings: {str(e)}")
                load_models_successful = False

        if mode == "compute" or load_models_successful is False:
            logging.info(f"Computing embeddings for model {self.model_name_key}.")
            # Stores the embeddings in the dataset field, no write-back needed
            self.dataset.compute_embeddings(
                model=self.model, embeddings_field=self.embedding_key
            )
            self.embeddings_model = self.dataset.values(self.embedding_key)
            with open(embedding_file_name, "wb") as f:
                pickle.dump(self.embeddings_model, f)

            # Explicit comparison kept so that truthy non-bool config values
            # (e.g. the string "False") do not trigger an upload.
            if HF_DO_UPLOAD == True:  # noqa: E712
                self._upload_embeddings(embedding_file_name)

        for method in tqdm(dim_reduction_methods, "Dimensionality reductions"):
            method_key = self.model_name_key + "_" + re.sub(r"[\W-]+", "_", method)
            points_key = "points_" + method_key
            vis_file_name = self.embeddings_root + method_key + ".pkl"

            if method_key in self.brains:
                logging.info("Loading vis from V51.")
                self.embeddings_vis[method_key] = self.dataset.load_brain_results(
                    method_key
                )
                continue

            # Reuse points pickled by an earlier run, otherwise compute them
            points = None
            points_on_disk = os.path.exists(vis_file_name)
            if points_on_disk:
                logging.info("Loading vis from disk.")
                with open(vis_file_name, "rb") as f:
                    points = pickle.load(f)
            else:
                logging.info("Computing vis.")

            self.embeddings_vis[method_key] = fob.compute_visualization(
                self.dataset,
                method=method,
                points=points,
                embeddings=self.embedding_key,
                seed=GLOBAL_SEED,
                brain_key=method_key,
                num_workers=NUM_WORKERS,
            )
            self.dataset.set_values(
                points_key, self.embeddings_vis[method_key].current_points
            )

            if not points_on_disk:
                with open(vis_file_name, "wb") as f:
                    pickle.dump(self.embeddings_vis[method_key].current_points, f)

        self._log_duration(start_time)

    def compute_similarity(self):
        """Load the similarity index of this model from V51 or compute it from the embeddings."""
        start_time = time.time()
        if self.similiarity_key in self.brains:
            logging.info("Loading similarities from V51.")
            self.similarities = self.dataset.load_brain_results(self.similiarity_key)
        else:
            logging.info("Computing similarities.")
            self.similarities = fob.compute_similarity(
                self.dataset,
                embeddings=self.embeddings_model,
                brain_key=self.similiarity_key,
                num_workers=NUM_WORKERS,
            )
        self._log_duration(start_time)

    def compute_representativeness(self, threshold):
        """
        Computes the representativeness of frames in the dataset and marks
        every sample whose score is at least ``threshold``.

        The threshold is applied to the raw score, once per cluster-center
        method, so a sample can be counted once per method.

        References:
            - https://docs.voxel51.com/brain.html#image-representativeness
        """
        start_time = time.time()
        value = BRAIN_TAXONOMY["value_compute_representativeness"]
        methods_cluster_center = ["cluster-center", "cluster-center-downweight"]

        for method in tqdm(methods_cluster_center, desc="Representativeness"):
            method_key = re.sub(
                r"[\W-]+",
                "_",
                "representativeness_" + self.model_name + "_" + method,
            )

            # NOTE(review): existing results are loaded but the scores are
            # recomputed below regardless; confirm whether a skip was intended.
            if method_key in self.brains:
                self.representativeness[method_key] = self.dataset.load_brain_results(
                    method_key
                )

            logging.info("Computing representativeness.")
            fob.compute_representativeness(
                self.dataset,
                representativeness_field=method_key,
                method=method,
                embeddings=self.embeddings_model,
                num_workers=NUM_WORKERS,
                progress=True,
            )

            view = self.dataset.match(F(method_key) >= threshold)
            for sample in view.iter_samples(progress=True, autosave=True):
                self._mark_selected(sample, value)

        self._log_duration(start_time)

    def compute_unique_images_greedy(self, perct_unique):
        """
        Computes a subset of unique images from the dataset using a greedy algorithm.

        Requires ``compute_similarity`` to have been called, since it uses
        ``self.similarities``.

        Args:
            perct_unique: Fraction of the dataset to select; the resulting
                sample count is truncated to an integer.

        References:
            - https://docs.voxel51.com/user_guide/brain.html#cifar-10-example
        """
        start_time = time.time()
        # find_unique expects an integer number of samples
        num_of_unique = int(perct_unique * len(self.dataset))
        field = BRAIN_TAXONOMY["field"]
        value = BRAIN_TAXONOMY["value_find_unique"]

        # NOTE(review): this compares a field name against sample *tags*, so
        # the skip only triggers if a tag with that name exists; confirm intent.
        dataset_labels = self.dataset.count_sample_tags()
        center_view = self.dataset.match(F(field) == value)

        if field in dataset_labels and len(center_view) > 0:
            logging.info("No unique images.")
        else:
            self.similarities.find_unique(num_of_unique)
            for unique_id in tqdm(
                self.similarities.unique_ids, desc="Tagging unique images"
            ):
                sample = self.dataset[unique_id]
                self._mark_selected(sample, value)
                sample.save()

        self._log_duration(start_time)

    def compute_unique_images_deterministic(self, threshold):
        """
        Computes a deterministic uniqueness score for each sample in the dataset
        and marks every sample whose score is at least ``threshold``.

        References:
            - https://docs.voxel51.com/api/fiftyone.brain.html#fiftyone.brain.compute_uniqueness
        """
        start_time = time.time()
        value = BRAIN_TAXONOMY["value_compute_uniqueness"]

        fob.compute_uniqueness(
            self.dataset,
            embeddings=self.embeddings_model,
            uniqueness_field=self.uniqueness_key,
            num_workers=NUM_WORKERS,
        )

        # The threshold is applied to the raw score, not to a quantile
        view = self.dataset.match(F(self.uniqueness_key) >= threshold)
        for sample in view.iter_samples(progress=True, autosave=True):
            self._mark_selected(sample, value)

        self._log_duration(start_time)

    def compute_similar_images(self, dist_threshold, neighbour_count):
        """Mark the nearest neighbours of all previously selected center samples.

        Args:
            dist_threshold: Neighbours are only marked if their distance to
                the center sample is below this value.
            neighbour_count: Number of nearest neighbours queried per center.
        """
        start_time = time.time()
        field = BRAIN_TAXONOMY["field"]
        field_neighbour_distance = "distance"

        # (value of the center samples, value written to their neighbours)
        center_neighbour_values = [
            (
                BRAIN_TAXONOMY["value_find_unique"],
                BRAIN_TAXONOMY["value_find_unique_neighbour"],
            ),
            (
                BRAIN_TAXONOMY["value_compute_uniqueness"],
                BRAIN_TAXONOMY["value_compute_uniqueness_neighbour"],
            ),
            (
                BRAIN_TAXONOMY["value_compute_representativeness"],
                BRAIN_TAXONOMY["value_compute_representativeness_neighbour"],
            ),
        ]

        # NOTE(review): compares a field name against sample *tags*, so the
        # skip only triggers if a tag with that name exists; confirm intent.
        dataset_labels = self.dataset.count_sample_tags()
        already_tagged = field in dataset_labels and all(
            len(self.dataset.match(F(field) == neighbour_value)) > 0
            for _, neighbour_value in center_neighbour_values
        )

        if not already_tagged:
            for center_value, neighbour_value in tqdm(
                center_neighbour_values, desc="Tagging similar images"
            ):
                center_view = self.dataset.match(F(field) == center_value)
                for sample in center_view:
                    view = self.dataset.sort_by_similarity(
                        sample.id,
                        k=neighbour_count,
                        brain_key=self.similiarity_key,
                        dist_field=field_neighbour_distance,
                    )
                    for sample_neighbour in view:
                        distance = sample_neighbour[field_neighbour_distance]
                        if distance < dist_threshold:
                            self._mark_selected(sample_neighbour, neighbour_value)
                            sample_neighbour.save()

        self._log_duration(start_time)
Class for computing and managing embeddings, uniqueness, and representativeness of dataset samples using the FiftyOne Brain (https://docs.voxel51.com/brain.html).
def __init__(
    self,
    dataset,
    dataset_info,
    model_name,
    log_dir,
    embeddings_path="./output/embeddings/",
):
    """Initialize the EmbeddingSelection with dataset, model, and configuration for embedding-based data selection.

    Args:
        dataset: FiftyOne dataset the selection fields and brain runs are stored on.
        dataset_info: Mapping that provides the dataset name under the key "name".
        model_name: Name of the model that is loaded with foz.load_zoo_model.
        log_dir: Log directory handed to the TensorBoard SummaryWriter.
        embeddings_path: Root folder for stored embeddings. A sub-folder named
            after the dataset is created inside it.
    """

    # Step index for the scalars written to the TensorBoard writer
    self.steps = 0
    self.dataset = dataset
    self.brains = dataset.list_brain_runs()
    self.dataset_name = dataset_info["name"]
    self.v51_model_zoo = foz.list_zoo_models()
    self.writer = SummaryWriter(log_dir=log_dir)

    # Model
    if model_name not in self.v51_model_zoo:
        logging.warning(
            "Model " + model_name + " is not part of the V51 model zoo."
        )
    self.model = foz.load_zoo_model(model_name)

    # Keys
    self.model_name = model_name
    self.model_name_key = re.sub(r"[\W-]+", "_", model_name)
    self.embedding_key = "embedding_" + self.model_name_key
    self.similiarity_key = "simil_" + self.model_name_key
    self.uniqueness_key = "uniqueness_" + self.model_name_key

    # Storing variables
    self.embeddings_vis = {}  # Multiple methods per model
    self.representativeness = {}  # Multiple methods per model
    self.embeddings_model = None
    self.similarities = None

    # Generate folder to store all embedding-related results.
    # The trailing "" keeps a trailing separator, because file names are
    # appended to this root by plain string concatenation. Joining also works
    # when embeddings_path is passed without a trailing slash.
    self.embeddings_root = os.path.join(embeddings_path, self.dataset_name, "")
    Path(self.embeddings_root).mkdir(parents=True, exist_ok=True)

    self.hf_repo_name = (
        f"{HF_ROOT}/{self.dataset_name}_embedding_{self.model_name_key}"
    )

    # Add fields to dataset
    field_selection = BRAIN_TAXONOMY["field"]
    field_model = BRAIN_TAXONOMY["field_model"]
    field_count = BRAIN_TAXONOMY["field_count"]
    add_sample_field(self.dataset, field_selection, fo.StringField)
    add_sample_field(self.dataset, field_model, fo.StringField)
    # Float instead of Int for visualization style in UI, color gradient instead of color palette
    add_sample_field(self.dataset, field_count, fo.FloatField)

    # Init count for samples only once. Is initialized with None by add_sample_field.
    # An empty dataset has no first sample to probe, so it is skipped.
    sample_total = len(self.dataset)
    if sample_total > 0 and self.dataset.first()[field_count] is None:
        logging.info("Setting all selection counts to 0")
        zeros = [0] * sample_total  # Needs to be an iterable
        self.dataset.set_values(field_count, zeros, progress=True)

    # Determine if model was already used for selection
    self.model_already_used = False
    dataset_schema = self.dataset.get_field_schema()
    if field_model in dataset_schema:
        field_values = set(self.dataset.values(field_model))
        if self.model_name_key in field_values:
            self.model_already_used = True
Initialize the EmbeddingSelection with dataset, model, and configuration for embedding-based data selection.
def compute_embeddings(self, mode):
    """Computes and stores embeddings for the given model name. Uses V51 pre-defined dim. reduction methods.

    Args:
        mode: "load" tries the dataset field first, then the local pickle file,
            then a download from Hugging Face, and falls back to computing the
            embeddings if loading fails. "compute" always computes them. Any
            other value is logged as an error and no embeddings are loaded or
            computed.
    """
    start_time = time.time()

    # Filter instead of list.remove() so a missing "manual" entry cannot raise
    dim_reduction_methods = [
        method
        for method in fob.brain_config.visualization_methods.keys()
        if method != "manual"
    ]

    embedding_file_name = self.embeddings_root + self.model_name_key + ".pkl"

    if self.model.has_embeddings:
        if mode not in ["load", "compute"]:
            logging.error(f"Mode {mode} is not supported.")

        # Try to load embeddings
        load_models_successful = None
        if mode == "load":
            try:
                logging.info(
                    f"Attempting to load embeddings for model {self.model_name_key}."
                )
                if self.dataset.get_field(self.embedding_key) is not None:
                    logging.info("Loading embeddings from V51.")
                    self.embeddings_model = self.dataset.values(self.embedding_key)
                else:
                    if not os.path.exists(embedding_file_name):
                        logging.info(
                            f"Downloading embeddings {self.hf_repo_name} from Hugging Face to {self.embeddings_root}"
                        )
                        model_name = f"{self.model_name_key}.pkl"
                        embedding_file_name = hf_hub_download(
                            repo_id=self.hf_repo_name,
                            filename=model_name,
                            local_dir=self.embeddings_root,
                        )
                    logging.info("Loading embeddings from disk.")
                    # NOTE(security): pickle.load executes arbitrary code from
                    # the file; only use embedding files and repos you trust.
                    with open(embedding_file_name, "rb") as f:
                        self.embeddings_model = pickle.load(f)
                    self.dataset.set_values(
                        self.embedding_key, self.embeddings_model
                    )
                load_models_successful = True
            except Exception as e:
                # Deliberate best-effort: any failure falls back to computing
                logging.warning(f"Failed to load or download embeddings: {str(e)}")
                load_models_successful = False

        if mode == "compute" or load_models_successful is False:
            logging.info(f"Computing embeddings for model {self.model_name_key}.")
            # compute_embeddings stores the result in the embeddings field,
            # so it only has to be read back, not written again.
            self.dataset.compute_embeddings(
                model=self.model, embeddings_field=self.embedding_key
            )
            self.embeddings_model = self.dataset.values(self.embedding_key)

            with open(embedding_file_name, "wb") as f:
                pickle.dump(self.embeddings_model, f)

            # Upload embeddings to Hugging Face.
            # Explicit comparison kept so that non-boolean truthy config
            # values do not start triggering uploads.
            if HF_DO_UPLOAD == True:  # noqa: E712
                logging.info(
                    f"Uploading embeddings to Hugging Face: {self.hf_repo_name}"
                )
                api = HfApi()
                api.create_repo(
                    self.hf_repo_name,
                    private=True,
                    repo_type="model",
                    exist_ok=True,
                )

                model_name = f"{self.model_name_key}.pkl"
                api.upload_file(
                    path_or_fileobj=embedding_file_name,
                    path_in_repo=model_name,
                    repo_id=self.hf_repo_name,
                    repo_type="model",
                )

        for method in tqdm(dim_reduction_methods, "Dimensionality reductions"):
            method_key = self.model_name_key + "_" + re.sub(r"[\W-]+", "_", method)
            points_key = "points_" + method_key
            vis_file_name = self.embeddings_root + method_key + ".pkl"

            if method_key in self.brains:
                logging.info("Loading vis from V51.")
                self.embeddings_vis[method_key] = self.dataset.load_brain_results(
                    method_key
                )
                continue

            vis_kwargs = {
                "method": method,
                "embeddings": self.embedding_key,
                "seed": GLOBAL_SEED,
                "brain_key": method_key,
                "num_workers": NUM_WORKERS,
            }

            points_on_disk = os.path.exists(vis_file_name)
            if points_on_disk:
                logging.info("Loading vis from disk.")
                with open(vis_file_name, "rb") as f:
                    # Stored points are handed over instead of being recomputed
                    vis_kwargs["points"] = pickle.load(f)
            else:
                logging.info("Computing vis.")

            results = fob.compute_visualization(self.dataset, **vis_kwargs)
            self.embeddings_vis[method_key] = results
            self.dataset.set_values(points_key, results.current_points)

            if not points_on_disk:
                with open(vis_file_name, "wb") as f:
                    pickle.dump(results.current_points, f)
    else:
        logging.warning(
            "Model " + self.model_name + " does not provide embeddings."
        )

    end_time = time.time()
    duration = end_time - start_time
    self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
    self.steps += 1
Computes and stores embeddings for the given model name. Uses the dimensionality reduction methods pre-defined by V51.
def compute_similarity(self):
    """Provide the similarity results for the dataset embeddings.

    Existing results stored under the similarity brain key are loaded;
    otherwise they are computed from the embeddings held by this instance.
    The elapsed time is written to the TensorBoard writer.
    """
    t_begin = time.time()

    already_stored = self.similiarity_key in self.brains
    if not already_stored:
        logging.info("Computing similarities.")
        self.similarities = fob.compute_similarity(
            self.dataset,
            embeddings=self.embeddings_model,
            brain_key=self.similiarity_key,
            num_workers=NUM_WORKERS,
        )
    else:
        logging.info("Loading similarities from V51.")
        self.similarities = self.dataset.load_brain_results(self.similiarity_key)

    elapsed = time.time() - t_begin
    self.writer.add_scalar("brain/duration_in_seconds", elapsed, self.steps)
    self.steps += 1
Computes the similarity of embeddings for the dataset.
def compute_representativeness(self, threshold):
    """
    Computes the representativeness of frames in the dataset.

    For each of the methods "cluster-center" and "cluster-center-downweight"
    the score is written to its own sample field. Every sample whose score is
    greater than or equal to the threshold gets its selection counter
    incremented; if it has no selection value yet, it is also labelled with the
    representativeness value and the key of the current model.

    Args:
        threshold: Minimum representativeness score for a sample to be selected.

    References:
        - https://docs.voxel51.com/brain.html#image-representativeness
    """

    start_time = time.time()
    field = BRAIN_TAXONOMY["field"]
    field_model = BRAIN_TAXONOMY["field_model"]
    field_count = BRAIN_TAXONOMY["field_count"]
    value = BRAIN_TAXONOMY["value_compute_representativeness"]
    methods_cluster_center = ["cluster-center", "cluster-center-downweight"]

    for method in tqdm(methods_cluster_center, desc="Representativeness"):
        method_key = re.sub(
            r"[\W-]+",
            "_",
            "representativeness_" + self.model_name + "_" + method,
        )

        if method_key in self.brains:
            self.representativeness[method_key] = self.dataset.load_brain_results(
                method_key
            )

        # NOTE(review): the scores are recomputed even when a stored brain run
        # was loaded above - confirm whether the recomputation is intended.
        logging.info("Computing representativeness.")
        fob.compute_representativeness(
            self.dataset,
            representativeness_field=method_key,
            method=method,
            embeddings=self.embeddings_model,
            num_workers=NUM_WORKERS,
            progress=True,
        )

        view = self.dataset.match(F(method_key) >= threshold)
        for sample in view.iter_samples(progress=True, autosave=True):
            if sample[field] is None:
                # First selection of this sample: remember method and model
                sample[field] = value
                sample[field_model] = self.model_name_key
            # The counter can still be None (e.g. for samples added after the
            # counters were initialised), so None is treated as 0.
            sample[field_count] = (sample[field_count] or 0) + 1

    end_time = time.time()
    duration = end_time - start_time
    self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
    self.steps += 1
Computes the representativeness of frames in the dataset.
References: - https://docs.voxel51.com/brain.html#image-representativeness
def compute_unique_images_greedy(self, perct_unique):
    """
    Computes a subset of unique images from the dataset using a greedy algorithm.

    The similarity results held by this instance select the unique samples.
    Each selected sample gets its selection counter incremented; if it has no
    selection value yet, it is also labelled with the greedy value and the key
    of the current model.

    Args:
        perct_unique: Fraction of the dataset size that is requested as unique
            samples (e.g. 0.1 for ten percent).

    References:
        - https://docs.voxel51.com/user_guide/brain.html#cifar-10-example
    """

    start_time = time.time()
    sample_count = len(self.dataset.view())
    # find_unique expects a number of samples, so the fractional product is
    # rounded to an int and capped at the dataset size.
    num_of_unique = min(sample_count, int(round(perct_unique * sample_count)))
    field = BRAIN_TAXONOMY["field"]
    field_model = BRAIN_TAXONOMY["field_model"]
    field_count = BRAIN_TAXONOMY["field_count"]
    value = BRAIN_TAXONOMY["value_find_unique"]

    # Check if any sample has the label label_unique.
    # NOTE(review): count_sample_tags() reports sample tags, while `field` is
    # a sample field name, so this skip presumably never triggers - confirm
    # the intended check before changing it (repeated runs currently re-tag).
    dataset_labels = self.dataset.count_sample_tags()
    center_view = self.dataset.match(F(field) == value)

    if field in dataset_labels and len(center_view) > 0:
        logging.info("No unique images.")

    else:
        self.similarities.find_unique(num_of_unique)
        for unique_id in tqdm(
            self.similarities.unique_ids, desc="Tagging unique images"
        ):
            sample = self.dataset[unique_id]
            if sample[field] is None:
                # First selection of this sample: remember method and model
                sample[field] = value
                sample[field_model] = self.model_name_key
            # The counter can still be None (e.g. for samples added after the
            # counters were initialised), so None is treated as 0.
            sample[field_count] = (sample[field_count] or 0) + 1
            sample.save()

    end_time = time.time()
    duration = end_time - start_time
    self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
    self.steps += 1
Computes a subset of unique images from the dataset using a greedy algorithm.
References: - https://docs.voxel51.com/user_guide/brain.html#cifar-10-example
def compute_unique_images_deterministic(self, threshold):
    """
    Computes a deterministic uniqueness score for each sample in the dataset.

    The score is written to the uniqueness field of the current model. Every
    sample whose score is greater than or equal to the threshold gets its
    selection counter incremented; if it has no selection value yet, it is also
    labelled with the uniqueness value and the key of the current model.

    Args:
        threshold: Minimum uniqueness score for a sample to be selected.

    References:
        - https://docs.voxel51.com/api/fiftyone.brain.html#fiftyone.brain.compute_uniqueness
    """

    start_time = time.time()
    field = BRAIN_TAXONOMY["field"]
    field_model = BRAIN_TAXONOMY["field_model"]
    field_count = BRAIN_TAXONOMY["field_count"]
    value = BRAIN_TAXONOMY["value_compute_uniqueness"]

    fob.compute_uniqueness(
        self.dataset,
        embeddings=self.embeddings_model,
        uniqueness_field=self.uniqueness_key,
        num_workers=NUM_WORKERS,
    )

    view = self.dataset.match(F(self.uniqueness_key) >= threshold)
    for sample in view.iter_samples(progress=True, autosave=True):
        if sample[field] is None:
            # First selection of this sample: remember method and model
            sample[field] = value
            sample[field_model] = self.model_name_key
        # The counter can still be None (e.g. for samples added after the
        # counters were initialised), so None is treated as 0.
        sample[field_count] = (sample[field_count] or 0) + 1

    end_time = time.time()
    duration = end_time - start_time
    self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
    self.steps += 1
Computes a deterministic uniqueness score for each sample in the dataset.
References: - https://docs.voxel51.com/api/fiftyone.brain.html#fiftyone.brain.compute_uniqueness
def compute_similar_images(self, dist_threshold, neighbour_count):
    """Computes and assigns similar images based on a distance threshold and neighbour count.

    For every sample already selected as greedy, deterministic or
    representativeness center, its nearest samples are looked up through the
    similarity brain run. Each neighbour closer than the threshold gets its
    selection counter incremented; if it has no selection value yet, it is also
    labelled with the neighbour value matching the center's method and the key
    of the current model.

    Args:
        dist_threshold: Neighbours with a distance below this value are tagged.
        neighbour_count: Number of nearest samples requested per center (k).
    """
    start_time = time.time()
    field = BRAIN_TAXONOMY["field"]
    field_model = BRAIN_TAXONOMY["field_model"]
    field_count = BRAIN_TAXONOMY["field_count"]
    field_neighbour_distance = "distance"

    # (value of the center samples, value assigned to their neighbours)
    center_neighbour_values = [
        (
            BRAIN_TAXONOMY["value_find_unique"],
            BRAIN_TAXONOMY["value_find_unique_neighbour"],
        ),
        (
            BRAIN_TAXONOMY["value_compute_uniqueness"],
            BRAIN_TAXONOMY["value_compute_uniqueness_neighbour"],
        ),
        (
            BRAIN_TAXONOMY["value_compute_representativeness"],
            BRAIN_TAXONOMY["value_compute_representativeness_neighbour"],
        ),
    ]

    # Check if samples have already assigned fields.
    # NOTE(review): count_sample_tags() reports sample tags, while `field` is
    # a sample field name, so this skip presumably never triggers - confirm
    # the intended check before changing it (repeated runs currently re-tag).
    dataset_labels = self.dataset.count_sample_tags()
    neighbours_already_tagged = field in dataset_labels and all(
        len(self.dataset.match(F(field) == neighbour_value)) > 0
        for _, neighbour_value in center_neighbour_values
    )

    if not neighbours_already_tagged:
        views_values = [
            (self.dataset.match(F(field) == center_value), neighbour_value)
            for center_value, neighbour_value in center_neighbour_values
        ]

        for unique_view, value in tqdm(views_values, desc="Tagging similar images"):
            for sample in unique_view:
                view = self.dataset.sort_by_similarity(
                    sample.id,
                    k=neighbour_count,
                    brain_key=self.similiarity_key,
                    dist_field=field_neighbour_distance,
                )
                for sample_neighbour in view:
                    distance = sample_neighbour[field_neighbour_distance]
                    if distance < dist_threshold:
                        if sample_neighbour[field] is None:
                            # First selection: remember method and model
                            sample_neighbour[field] = value
                            sample_neighbour[field_model] = self.model_name_key
                        # The counter can still be None (e.g. for samples added
                        # after the counters were initialised): None counts as 0.
                        sample_neighbour[field_count] = (
                            sample_neighbour[field_count] or 0
                        ) + 1
                        sample_neighbour.save()

    end_time = time.time()
    duration = end_time - start_time
    self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
    self.steps += 1
Computes and assigns similar images based on a distance threshold and neighbour count.