workflows.embedding_selection

  1import logging
  2import os
  3import pickle
  4import re
  5import time
  6from pathlib import Path
  7
  8import fiftyone as fo
  9import fiftyone.brain as fob
 10import fiftyone.zoo as foz
 11from fiftyone import ViewField as F
 12from huggingface_hub import HfApi, hf_hub_download
 13from torch.utils.tensorboard import SummaryWriter
 14from tqdm import tqdm
 15
 16from config.config import GLOBAL_SEED, HF_DO_UPLOAD, HF_ROOT, NUM_WORKERS
 17from utils.sample_field_operations import add_sample_field
 18
 19BRAIN_TAXONOMY = {
 20    "field": "embedding_selection",
 21    "value_compute_representativeness": "representativeness_center",
 22    "value_find_unique": "greedy_center",
 23    "value_compute_uniqueness": "deterministic_center",
 24    "value_find_unique_neighbour": "greedy_neighbour",
 25    "value_compute_uniqueness_neighbour": "deterministic_neighbour",
 26    "value_compute_representativeness_neighbour": "representativeness_neighbour",
 27    "field_model": "embedding_selection_model",
 28    "field_count": "embedding_selection_count",
 29}
 30
 31
 32class EmbeddingSelection:
 33    """Class for computing and managing embeddings, uniqueness, and representations for dataset samples with https://docs.voxel51.com/brain.html."""
 34
 35    def __init__(
 36        self,
 37        dataset,
 38        dataset_info,
 39        model_name,
 40        log_dir,
 41        embeddings_path="./output/embeddings/",
 42    ):
 43        """Initialize the EmbeddingSelection with dataset, model, and configuration for embedding-based data selection."""
 44
 45        # WandB counter
 46        self.steps = 0
 47        self.dataset = dataset
 48        self.brains = dataset.list_brain_runs()
 49        self.dataset_name = dataset_info["name"]
 50        self.v51_model_zoo = foz.list_zoo_models()
 51        self.writer = SummaryWriter(log_dir=log_dir)
 52
 53        # Model
 54        if model_name not in self.v51_model_zoo:
 55            logging.warning(
 56                "Model " + model_name + " is not part of the V51 model zoo."
 57            )
 58        self.model = foz.load_zoo_model(model_name)
 59
 60        # Keys
 61        self.model_name = model_name
 62        self.model_name_key = re.sub(r"[\W-]+", "_", model_name)
 63        self.embedding_key = "embedding_" + self.model_name_key
 64        self.similiarity_key = "simil_" + self.model_name_key
 65        self.uniqueness_key = "uniqueness_" + self.model_name_key
 66
 67        # Storing variables
 68        self.embeddings_vis = {}  # Multiple methods per model
 69        self.representativeness = {}  # Multiple methods per model
 70        self.embeddings_model = None
 71        self.similarities = None
 72
 73        # Generate folder to store all embedding-related results
 74        self.embeddings_root = embeddings_path + self.dataset_name + "/"
 75        Path(self.embeddings_root).mkdir(parents=True, exist_ok=True)
 76
 77        self.hf_repo_name = (
 78            f"{HF_ROOT}/{self.dataset_name}_embedding_{self.model_name_key}"
 79        )
 80
 81        # Add fields to dataset
 82        add_sample_field(self.dataset, BRAIN_TAXONOMY["field"], fo.StringField)
 83        add_sample_field(self.dataset, BRAIN_TAXONOMY["field_model"], fo.StringField)
 84        # Float instead of Int for visualization style in UI, color gradient instead of color palette
 85        add_sample_field(self.dataset, BRAIN_TAXONOMY["field_count"], fo.FloatField)
 86
 87        # Init count for samples only once. Is intilized with None by add_sample_field
 88        test_sample = self.dataset.first()
 89        if test_sample[BRAIN_TAXONOMY["field_count"]] is None:
 90            logging.info("Setting all selection counts to 0")
 91            zeros = [0] * len(self.dataset)  # Needs to be an iterablr
 92            self.dataset.set_values(BRAIN_TAXONOMY["field_count"], zeros, progress=True)
 93
 94        # Determine if model was already used for selection
 95        self.model_already_used = False
 96        dataset_schema = self.dataset.get_field_schema()
 97        if BRAIN_TAXONOMY["field_model"] in dataset_schema:
 98            field_values = set(self.dataset.values(BRAIN_TAXONOMY["field_model"]))
 99            if self.model_name_key in field_values:
100                self.model_already_used = True
101
102    def __del__(self):
103        """Destructor that decrements step counter and closes the writer."""
104        self.steps -= 1  # +1 after every function, need to decrement for final step
105        self.writer.close()
106
107    def compute_embeddings(self, mode):
108        """Computes and stores embeddings for the given model name. Uses V51 pre-defined dim. reduction methods."""
109        start_time = time.time()
110
111        dim_reduction_methods = list(fob.brain_config.visualization_methods.keys())
112        dim_reduction_methods.remove("manual")
113
114        embedding_file_name = self.embeddings_root + self.model_name_key + ".pkl"
115
116        if self.model.has_embeddings:
117            # Try to load models
118            load_models_successful = None
119            if mode == "load":
120                try:
121                    logging.info(
122                        f"Attempting to load embeddings for model {self.model_name_key}."
123                    )
124                    if self.dataset.get_field(self.embedding_key) is not None:
125                        logging.info("Loading embeddings from V51.")
126                        self.embeddings_model = self.dataset.values(self.embedding_key)
127                    elif os.path.exists(embedding_file_name):
128                        logging.info("Loading embeddings from disk.")
129                        with open(embedding_file_name, "rb") as f:
130                            self.embeddings_model = pickle.load(f)
131                        self.dataset.set_values(
132                            self.embedding_key, self.embeddings_model
133                        )
134                    else:
135                        logging.info(
136                            f"Downloading embeddings {self.hf_repo_name} from Hugging Face to {self.embeddings_root}"
137                        )
138                        model_name = f"{self.model_name_key}.pkl"
139                        embedding_file_name = hf_hub_download(
140                            repo_id=self.hf_repo_name,
141                            filename=model_name,
142                            local_dir=self.embeddings_root,
143                        )
144                        logging.info("Loading embeddings from disk.")
145                        with open(embedding_file_name, "rb") as f:
146                            self.embeddings_model = pickle.load(f)
147                        self.dataset.set_values(
148                            self.embedding_key, self.embeddings_model
149                        )
150                    load_models_successful = True
151                except Exception as e:
152                    logging.warning(f"Failed to load or download embeddings: {str(e)}")
153                    load_models_successful = False
154
155            if mode == "compute" or load_models_successful == False:
156                logging.info(f"Computing embeddings for model {self.model_name_key}.")
157                self.dataset.compute_embeddings(
158                    model=self.model, embeddings_field=self.embedding_key
159                )
160                self.embeddings_model = self.dataset.values(self.embedding_key)
161
162                self.dataset.set_values(self.embedding_key, self.embeddings_model)
163                with open(embedding_file_name, "wb") as f:
164                    pickle.dump(self.embeddings_model, f)
165
166                # Upload embeddings to Hugging Face
167                if HF_DO_UPLOAD == True:
168                    logging.info(
169                        f"Uploading embeddings to Hugging Face: {self.hf_repo_name}"
170                    )
171                    api = HfApi()
172                    api.create_repo(
173                        self.hf_repo_name,
174                        private=True,
175                        repo_type="model",
176                        exist_ok=True,
177                    )
178
179                    model_name = f"{self.model_name_key}.pkl"
180                    api.upload_file(
181                        path_or_fileobj=embedding_file_name,
182                        path_in_repo=model_name,
183                        repo_id=self.hf_repo_name,
184                        repo_type="model",
185                    )
186
187            if mode not in ["load", "compute"]:
188                logging.error(f"Mode {mode} is not supported.")
189
190            for method in tqdm(dim_reduction_methods, "Dimensionality reductions"):
191                method_key = self.model_name_key + "_" + re.sub(r"[\W-]+", "_", method)
192                points_key = "points_" + method_key
193                vis_file_name = self.embeddings_root + method_key + ".pkl"
194
195                if method_key in self.brains:
196                    logging.info("Loading vis from V51.")
197                    brain_info = self.dataset.get_brain_info(method_key)
198                    self.embeddings_vis[method_key] = self.dataset.load_brain_results(
199                        method_key
200                    )
201
202                elif os.path.exists(vis_file_name):
203                    logging.info("Loading vis from disk.")
204                    with open(vis_file_name, "rb") as f:
205                        points = pickle.load(f)
206
207                    self.embeddings_vis[method_key] = fob.compute_visualization(
208                        self.dataset,
209                        method=method,
210                        points=points,
211                        embeddings=self.embedding_key,
212                        seed=GLOBAL_SEED,
213                        brain_key=method_key,
214                        num_workers=NUM_WORKERS,
215                    )
216                    self.dataset.set_values(
217                        points_key, self.embeddings_vis[method_key].current_points
218                    )
219
220                else:
221                    logging.info("Computing vis.")
222                    self.embeddings_vis[method_key] = fob.compute_visualization(
223                        self.dataset,
224                        method=method,
225                        embeddings=self.embedding_key,
226                        seed=GLOBAL_SEED,
227                        brain_key=method_key,
228                        num_workers=NUM_WORKERS,
229                    )
230                    self.dataset.set_values(
231                        points_key, self.embeddings_vis[method_key].current_points
232                    )
233
234                    with open(vis_file_name, "wb") as f:
235                        pickle.dump(self.embeddings_vis[method_key].current_points, f)
236        else:
237            logging.warning(
238                "Model " + self.model_name + " does not provide embeddings."
239            )
240        end_time = time.time()
241        duration = end_time - start_time
242        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
243        self.steps += 1
244
245    def compute_similarity(self):
246        """Computes the similarity of embeddings for the dataset."""
247
248        start_time = time.time()
249        if self.similiarity_key in self.brains:
250            logging.info("Loading similarities from V51.")
251            self.similarities = self.dataset.load_brain_results(self.similiarity_key)
252
253        else:
254            logging.info("Computing similarities.")
255            self.similarities = fob.compute_similarity(
256                self.dataset,
257                embeddings=self.embeddings_model,
258                brain_key=self.similiarity_key,
259                num_workers=NUM_WORKERS,
260            )
261        end_time = time.time()
262        duration = end_time - start_time
263        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
264        self.steps += 1
265
266    def compute_representativeness(self, threshold):
267        """
268        Computes the representativeness of frames in the dataset.
269
270        References:
271            - https://docs.voxel51.com/brain.html#image-representativeness
272        """
273
274        start_time = time.time()
275        field = BRAIN_TAXONOMY["field"]
276        field_model = BRAIN_TAXONOMY["field_model"]
277        field_count = BRAIN_TAXONOMY["field_count"]
278        value = BRAIN_TAXONOMY["value_compute_representativeness"]
279        methods_cluster_center = ["cluster-center", "cluster-center-downweight"]
280
281        for method in tqdm(methods_cluster_center, desc="Representativeness"):
282            method_key = re.sub(
283                r"[\W-]+",
284                "_",
285                "representativeness_" + self.model_name + "_" + method,
286            )
287
288            if method_key in self.brains:
289                self.representativeness[method_key] = self.dataset.load_brain_results(
290                    method_key
291                )
292
293            logging.info("Computing representativeness.")
294            fob.compute_representativeness(
295                self.dataset,
296                representativeness_field=method_key,
297                method=method,
298                embeddings=self.embeddings_model,
299                num_workers=NUM_WORKERS,
300                progress=True,
301            )
302
303            # quant_threshold = self.dataset.quantiles(key, threshold)
304            # view = self.dataset.match(F(key) >= quant_threshold)
305            view = self.dataset.match(F(method_key) >= threshold)
306            for sample in view.iter_samples(progress=True, autosave=True):
307                if sample[field] is None:
308                    sample[field] = value
309                    sample[field_model] = self.model_name_key
310                    sample[field_count] += 1
311                else:
312                    sample[field_count] += 1
313        end_time = time.time()
314        duration = end_time - start_time
315        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
316        self.steps += 1
317
318    def compute_unique_images_greedy(self, perct_unique):
319        """
320        Computes a subset of unique images from the dataset using a greedy algorithm.
321
322        References:
323            - https://docs.voxel51.com/user_guide/brain.html#cifar-10-example
324        """
325
326        start_time = time.time()
327        sample_count = len(self.dataset.view())
328        num_of_unique = perct_unique * sample_count
329        field = BRAIN_TAXONOMY["field"]
330        field_model = BRAIN_TAXONOMY["field_model"]
331        field_count = BRAIN_TAXONOMY["field_count"]
332        value = BRAIN_TAXONOMY["value_find_unique"]
333
334        # Check if any sample has the label label_unique:
335        dataset_labels = self.dataset.count_sample_tags()
336        center_view = self.dataset.match(F(field) == value)
337
338        if field in dataset_labels and len(center_view) > 0:
339            logging.info("No unique images.")
340            pass
341
342        else:
343            self.similarities.find_unique(num_of_unique)
344            for unique_id in tqdm(
345                self.similarities.unique_ids, desc="Tagging unique images"
346            ):
347                sample = self.dataset[unique_id]
348                if sample[field] is None:
349                    sample[field] = value
350                    sample[field_model] = self.model_name_key
351                    sample[field_count] += 1
352                else:
353                    sample[field_count] += 1
354                sample.save()
355
356        end_time = time.time()
357        duration = end_time - start_time
358        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
359        self.steps += 1
360
361    def compute_unique_images_deterministic(self, threshold):
362        """
363        Computes a deterministic uniqueness score for each sample in the dataset.
364
365        References:
366            - https://docs.voxel51.com/api/fiftyone.brain.html#fiftyone.brain.compute_uniqueness
367        """
368
369        start_time = time.time()
370        field = BRAIN_TAXONOMY["field"]
371        field_model = BRAIN_TAXONOMY["field_model"]
372        field_count = BRAIN_TAXONOMY["field_count"]
373        value = BRAIN_TAXONOMY["value_compute_uniqueness"]
374
375        fob.compute_uniqueness(
376            self.dataset,
377            embeddings=self.embeddings_model,
378            uniqueness_field=self.uniqueness_key,
379            num_workers=NUM_WORKERS,
380        )
381
382        # quant_threshold = self.dataset.quantiles(key, threshold)
383        # view = self.dataset.match(F(key) >= quant_threshold)
384        view = self.dataset.match(F(self.uniqueness_key) >= threshold)
385        for sample in view.iter_samples(progress=True, autosave=True):
386            if sample[field] is None:
387                sample[field] = value
388                sample[field_model] = self.model_name_key
389                sample[field_count] += 1
390            else:
391                sample[field_count] += 1
392        end_time = time.time()
393        duration = end_time - start_time
394        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
395        self.steps += 1
396
397    def compute_similar_images(self, dist_threshold, neighbour_count):
398        """Computes and assigns similar images based on a distance threshold and neighbour count."""
399        start_time = time.time()
400        field = BRAIN_TAXONOMY["field"]
401        field_model = BRAIN_TAXONOMY["field_model"]
402        field_count = BRAIN_TAXONOMY["field_count"]
403        field_neighbour_distance = "distance"
404
405        value_find_unique = BRAIN_TAXONOMY["value_find_unique"]
406        value_compute_uniqueness = BRAIN_TAXONOMY["value_compute_uniqueness"]
407        value_compute_representativeness = BRAIN_TAXONOMY[
408            "value_compute_representativeness"
409        ]
410
411        value_find_unique_neighbour = BRAIN_TAXONOMY["value_find_unique_neighbour"]
412        value_compute_uniqueness_neighbour = BRAIN_TAXONOMY[
413            "value_compute_uniqueness_neighbour"
414        ]
415        value_compute_representativeness_neighbour = BRAIN_TAXONOMY[
416            "value_compute_representativeness_neighbour"
417        ]
418
419        # Check if samples have already assigned fields
420        dataset_labels = self.dataset.count_sample_tags()
421        neighbour_view_greedy = self.dataset.match(
422            F(field) == value_find_unique_neighbour
423        )
424        neighbour_view_deterministic = self.dataset.match(
425            F(field) == value_compute_uniqueness_neighbour
426        )
427        neighbour_view_representativeness = self.dataset.match(
428            F(field) == value_compute_representativeness_neighbour
429        )
430
431        if field in dataset_labels and (
432            len(neighbour_view_greedy) > 0
433            and len(neighbour_view_deterministic) > 0
434            and len(neighbour_view_representativeness) > 0
435        ):
436            pass
437
438        else:
439            unique_view_greedy = self.dataset.match(F(field) == value_find_unique)
440            unique_view_deterministic = self.dataset.match(
441                F(field) == value_compute_uniqueness
442            )
443            unique_view_representativeness = self.dataset.match(
444                F(field) == value_compute_representativeness
445            )
446
447            views_values = [
448                (unique_view_greedy, value_find_unique_neighbour),
449                (unique_view_deterministic, value_compute_uniqueness_neighbour),
450                (
451                    unique_view_representativeness,
452                    value_compute_representativeness_neighbour,
453                ),
454            ]
455
456            for unique_view, value in tqdm(views_values, desc="Tagging similar images"):
457                for sample in unique_view:
458                    view = self.dataset.sort_by_similarity(
459                        sample.id,
460                        k=neighbour_count,
461                        brain_key=self.similiarity_key,
462                        dist_field=field_neighbour_distance,
463                    )
464                    for sample_neighbour in view:
465                        distance = sample_neighbour[field_neighbour_distance]
466                        if distance < dist_threshold:
467                            if sample_neighbour[field] is None:
468                                sample_neighbour[field] = value
469                                sample_neighbour[field_model] = self.model_name_key
470                                sample_neighbour[field_count] += 1
471                            else:
472                                sample_neighbour[field_count] += 1
473                            sample_neighbour.save()
474
475        end_time = time.time()
476        duration = end_time - start_time
477        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
478        self.steps += 1
# Sample-field names ("field*" keys) and the values written into the selection
# field by the individual selection strategies ("value_*" keys).
BRAIN_TAXONOMY = {
    "field": "embedding_selection",
    "value_compute_representativeness": "representativeness_center",
    "value_find_unique": "greedy_center",
    "value_compute_uniqueness": "deterministic_center",
    "value_find_unique_neighbour": "greedy_neighbour",
    "value_compute_uniqueness_neighbour": "deterministic_neighbour",
    "value_compute_representativeness_neighbour": "representativeness_neighbour",
    "field_model": "embedding_selection_model",
    "field_count": "embedding_selection_count",
}
class EmbeddingSelection:
 33class EmbeddingSelection:
 34    """Class for computing and managing embeddings, uniqueness, and representations for dataset samples with https://docs.voxel51.com/brain.html."""
 35
 36    def __init__(
 37        self,
 38        dataset,
 39        dataset_info,
 40        model_name,
 41        log_dir,
 42        embeddings_path="./output/embeddings/",
 43    ):
 44        """Initialize the EmbeddingSelection with dataset, model, and configuration for embedding-based data selection."""
 45
 46        # WandB counter
 47        self.steps = 0
 48        self.dataset = dataset
 49        self.brains = dataset.list_brain_runs()
 50        self.dataset_name = dataset_info["name"]
 51        self.v51_model_zoo = foz.list_zoo_models()
 52        self.writer = SummaryWriter(log_dir=log_dir)
 53
 54        # Model
 55        if model_name not in self.v51_model_zoo:
 56            logging.warning(
 57                "Model " + model_name + " is not part of the V51 model zoo."
 58            )
 59        self.model = foz.load_zoo_model(model_name)
 60
 61        # Keys
 62        self.model_name = model_name
 63        self.model_name_key = re.sub(r"[\W-]+", "_", model_name)
 64        self.embedding_key = "embedding_" + self.model_name_key
 65        self.similiarity_key = "simil_" + self.model_name_key
 66        self.uniqueness_key = "uniqueness_" + self.model_name_key
 67
 68        # Storing variables
 69        self.embeddings_vis = {}  # Multiple methods per model
 70        self.representativeness = {}  # Multiple methods per model
 71        self.embeddings_model = None
 72        self.similarities = None
 73
 74        # Generate folder to store all embedding-related results
 75        self.embeddings_root = embeddings_path + self.dataset_name + "/"
 76        Path(self.embeddings_root).mkdir(parents=True, exist_ok=True)
 77
 78        self.hf_repo_name = (
 79            f"{HF_ROOT}/{self.dataset_name}_embedding_{self.model_name_key}"
 80        )
 81
 82        # Add fields to dataset
 83        add_sample_field(self.dataset, BRAIN_TAXONOMY["field"], fo.StringField)
 84        add_sample_field(self.dataset, BRAIN_TAXONOMY["field_model"], fo.StringField)
 85        # Float instead of Int for visualization style in UI, color gradient instead of color palette
 86        add_sample_field(self.dataset, BRAIN_TAXONOMY["field_count"], fo.FloatField)
 87
 88        # Init count for samples only once. Is intilized with None by add_sample_field
 89        test_sample = self.dataset.first()
 90        if test_sample[BRAIN_TAXONOMY["field_count"]] is None:
 91            logging.info("Setting all selection counts to 0")
 92            zeros = [0] * len(self.dataset)  # Needs to be an iterablr
 93            self.dataset.set_values(BRAIN_TAXONOMY["field_count"], zeros, progress=True)
 94
 95        # Determine if model was already used for selection
 96        self.model_already_used = False
 97        dataset_schema = self.dataset.get_field_schema()
 98        if BRAIN_TAXONOMY["field_model"] in dataset_schema:
 99            field_values = set(self.dataset.values(BRAIN_TAXONOMY["field_model"]))
100            if self.model_name_key in field_values:
101                self.model_already_used = True
102
103    def __del__(self):
104        """Destructor that decrements step counter and closes the writer."""
105        self.steps -= 1  # +1 after every function, need to decrement for final step
106        self.writer.close()
107
108    def compute_embeddings(self, mode):
109        """Computes and stores embeddings for the given model name. Uses V51 pre-defined dim. reduction methods."""
110        start_time = time.time()
111
112        dim_reduction_methods = list(fob.brain_config.visualization_methods.keys())
113        dim_reduction_methods.remove("manual")
114
115        embedding_file_name = self.embeddings_root + self.model_name_key + ".pkl"
116
117        if self.model.has_embeddings:
118            # Try to load models
119            load_models_successful = None
120            if mode == "load":
121                try:
122                    logging.info(
123                        f"Attempting to load embeddings for model {self.model_name_key}."
124                    )
125                    if self.dataset.get_field(self.embedding_key) is not None:
126                        logging.info("Loading embeddings from V51.")
127                        self.embeddings_model = self.dataset.values(self.embedding_key)
128                    elif os.path.exists(embedding_file_name):
129                        logging.info("Loading embeddings from disk.")
130                        with open(embedding_file_name, "rb") as f:
131                            self.embeddings_model = pickle.load(f)
132                        self.dataset.set_values(
133                            self.embedding_key, self.embeddings_model
134                        )
135                    else:
136                        logging.info(
137                            f"Downloading embeddings {self.hf_repo_name} from Hugging Face to {self.embeddings_root}"
138                        )
139                        model_name = f"{self.model_name_key}.pkl"
140                        embedding_file_name = hf_hub_download(
141                            repo_id=self.hf_repo_name,
142                            filename=model_name,
143                            local_dir=self.embeddings_root,
144                        )
145                        logging.info("Loading embeddings from disk.")
146                        with open(embedding_file_name, "rb") as f:
147                            self.embeddings_model = pickle.load(f)
148                        self.dataset.set_values(
149                            self.embedding_key, self.embeddings_model
150                        )
151                    load_models_successful = True
152                except Exception as e:
153                    logging.warning(f"Failed to load or download embeddings: {str(e)}")
154                    load_models_successful = False
155
156            if mode == "compute" or load_models_successful == False:
157                logging.info(f"Computing embeddings for model {self.model_name_key}.")
158                self.dataset.compute_embeddings(
159                    model=self.model, embeddings_field=self.embedding_key
160                )
161                self.embeddings_model = self.dataset.values(self.embedding_key)
162
163                self.dataset.set_values(self.embedding_key, self.embeddings_model)
164                with open(embedding_file_name, "wb") as f:
165                    pickle.dump(self.embeddings_model, f)
166
167                # Upload embeddings to Hugging Face
168                if HF_DO_UPLOAD == True:
169                    logging.info(
170                        f"Uploading embeddings to Hugging Face: {self.hf_repo_name}"
171                    )
172                    api = HfApi()
173                    api.create_repo(
174                        self.hf_repo_name,
175                        private=True,
176                        repo_type="model",
177                        exist_ok=True,
178                    )
179
180                    model_name = f"{self.model_name_key}.pkl"
181                    api.upload_file(
182                        path_or_fileobj=embedding_file_name,
183                        path_in_repo=model_name,
184                        repo_id=self.hf_repo_name,
185                        repo_type="model",
186                    )
187
188            if mode not in ["load", "compute"]:
189                logging.error(f"Mode {mode} is not supported.")
190
191            for method in tqdm(dim_reduction_methods, "Dimensionality reductions"):
192                method_key = self.model_name_key + "_" + re.sub(r"[\W-]+", "_", method)
193                points_key = "points_" + method_key
194                vis_file_name = self.embeddings_root + method_key + ".pkl"
195
196                if method_key in self.brains:
197                    logging.info("Loading vis from V51.")
198                    brain_info = self.dataset.get_brain_info(method_key)
199                    self.embeddings_vis[method_key] = self.dataset.load_brain_results(
200                        method_key
201                    )
202
203                elif os.path.exists(vis_file_name):
204                    logging.info("Loading vis from disk.")
205                    with open(vis_file_name, "rb") as f:
206                        points = pickle.load(f)
207
208                    self.embeddings_vis[method_key] = fob.compute_visualization(
209                        self.dataset,
210                        method=method,
211                        points=points,
212                        embeddings=self.embedding_key,
213                        seed=GLOBAL_SEED,
214                        brain_key=method_key,
215                        num_workers=NUM_WORKERS,
216                    )
217                    self.dataset.set_values(
218                        points_key, self.embeddings_vis[method_key].current_points
219                    )
220
221                else:
222                    logging.info("Computing vis.")
223                    self.embeddings_vis[method_key] = fob.compute_visualization(
224                        self.dataset,
225                        method=method,
226                        embeddings=self.embedding_key,
227                        seed=GLOBAL_SEED,
228                        brain_key=method_key,
229                        num_workers=NUM_WORKERS,
230                    )
231                    self.dataset.set_values(
232                        points_key, self.embeddings_vis[method_key].current_points
233                    )
234
235                    with open(vis_file_name, "wb") as f:
236                        pickle.dump(self.embeddings_vis[method_key].current_points, f)
237        else:
238            logging.warning(
239                "Model " + self.model_name + " does not provide embeddings."
240            )
241        end_time = time.time()
242        duration = end_time - start_time
243        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
244        self.steps += 1
245
246    def compute_similarity(self):
247        """Computes the similarity of embeddings for the dataset."""
248
249        start_time = time.time()
250        if self.similiarity_key in self.brains:
251            logging.info("Loading similarities from V51.")
252            self.similarities = self.dataset.load_brain_results(self.similiarity_key)
253
254        else:
255            logging.info("Computing similarities.")
256            self.similarities = fob.compute_similarity(
257                self.dataset,
258                embeddings=self.embeddings_model,
259                brain_key=self.similiarity_key,
260                num_workers=NUM_WORKERS,
261            )
262        end_time = time.time()
263        duration = end_time - start_time
264        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
265        self.steps += 1
266
267    def compute_representativeness(self, threshold):
268        """
269        Computes the representativeness of frames in the dataset.
270
271        References:
272            - https://docs.voxel51.com/brain.html#image-representativeness
273        """
274
275        start_time = time.time()
276        field = BRAIN_TAXONOMY["field"]
277        field_model = BRAIN_TAXONOMY["field_model"]
278        field_count = BRAIN_TAXONOMY["field_count"]
279        value = BRAIN_TAXONOMY["value_compute_representativeness"]
280        methods_cluster_center = ["cluster-center", "cluster-center-downweight"]
281
282        for method in tqdm(methods_cluster_center, desc="Representativeness"):
283            method_key = re.sub(
284                r"[\W-]+",
285                "_",
286                "representativeness_" + self.model_name + "_" + method,
287            )
288
289            if method_key in self.brains:
290                self.representativeness[method_key] = self.dataset.load_brain_results(
291                    method_key
292                )
293
294            logging.info("Computing representativeness.")
295            fob.compute_representativeness(
296                self.dataset,
297                representativeness_field=method_key,
298                method=method,
299                embeddings=self.embeddings_model,
300                num_workers=NUM_WORKERS,
301                progress=True,
302            )
303
304            # quant_threshold = self.dataset.quantiles(key, threshold)
305            # view = self.dataset.match(F(key) >= quant_threshold)
306            view = self.dataset.match(F(method_key) >= threshold)
307            for sample in view.iter_samples(progress=True, autosave=True):
308                if sample[field] is None:
309                    sample[field] = value
310                    sample[field_model] = self.model_name_key
311                    sample[field_count] += 1
312                else:
313                    sample[field_count] += 1
314        end_time = time.time()
315        duration = end_time - start_time
316        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
317        self.steps += 1
318
319    def compute_unique_images_greedy(self, perct_unique):
320        """
321        Computes a subset of unique images from the dataset using a greedy algorithm.
322
323        References:
324            - https://docs.voxel51.com/user_guide/brain.html#cifar-10-example
325        """
326
327        start_time = time.time()
328        sample_count = len(self.dataset.view())
329        num_of_unique = perct_unique * sample_count
330        field = BRAIN_TAXONOMY["field"]
331        field_model = BRAIN_TAXONOMY["field_model"]
332        field_count = BRAIN_TAXONOMY["field_count"]
333        value = BRAIN_TAXONOMY["value_find_unique"]
334
335        # Check if any sample has the label label_unique:
336        dataset_labels = self.dataset.count_sample_tags()
337        center_view = self.dataset.match(F(field) == value)
338
339        if field in dataset_labels and len(center_view) > 0:
340            logging.info("No unique images.")
341            pass
342
343        else:
344            self.similarities.find_unique(num_of_unique)
345            for unique_id in tqdm(
346                self.similarities.unique_ids, desc="Tagging unique images"
347            ):
348                sample = self.dataset[unique_id]
349                if sample[field] is None:
350                    sample[field] = value
351                    sample[field_model] = self.model_name_key
352                    sample[field_count] += 1
353                else:
354                    sample[field_count] += 1
355                sample.save()
356
357        end_time = time.time()
358        duration = end_time - start_time
359        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
360        self.steps += 1
361
362    def compute_unique_images_deterministic(self, threshold):
363        """
364        Computes a deterministic uniqueness score for each sample in the dataset.
365
366        References:
367            - https://docs.voxel51.com/api/fiftyone.brain.html#fiftyone.brain.compute_uniqueness
368        """
369
370        start_time = time.time()
371        field = BRAIN_TAXONOMY["field"]
372        field_model = BRAIN_TAXONOMY["field_model"]
373        field_count = BRAIN_TAXONOMY["field_count"]
374        value = BRAIN_TAXONOMY["value_compute_uniqueness"]
375
376        fob.compute_uniqueness(
377            self.dataset,
378            embeddings=self.embeddings_model,
379            uniqueness_field=self.uniqueness_key,
380            num_workers=NUM_WORKERS,
381        )
382
383        # quant_threshold = self.dataset.quantiles(key, threshold)
384        # view = self.dataset.match(F(key) >= quant_threshold)
385        view = self.dataset.match(F(self.uniqueness_key) >= threshold)
386        for sample in view.iter_samples(progress=True, autosave=True):
387            if sample[field] is None:
388                sample[field] = value
389                sample[field_model] = self.model_name_key
390                sample[field_count] += 1
391            else:
392                sample[field_count] += 1
393        end_time = time.time()
394        duration = end_time - start_time
395        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
396        self.steps += 1
397
398    def compute_similar_images(self, dist_threshold, neighbour_count):
399        """Computes and assigns similar images based on a distance threshold and neighbour count."""
400        start_time = time.time()
401        field = BRAIN_TAXONOMY["field"]
402        field_model = BRAIN_TAXONOMY["field_model"]
403        field_count = BRAIN_TAXONOMY["field_count"]
404        field_neighbour_distance = "distance"
405
406        value_find_unique = BRAIN_TAXONOMY["value_find_unique"]
407        value_compute_uniqueness = BRAIN_TAXONOMY["value_compute_uniqueness"]
408        value_compute_representativeness = BRAIN_TAXONOMY[
409            "value_compute_representativeness"
410        ]
411
412        value_find_unique_neighbour = BRAIN_TAXONOMY["value_find_unique_neighbour"]
413        value_compute_uniqueness_neighbour = BRAIN_TAXONOMY[
414            "value_compute_uniqueness_neighbour"
415        ]
416        value_compute_representativeness_neighbour = BRAIN_TAXONOMY[
417            "value_compute_representativeness_neighbour"
418        ]
419
420        # Check if samples have already assigned fields
421        dataset_labels = self.dataset.count_sample_tags()
422        neighbour_view_greedy = self.dataset.match(
423            F(field) == value_find_unique_neighbour
424        )
425        neighbour_view_deterministic = self.dataset.match(
426            F(field) == value_compute_uniqueness_neighbour
427        )
428        neighbour_view_representativeness = self.dataset.match(
429            F(field) == value_compute_representativeness_neighbour
430        )
431
432        if field in dataset_labels and (
433            len(neighbour_view_greedy) > 0
434            and len(neighbour_view_deterministic) > 0
435            and len(neighbour_view_representativeness) > 0
436        ):
437            pass
438
439        else:
440            unique_view_greedy = self.dataset.match(F(field) == value_find_unique)
441            unique_view_deterministic = self.dataset.match(
442                F(field) == value_compute_uniqueness
443            )
444            unique_view_representativeness = self.dataset.match(
445                F(field) == value_compute_representativeness
446            )
447
448            views_values = [
449                (unique_view_greedy, value_find_unique_neighbour),
450                (unique_view_deterministic, value_compute_uniqueness_neighbour),
451                (
452                    unique_view_representativeness,
453                    value_compute_representativeness_neighbour,
454                ),
455            ]
456
457            for unique_view, value in tqdm(views_values, desc="Tagging similar images"):
458                for sample in unique_view:
459                    view = self.dataset.sort_by_similarity(
460                        sample.id,
461                        k=neighbour_count,
462                        brain_key=self.similiarity_key,
463                        dist_field=field_neighbour_distance,
464                    )
465                    for sample_neighbour in view:
466                        distance = sample_neighbour[field_neighbour_distance]
467                        if distance < dist_threshold:
468                            if sample_neighbour[field] is None:
469                                sample_neighbour[field] = value
470                                sample_neighbour[field_model] = self.model_name_key
471                                sample_neighbour[field_count] += 1
472                            else:
473                                sample_neighbour[field_count] += 1
474                            sample_neighbour.save()
475
476        end_time = time.time()
477        duration = end_time - start_time
478        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
479        self.steps += 1

Class for computing and managing embeddings, uniqueness, and representations for dataset samples with https://docs.voxel51.com/brain.html.

EmbeddingSelection( dataset, dataset_info, model_name, log_dir, embeddings_path='./output/embeddings/')
 36    def __init__(
 37        self,
 38        dataset,
 39        dataset_info,
 40        model_name,
 41        log_dir,
 42        embeddings_path="./output/embeddings/",
 43    ):
 44        """Initialize the EmbeddingSelection with dataset, model, and configuration for embedding-based data selection."""
 45
 46        # WandB counter
 47        self.steps = 0
 48        self.dataset = dataset
 49        self.brains = dataset.list_brain_runs()
 50        self.dataset_name = dataset_info["name"]
 51        self.v51_model_zoo = foz.list_zoo_models()
 52        self.writer = SummaryWriter(log_dir=log_dir)
 53
 54        # Model
 55        if model_name not in self.v51_model_zoo:
 56            logging.warning(
 57                "Model " + model_name + " is not part of the V51 model zoo."
 58            )
 59        self.model = foz.load_zoo_model(model_name)
 60
 61        # Keys
 62        self.model_name = model_name
 63        self.model_name_key = re.sub(r"[\W-]+", "_", model_name)
 64        self.embedding_key = "embedding_" + self.model_name_key
 65        self.similiarity_key = "simil_" + self.model_name_key
 66        self.uniqueness_key = "uniqueness_" + self.model_name_key
 67
 68        # Storing variables
 69        self.embeddings_vis = {}  # Multiple methods per model
 70        self.representativeness = {}  # Multiple methods per model
 71        self.embeddings_model = None
 72        self.similarities = None
 73
 74        # Generate folder to store all embedding-related results
 75        self.embeddings_root = embeddings_path + self.dataset_name + "/"
 76        Path(self.embeddings_root).mkdir(parents=True, exist_ok=True)
 77
 78        self.hf_repo_name = (
 79            f"{HF_ROOT}/{self.dataset_name}_embedding_{self.model_name_key}"
 80        )
 81
 82        # Add fields to dataset
 83        add_sample_field(self.dataset, BRAIN_TAXONOMY["field"], fo.StringField)
 84        add_sample_field(self.dataset, BRAIN_TAXONOMY["field_model"], fo.StringField)
 85        # Float instead of Int for visualization style in UI, color gradient instead of color palette
 86        add_sample_field(self.dataset, BRAIN_TAXONOMY["field_count"], fo.FloatField)
 87
 88        # Init count for samples only once. Is intilized with None by add_sample_field
 89        test_sample = self.dataset.first()
 90        if test_sample[BRAIN_TAXONOMY["field_count"]] is None:
 91            logging.info("Setting all selection counts to 0")
 92            zeros = [0] * len(self.dataset)  # Needs to be an iterablr
 93            self.dataset.set_values(BRAIN_TAXONOMY["field_count"], zeros, progress=True)
 94
 95        # Determine if model was already used for selection
 96        self.model_already_used = False
 97        dataset_schema = self.dataset.get_field_schema()
 98        if BRAIN_TAXONOMY["field_model"] in dataset_schema:
 99            field_values = set(self.dataset.values(BRAIN_TAXONOMY["field_model"]))
100            if self.model_name_key in field_values:
101                self.model_already_used = True

Initialize the EmbeddingSelection with dataset, model, and configuration for embedding-based data selection.

steps
dataset
brains
dataset_name
v51_model_zoo
writer
model
model_name
model_name_key
embedding_key
similiarity_key
uniqueness_key
embeddings_vis
representativeness
embeddings_model
similarities
embeddings_root
hf_repo_name
model_already_used
def compute_embeddings(self, mode):
108    def compute_embeddings(self, mode):
109        """Computes and stores embeddings for the given model name. Uses V51 pre-defined dim. reduction methods."""
110        start_time = time.time()
111
112        dim_reduction_methods = list(fob.brain_config.visualization_methods.keys())
113        dim_reduction_methods.remove("manual")
114
115        embedding_file_name = self.embeddings_root + self.model_name_key + ".pkl"
116
117        if self.model.has_embeddings:
118            # Try to load models
119            load_models_successful = None
120            if mode == "load":
121                try:
122                    logging.info(
123                        f"Attempting to load embeddings for model {self.model_name_key}."
124                    )
125                    if self.dataset.get_field(self.embedding_key) is not None:
126                        logging.info("Loading embeddings from V51.")
127                        self.embeddings_model = self.dataset.values(self.embedding_key)
128                    elif os.path.exists(embedding_file_name):
129                        logging.info("Loading embeddings from disk.")
130                        with open(embedding_file_name, "rb") as f:
131                            self.embeddings_model = pickle.load(f)
132                        self.dataset.set_values(
133                            self.embedding_key, self.embeddings_model
134                        )
135                    else:
136                        logging.info(
137                            f"Downloading embeddings {self.hf_repo_name} from Hugging Face to {self.embeddings_root}"
138                        )
139                        model_name = f"{self.model_name_key}.pkl"
140                        embedding_file_name = hf_hub_download(
141                            repo_id=self.hf_repo_name,
142                            filename=model_name,
143                            local_dir=self.embeddings_root,
144                        )
145                        logging.info("Loading embeddings from disk.")
146                        with open(embedding_file_name, "rb") as f:
147                            self.embeddings_model = pickle.load(f)
148                        self.dataset.set_values(
149                            self.embedding_key, self.embeddings_model
150                        )
151                    load_models_successful = True
152                except Exception as e:
153                    logging.warning(f"Failed to load or download embeddings: {str(e)}")
154                    load_models_successful = False
155
156            if mode == "compute" or load_models_successful == False:
157                logging.info(f"Computing embeddings for model {self.model_name_key}.")
158                self.dataset.compute_embeddings(
159                    model=self.model, embeddings_field=self.embedding_key
160                )
161                self.embeddings_model = self.dataset.values(self.embedding_key)
162
163                self.dataset.set_values(self.embedding_key, self.embeddings_model)
164                with open(embedding_file_name, "wb") as f:
165                    pickle.dump(self.embeddings_model, f)
166
167                # Upload embeddings to Hugging Face
168                if HF_DO_UPLOAD == True:
169                    logging.info(
170                        f"Uploading embeddings to Hugging Face: {self.hf_repo_name}"
171                    )
172                    api = HfApi()
173                    api.create_repo(
174                        self.hf_repo_name,
175                        private=True,
176                        repo_type="model",
177                        exist_ok=True,
178                    )
179
180                    model_name = f"{self.model_name_key}.pkl"
181                    api.upload_file(
182                        path_or_fileobj=embedding_file_name,
183                        path_in_repo=model_name,
184                        repo_id=self.hf_repo_name,
185                        repo_type="model",
186                    )
187
188            if mode not in ["load", "compute"]:
189                logging.error(f"Mode {mode} is not supported.")
190
191            for method in tqdm(dim_reduction_methods, "Dimensionality reductions"):
192                method_key = self.model_name_key + "_" + re.sub(r"[\W-]+", "_", method)
193                points_key = "points_" + method_key
194                vis_file_name = self.embeddings_root + method_key + ".pkl"
195
196                if method_key in self.brains:
197                    logging.info("Loading vis from V51.")
198                    brain_info = self.dataset.get_brain_info(method_key)
199                    self.embeddings_vis[method_key] = self.dataset.load_brain_results(
200                        method_key
201                    )
202
203                elif os.path.exists(vis_file_name):
204                    logging.info("Loading vis from disk.")
205                    with open(vis_file_name, "rb") as f:
206                        points = pickle.load(f)
207
208                    self.embeddings_vis[method_key] = fob.compute_visualization(
209                        self.dataset,
210                        method=method,
211                        points=points,
212                        embeddings=self.embedding_key,
213                        seed=GLOBAL_SEED,
214                        brain_key=method_key,
215                        num_workers=NUM_WORKERS,
216                    )
217                    self.dataset.set_values(
218                        points_key, self.embeddings_vis[method_key].current_points
219                    )
220
221                else:
222                    logging.info("Computing vis.")
223                    self.embeddings_vis[method_key] = fob.compute_visualization(
224                        self.dataset,
225                        method=method,
226                        embeddings=self.embedding_key,
227                        seed=GLOBAL_SEED,
228                        brain_key=method_key,
229                        num_workers=NUM_WORKERS,
230                    )
231                    self.dataset.set_values(
232                        points_key, self.embeddings_vis[method_key].current_points
233                    )
234
235                    with open(vis_file_name, "wb") as f:
236                        pickle.dump(self.embeddings_vis[method_key].current_points, f)
237        else:
238            logging.warning(
239                "Model " + self.model_name + " does not provide embeddings."
240            )
241        end_time = time.time()
242        duration = end_time - start_time
243        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
244        self.steps += 1

Computes and stores embeddings for the given model name. Uses V51 pre-defined dim. reduction methods.

def compute_similarity(self):
246    def compute_similarity(self):
247        """Computes the similarity of embeddings for the dataset."""
248
249        start_time = time.time()
250        if self.similiarity_key in self.brains:
251            logging.info("Loading similarities from V51.")
252            self.similarities = self.dataset.load_brain_results(self.similiarity_key)
253
254        else:
255            logging.info("Computing similarities.")
256            self.similarities = fob.compute_similarity(
257                self.dataset,
258                embeddings=self.embeddings_model,
259                brain_key=self.similiarity_key,
260                num_workers=NUM_WORKERS,
261            )
262        end_time = time.time()
263        duration = end_time - start_time
264        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
265        self.steps += 1

Computes the similarity of embeddings for the dataset.

def compute_representativeness(self, threshold):
267    def compute_representativeness(self, threshold):
268        """
269        Computes the representativeness of frames in the dataset.
270
271        References:
272            - https://docs.voxel51.com/brain.html#image-representativeness
273        """
274
275        start_time = time.time()
276        field = BRAIN_TAXONOMY["field"]
277        field_model = BRAIN_TAXONOMY["field_model"]
278        field_count = BRAIN_TAXONOMY["field_count"]
279        value = BRAIN_TAXONOMY["value_compute_representativeness"]
280        methods_cluster_center = ["cluster-center", "cluster-center-downweight"]
281
282        for method in tqdm(methods_cluster_center, desc="Representativeness"):
283            method_key = re.sub(
284                r"[\W-]+",
285                "_",
286                "representativeness_" + self.model_name + "_" + method,
287            )
288
289            if method_key in self.brains:
290                self.representativeness[method_key] = self.dataset.load_brain_results(
291                    method_key
292                )
293
294            logging.info("Computing representativeness.")
295            fob.compute_representativeness(
296                self.dataset,
297                representativeness_field=method_key,
298                method=method,
299                embeddings=self.embeddings_model,
300                num_workers=NUM_WORKERS,
301                progress=True,
302            )
303
304            # quant_threshold = self.dataset.quantiles(key, threshold)
305            # view = self.dataset.match(F(key) >= quant_threshold)
306            view = self.dataset.match(F(method_key) >= threshold)
307            for sample in view.iter_samples(progress=True, autosave=True):
308                if sample[field] is None:
309                    sample[field] = value
310                    sample[field_model] = self.model_name_key
311                    sample[field_count] += 1
312                else:
313                    sample[field_count] += 1
314        end_time = time.time()
315        duration = end_time - start_time
316        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
317        self.steps += 1

Computes the representativeness of frames in the dataset.

References: - https://docs.voxel51.com/brain.html#image-representativeness

def compute_unique_images_greedy(self, perct_unique):
319    def compute_unique_images_greedy(self, perct_unique):
320        """
321        Computes a subset of unique images from the dataset using a greedy algorithm.
322
323        References:
324            - https://docs.voxel51.com/user_guide/brain.html#cifar-10-example
325        """
326
327        start_time = time.time()
328        sample_count = len(self.dataset.view())
329        num_of_unique = perct_unique * sample_count
330        field = BRAIN_TAXONOMY["field"]
331        field_model = BRAIN_TAXONOMY["field_model"]
332        field_count = BRAIN_TAXONOMY["field_count"]
333        value = BRAIN_TAXONOMY["value_find_unique"]
334
335        # Check if any sample has the label label_unique:
336        dataset_labels = self.dataset.count_sample_tags()
337        center_view = self.dataset.match(F(field) == value)
338
339        if field in dataset_labels and len(center_view) > 0:
340            logging.info("No unique images.")
341            pass
342
343        else:
344            self.similarities.find_unique(num_of_unique)
345            for unique_id in tqdm(
346                self.similarities.unique_ids, desc="Tagging unique images"
347            ):
348                sample = self.dataset[unique_id]
349                if sample[field] is None:
350                    sample[field] = value
351                    sample[field_model] = self.model_name_key
352                    sample[field_count] += 1
353                else:
354                    sample[field_count] += 1
355                sample.save()
356
357        end_time = time.time()
358        duration = end_time - start_time
359        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
360        self.steps += 1

Computes a subset of unique images from the dataset using a greedy algorithm.

References: - https://docs.voxel51.com/user_guide/brain.html#cifar-10-example

def compute_unique_images_deterministic(self, threshold):
362    def compute_unique_images_deterministic(self, threshold):
363        """
364        Computes a deterministic uniqueness score for each sample in the dataset.
365
366        References:
367            - https://docs.voxel51.com/api/fiftyone.brain.html#fiftyone.brain.compute_uniqueness
368        """
369
370        start_time = time.time()
371        field = BRAIN_TAXONOMY["field"]
372        field_model = BRAIN_TAXONOMY["field_model"]
373        field_count = BRAIN_TAXONOMY["field_count"]
374        value = BRAIN_TAXONOMY["value_compute_uniqueness"]
375
376        fob.compute_uniqueness(
377            self.dataset,
378            embeddings=self.embeddings_model,
379            uniqueness_field=self.uniqueness_key,
380            num_workers=NUM_WORKERS,
381        )
382
383        # quant_threshold = self.dataset.quantiles(key, threshold)
384        # view = self.dataset.match(F(key) >= quant_threshold)
385        view = self.dataset.match(F(self.uniqueness_key) >= threshold)
386        for sample in view.iter_samples(progress=True, autosave=True):
387            if sample[field] is None:
388                sample[field] = value
389                sample[field_model] = self.model_name_key
390                sample[field_count] += 1
391            else:
392                sample[field_count] += 1
393        end_time = time.time()
394        duration = end_time - start_time
395        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
396        self.steps += 1

Computes a deterministic uniqueness score for each sample in the dataset.

References: - https://docs.voxel51.com/api/fiftyone.brain.html#fiftyone.brain.compute_uniqueness

def compute_similar_images(self, dist_threshold, neighbour_count):
398    def compute_similar_images(self, dist_threshold, neighbour_count):
399        """Computes and assigns similar images based on a distance threshold and neighbour count."""
400        start_time = time.time()
401        field = BRAIN_TAXONOMY["field"]
402        field_model = BRAIN_TAXONOMY["field_model"]
403        field_count = BRAIN_TAXONOMY["field_count"]
404        field_neighbour_distance = "distance"
405
406        value_find_unique = BRAIN_TAXONOMY["value_find_unique"]
407        value_compute_uniqueness = BRAIN_TAXONOMY["value_compute_uniqueness"]
408        value_compute_representativeness = BRAIN_TAXONOMY[
409            "value_compute_representativeness"
410        ]
411
412        value_find_unique_neighbour = BRAIN_TAXONOMY["value_find_unique_neighbour"]
413        value_compute_uniqueness_neighbour = BRAIN_TAXONOMY[
414            "value_compute_uniqueness_neighbour"
415        ]
416        value_compute_representativeness_neighbour = BRAIN_TAXONOMY[
417            "value_compute_representativeness_neighbour"
418        ]
419
420        # Check if samples have already assigned fields
421        dataset_labels = self.dataset.count_sample_tags()
422        neighbour_view_greedy = self.dataset.match(
423            F(field) == value_find_unique_neighbour
424        )
425        neighbour_view_deterministic = self.dataset.match(
426            F(field) == value_compute_uniqueness_neighbour
427        )
428        neighbour_view_representativeness = self.dataset.match(
429            F(field) == value_compute_representativeness_neighbour
430        )
431
432        if field in dataset_labels and (
433            len(neighbour_view_greedy) > 0
434            and len(neighbour_view_deterministic) > 0
435            and len(neighbour_view_representativeness) > 0
436        ):
437            pass
438
439        else:
440            unique_view_greedy = self.dataset.match(F(field) == value_find_unique)
441            unique_view_deterministic = self.dataset.match(
442                F(field) == value_compute_uniqueness
443            )
444            unique_view_representativeness = self.dataset.match(
445                F(field) == value_compute_representativeness
446            )
447
448            views_values = [
449                (unique_view_greedy, value_find_unique_neighbour),
450                (unique_view_deterministic, value_compute_uniqueness_neighbour),
451                (
452                    unique_view_representativeness,
453                    value_compute_representativeness_neighbour,
454                ),
455            ]
456
457            for unique_view, value in tqdm(views_values, desc="Tagging similar images"):
458                for sample in unique_view:
459                    view = self.dataset.sort_by_similarity(
460                        sample.id,
461                        k=neighbour_count,
462                        brain_key=self.similiarity_key,
463                        dist_field=field_neighbour_distance,
464                    )
465                    for sample_neighbour in view:
466                        distance = sample_neighbour[field_neighbour_distance]
467                        if distance < dist_threshold:
468                            if sample_neighbour[field] is None:
469                                sample_neighbour[field] = value
470                                sample_neighbour[field_model] = self.model_name_key
471                                sample_neighbour[field_count] += 1
472                            else:
473                                sample_neighbour[field_count] += 1
474                            sample_neighbour.save()
475
476        end_time = time.time()
477        duration = end_time - start_time
478        self.writer.add_scalar("brain/duration_in_seconds", duration, self.steps)
479        self.steps += 1

Computes and assigns similar images based on a distance threshold and neighbour count.