utils.dataset_loader
```python
import datetime
import logging
import os
import re
from typing import List, Union
from glob import glob
import numpy as np
from PIL import Image

import fiftyone as fo
import yaml
from fiftyone.utils.huggingface import load_from_hub
from nuscenes.nuscenes import NuScenes

from config.config import ACCEPTED_SPLITS, GLOBAL_SEED, NUM_WORKERS, PERSISTENT
from utils.custom_view import max_detections, subset_splits, vru_mcity_fisheye
from utils.sample_field_operations import rename_sample_field


def get_supported_datasets(config_path="config/datasets.yaml"):
    """Returns a list of supported dataset names from the config file."""
    try:
        with open(config_path, "r") as file:
            config = yaml.safe_load(file)

        return [dataset["name"] for dataset in config["datasets"]]
    except Exception as e:
        logging.error(f"Available datasets could not be retrieved: {e}")


def load_dataset(selected_dataset: dict, n_iteration=0):
    """Loads a dataset by name, optionally reducing it to a requested number of samples
    while maintaining the original split distribution. Returns a tuple of the dataset
    (or view) and its dataset_info entry."""
    dataset_info = load_dataset_info(selected_dataset["name"])
    dataset = None

    if dataset_info:
        loader_function = dataset_info.get("loader_fct")
        dataset = globals()[loader_function](dataset_info)
        n_samples_original = len(dataset)
        n_samples_requested = selected_dataset["n_samples"]
        custom_view_requested = selected_dataset["custom_view"]

        if (
            n_samples_requested is not None
            and n_samples_requested <= n_samples_original
        ):
            logging.info("Dataset reduction in progress.")
            # Make sure that the reduced dataset has samples from every available split
            split_views = []

            # Get split distribution
            tags_count_dataset_dict = dataset.count_sample_tags()
            for tag in tags_count_dataset_dict:
                if tag in ACCEPTED_SPLITS:
                    count = tags_count_dataset_dict[tag]
                    percentage = count / n_samples_original
                    n_split_samples = int(n_samples_requested * percentage)
                    logging.info(f"Split {tag}: {n_split_samples} samples")

                    split_view = dataset.match_tags(tag).limit(n_split_samples)
                    split_views.append(split_view)

            # Concatenate views properly
            if split_views:
                combined_view = split_views[0]
                for view in split_views[1:]:
                    combined_view = combined_view.concat(view)

                # Fill dataset if smaller than requested
                if len(combined_view) < n_samples_requested:
                    n_samples_needed = n_samples_requested - len(combined_view)
                    view_random = dataset.take(n_samples_needed, seed=GLOBAL_SEED)
                    combined_view = combined_view.concat(view_random)

                logging.warning(
                    f"Dataset size was reduced from {len(dataset)} to {len(combined_view)} samples."
                )
                return combined_view, dataset_info

        elif custom_view_requested is not None:
            try:
                logging.warning(f"Applying custom view {custom_view_requested}.")
                dataset_view = globals()[custom_view_requested](dataset, n_iteration)
                return dataset_view, dataset_info
            except Exception as e:
                logging.error(
                    f"Calling the custom view {custom_view_requested} failed: {e}"
                )

    else:
        logging.error(
            str(selected_dataset["name"])
            + " is not a valid dataset name. Check supported datasets in datasets.yaml."
        )

    return dataset, dataset_info


def get_split(v51_sample: Union[fo.core.sample.Sample, List[str]]) -> str:
    """Gets dataset split (train, val, test) from a sample's tags or list of tags."""
    if isinstance(v51_sample, fo.core.sample.Sample):
        sample_tags = v51_sample.tags
    elif isinstance(v51_sample, list):
        sample_tags = v51_sample
    else:
        logging.error(
            f"Type {type(v51_sample)} is not supported for split retrieval."
        )
        return None

    found_splits = [split for split in ACCEPTED_SPLITS if split in sample_tags]

    if len(found_splits) == 0:
        logging.warning(f"No split found in sample tags: {sample_tags}")
        return None
    elif len(found_splits) > 1:
        logging.warning(f"Multiple splits found in sample tags: '{found_splits}'")
        return None
    else:
        split = found_splits[0]
        return split


def _separate_split(dataset, current_split, new_split, split_ratio=2):
    """Separates a portion of samples from the current split in the dataset and assigns them to a new split."""
    # Select samples for split change
    view_current_split = dataset.match_tags(current_split)
    n_samples_current_split = len(view_current_split)
    view_new_split = view_current_split.take(
        int(n_samples_current_split / split_ratio), seed=GLOBAL_SEED
    )
    view_new_split.tag_samples(new_split)
    view_new_split.untag_samples(current_split)

    # Get number of samples in each split
    view_current_split_changed = dataset.match_tags(current_split)
    n_samples_current_split_changed = len(view_current_split_changed)
    view_new_split = dataset.match_tags(new_split)
    n_samples_new_split = len(view_new_split)

    return n_samples_current_split, n_samples_current_split_changed, n_samples_new_split


def _align_splits(dataset):
    """Standardizes dataset splits by renaming and creating missing splits (train/val/test) as needed."""
    SUPPORTED_SPLITS = ["train", "training", "val", "validation", "test", "testing"]
    tags = dataset.distinct("tags")
    splits = [tag for tag in tags if tag in SUPPORTED_SPLITS]

    # Rename splits if necessary
    rename_mapping = {"training": "train", "validation": "val", "testing": "test"}

    for old_tag, new_tag in rename_mapping.items():
        if old_tag in splits:
            dataset.rename_tag(old_tag, new_tag)
            splits = [tag if tag != old_tag else new_tag for tag in splits]

    # If only val or only test exists, create the missing split
    if "val" in splits and "test" not in splits:
        (
            n_samples_current_split,
            n_samples_current_split_changed,
            n_samples_new_split,
        ) = _separate_split(dataset, current_split="val", new_split="test")
        logging.warning(
            f"Dataset had no 'test' split. Split {n_samples_current_split} 'val' into {n_samples_current_split_changed} 'val' and {n_samples_new_split} 'test'."
        )

    elif "test" in splits and "val" not in splits:
        (
            n_samples_current_split,
            n_samples_current_split_changed,
            n_samples_new_split,
        ) = _separate_split(dataset, current_split="test", new_split="val")
        logging.warning(
            f"Dataset had no 'val' split. Split {n_samples_current_split} 'test' into {n_samples_current_split_changed} 'val' and {n_samples_new_split} 'test'."
        )

    if "train" in splits and "test" not in splits and "val" not in splits:
        logging.warning(
            "Found 'train' split, but 'test' and 'val' splits are missing. Training might fail."
        )

    # Log available splits
    tags = dataset.distinct("tags")
    splits = [tag for tag in tags if tag in ACCEPTED_SPLITS]
    logging.info(f"Available splits: {splits}")

    return splits


def _align_ground_truth(dataset, gt_field="ground_truth"):
    """Ensures the dataset has a correctly named ground truth field, renaming a single label field if found."""

    dataset_fields = dataset.get_field_schema()
    if gt_field not in dataset_fields:
        FIFTYONE_DEFAULT_FIELDS = [
            "id",
            "filepath",
            "tags",
            "metadata",
            "created_at",
            "last_modified_at",
        ]
        non_default_fields = {
            k: v for k, v in dataset_fields.items() if k not in FIFTYONE_DEFAULT_FIELDS
        }
        label_fields = {
            k: v
            for k, v in non_default_fields.items()
            if isinstance(v, fo.EmbeddedDocumentField)
            and issubclass(v.document_type, fo.core.labels.Label)
        }
        if len(label_fields) == 1:
            gt_label_old = next(iter(label_fields))
            rename_sample_field(dataset, gt_label_old, gt_field)
            logging.warning(
                f"Label field '{gt_label_old}' renamed to '{gt_field}' for training."
            )
        elif len(label_fields) > 1:
            logging.warning(
                f"The dataset has {len(label_fields)} fields with detections: {label_fields}. Rename one to '{gt_field}' with the command 'dataset.rename_sample_field(<your_field>, {gt_field})' to use it for training."
            )


def _post_process_dataset(dataset):
    """Post-processes the dataset by setting persistence, computing metadata, aligning splits, and aligning ground truth."""
    logging.info("Running dataset post-processing.")

    # Set persistence
    # https://docs.voxel51.com/user_guide/using_datasets.html#dataset-persistence
    dataset.persistent = PERSISTENT

    # Compute metadata
    dataset.compute_metadata(num_workers=NUM_WORKERS, overwrite=False, progress=True)

    # Align split names
    _align_splits(dataset)

    # Align ground truth field
    _align_ground_truth(dataset)

    return dataset


def load_dataset_info(dataset_name, config_path="./config/datasets.yaml"):
    """Loads dataset information from a YAML configuration file."""
    logging.info(f"Currently active V51 datasets: {fo.list_datasets()}")
    with open(config_path) as f:
        datasets_config = yaml.safe_load(f)

    datasets = datasets_config["datasets"]
    dataset_info = next((ds for ds in datasets if ds["name"] == dataset_name), None)

    if dataset_info:
        return dataset_info
    else:
        return None


def load_annarbor_rolling(dataset_info):
    """Loads the Ann Arbor rolling dataset from local storage into FiftyOne, creating a new dataset if it doesn't exist."""
    dataset_name = dataset_info["name"]
    dataset_dir = dataset_info["local_path"]
    dataset_type = getattr(fo.types, dataset_info["v51_type"])

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    else:
        dataset = fo.Dataset(dataset_name)
        dataset.add_dir(
            dataset_dir=dataset_dir,
            dataset_type=dataset_type,
        )
        _post_process_dataset(dataset)

    return dataset


def load_mcity_fisheye_2000(dataset_info):
    """Loads the MCityFisheye2000 dataset from a local path or Hugging Face, creating or loading a FiftyOne dataset."""
    dataset_name = dataset_info["name"]
    dataset_dir = dataset_info["local_path"]
    hf_dataset_name = dataset_info.get("hf_dataset_name", None)
    dataset_type = getattr(fo.types, dataset_info["v51_type"])
    dataset_splits = dataset_info["v51_splits"]

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    elif hf_dataset_name is not None:
        # Read API key for HF access
        hf_token = None
        try:
            with open(".secret", "r") as f:
                for line in f:
                    if line.startswith("HF_TOKEN="):
                        hf_token = line.split("=")[1].strip()
        except FileNotFoundError:
            logging.error(
                "'.secret' file not found. Please create it to load private datasets."
            )
            hf_token = None

        if hf_token is None:
            logging.error(
                "Provide your Hugging Face 'HF_TOKEN' in the .secret file to load private datasets."
            )
        dataset = load_from_hub(hf_dataset_name, name=dataset_name, token=hf_token)
        _post_process_dataset(dataset)
    else:
        dataset = fo.Dataset(dataset_name)
        for split in dataset_splits:
            dataset.add_dir(
                dataset_dir=dataset_dir,
                dataset_type=dataset_type,
                split=split,
                tags=split,
            )

        # Add dataset-specific metadata based on the filename
        for sample in dataset.iter_samples(progress=True, autosave=True):
            metadata = _process_mcity_fisheye_filename(sample["filepath"])
            sample["location"] = metadata["location"]
            sample["name"] = metadata["name"]
            sample["timestamp"] = metadata["timestamp"]

        _post_process_dataset(dataset)

    return dataset


def load_dataset_from_hf_hub(dataset_info):
    """Loads a dataset from the Hugging Face Hub, or locally if it already exists."""
    dataset_name = dataset_info["name"]
    hf_dataset_name = dataset_info["hf_dataset_name"]

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    else:
        # Read API key for HF access
        hf_token = None
        try:
            with open(".secret", "r") as f:
                for line in f:
                    if line.startswith("HF_TOKEN="):
                        hf_token = line.split("=")[1].strip()
        except FileNotFoundError:
            logging.error(
                "'.secret' file not found. Please create it to load private datasets."
            )
            hf_token = None

        if hf_token is None:
            logging.error(
                "Provide your Hugging Face 'HF_TOKEN' in the .secret file to load private datasets."
            )
        dataset = load_from_hub(hf_dataset_name, name=dataset_name, token=hf_token)
        _post_process_dataset(dataset)

    return dataset


def _process_mcity_fisheye_filename(filename):
    """Processes an Mcity fisheye camera filename to extract location, name, and timestamp information."""

    filename = os.path.basename(filename)
    results = {"filename": filename, "location": None, "name": None, "timestamp": None}

    # TODO Check if some locations are duplicated (e.g. beal vs gs_Plymouth_Beal)
    available_locations = [
        "beal",
        "bishop",
        "georgetown",
        "gridsmart_ne",
        "gridsmart_nw",
        "gridsmart_se",
        "gridsmart_sw",
        "Huron_Plymouth-Geddes",
        "Main_stadium",
        "gs_Geddes_Huron",
        "gs_Huron_Plymouth",
        "gs_Plymouth_Beal",
        "gs_Plymouth_Georgetown",
        "gs_Plymouth_Bishop",
        "gs_Plymouth_EPA",
    ]

    for location in available_locations:
        if location in filename:
            results["location"] = location
            break

    if results["location"] is None:
        logging.error(f"Filename {filename} could not be assigned to a known location")

    # Split the string into two parts at the first four-digit year
    match = re.search(r"\d{4}", filename)
    if match:
        year_index = match.start()
        part1 = filename[:year_index]
        part2 = filename[year_index:]

        # Clean up the first part
        results["name"] = re.sub(r"[-_]+$", "", part1)

        # Extract the timestamp from the second part
        match = re.search(r"\d{8}T\d{6}|\d{4}-\d{2}-\d{2}[_ ]\d{2}-\d{2}-\d{2}", part2)
        if match:
            extracted_timestamp = match.group(0)

            if re.match(r"\d{8}T\d{6}", extracted_timestamp):
                results["timestamp"] = datetime.datetime.strptime(
                    extracted_timestamp, "%Y%m%dT%H%M%S"
                )
            elif re.match(r"\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}", extracted_timestamp):
                results["timestamp"] = datetime.datetime.strptime(
                    extracted_timestamp, "%Y-%m-%d_%H-%M-%S"
                )
            elif re.match(r"\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}", extracted_timestamp):
                results["timestamp"] = datetime.datetime.strptime(
                    extracted_timestamp, "%Y-%m-%d %H-%M-%S"
                )
            else:
                logging.error(f"Unknown timestamp format: {extracted_timestamp}")
        else:
            logging.error(f"No valid timestamp found in string: {part2}")

    return results


def load_mcity_fisheye_3_months(dataset_info):
    """Loads or creates a FiftyOne dataset for the Mcity fisheye 3-month dataset using the provided dataset info."""

    dataset_name = dataset_info["name"]
    dataset_dir = dataset_info["local_path"]
    dataset_type = getattr(fo.types, dataset_info["v51_type"])
    dataset_splits = dataset_info["v51_splits"]  # Use all available splits

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    else:
        dataset = fo.Dataset(dataset_name)
        for split in dataset_splits:
            dataset.add_dir(
                dataset_dir=dataset_dir,
                dataset_type=dataset_type,
                split=split,
                tags=split,
            )

        # Add dataset-specific metadata based on the filename
        for sample in dataset.iter_samples(progress=True, autosave=True):
            metadata = _process_mcity_fisheye_filename(sample["filepath"])
            sample["location"] = metadata["location"]
            sample["name"] = metadata["name"]
            sample["timestamp"] = metadata["timestamp"]

        _post_process_dataset(dataset)

    return dataset


def load_fisheye_8k(dataset_info):
    """Loads the Fisheye8K dataset from FiftyOne, downloading it from Hugging Face if it doesn't exist locally."""

    dataset_name = dataset_info["name"]
    hf_dataset_name = dataset_info["hf_dataset_name"]

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    else:
        dataset = load_from_hub(hf_dataset_name, name=dataset_name)
        _post_process_dataset(dataset)

    return dataset


def load_mars_multiagent(dataset_info):
    """Loads the MARS multi-agent dataset from Hugging Face."""
    hugging_face_id = "ai4ce/MARS/Multiagent_53scene"

    dataset = None  # TODO Implement loading
    _post_process_dataset(dataset)

    return dataset


def load_mars_multitraversal(dataset_info):
    """Loads and post-processes the multi-traversal MARS dataset from the specified location."""
    location = 10
    data_root = "./datasets/MARS/Multitraversal_2023_10_04-2024_03_08"
    nusc = NuScenes(version="v1.0", dataroot=f"{data_root}/{location}", verbose=True)

    dataset = None  # TODO Implement loading
    _post_process_dataset(dataset)

    return dataset


def load_sunrgbd(dataset_info):
    """Loads the SUN RGB-D dataset from local storage, pairing each RGB image with its depth map stored as a FiftyOne heatmap."""
    dataset_name = dataset_info["name"]
    dataset_root = dataset_info["local_path"]

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info(f"Existing dataset {dataset_name} was loaded.")
    else:
        dataset = fo.Dataset(dataset_name)

        scene_dirs = glob(os.path.join(dataset_root, "k*/*/*"))
        samples = []
        for scene_dir in scene_dirs:
            image_files = glob(os.path.join(scene_dir, "image", "*"))
            depth_files = glob(os.path.join(scene_dir, "depth_bfx", "*"))

            if not image_files or not depth_files:
                continue

            image_path = image_files[0]
            depth_path = depth_files[0]

            depth_map = np.array(Image.open(depth_path))
            if depth_map.max() > 0:
                depth_map = (depth_map * 255 / depth_map.max()).astype("uint8")

            sample = fo.Sample(
                filepath=image_path,
                gt_depth=fo.Heatmap(map=depth_map),
            )
            samples.append(sample)

        dataset.add_samples(samples)
        dataset = _post_process_dataset(dataset)

    return dataset
```
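As a concrete illustration of the filename parsing used by the Mcity loaders above, the sketch below runs a made-up path through `_process_mcity_fisheye_filename`; the filename is purely hypothetical and only chosen to match one of the known locations and the compact `YYYYMMDDTHHMMSS` timestamp pattern.

```python
from utils.dataset_loader import _process_mcity_fisheye_filename

# Hypothetical filename, constructed to match the parser's location list and
# its compact timestamp format; real Mcity filenames may differ.
meta = _process_mcity_fisheye_filename(
    "/data/frames/gs_Plymouth_Bishop_20231004T123456.jpg"
)
print(meta["location"])   # "gs_Plymouth_Bishop"
print(meta["name"])       # "gs_Plymouth_Bishop" (prefix before the year, trailing "_" stripped)
print(meta["timestamp"])  # datetime.datetime(2023, 10, 4, 12, 34, 56)
```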
get_supported_datasets(config_path="config/datasets.yaml")
    Returns a list of supported dataset names from the config file.
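A minimal usage sketch, assuming the repository's `config/datasets.yaml` is present; the printed names depend entirely on that file.

```python
from utils.dataset_loader import get_supported_datasets

# Uses the default config path ("config/datasets.yaml")
names = get_supported_datasets()
if names:
    print(f"{len(names)} supported datasets: {names}")
else:
    # The function logs an error and returns None if the config cannot be read
    print("No datasets could be retrieved.")
```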
load_dataset(selected_dataset: dict, n_iteration=0)
    Loads a dataset by name, optionally reducing it to a requested number of samples while maintaining the original split distribution. Returns a tuple of the dataset (or view) and its dataset_info entry.
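Despite the parameter name, callers pass a dict rather than a bare string; `name`, `n_samples`, and `custom_view` are the keys the function reads. A sketch, where the dataset name is illustrative and must exist in `config/datasets.yaml`:

```python
from utils.dataset_loader import load_dataset

selected_dataset = {
    "name": "fisheye_8k",   # illustrative; must match an entry in datasets.yaml
    "n_samples": 1000,      # reduce to ~1000 samples while keeping split ratios
    "custom_view": None,    # or the name of a custom view function
}

dataset_view, dataset_info = load_dataset(selected_dataset)
print(len(dataset_view), dataset_info["loader_fct"])
```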
get_split(v51_sample: Union[fo.core.sample.Sample, List[str]]) -> str
    Gets dataset split (train, val, test) from a sample's tags or list of tags.
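The argument can be a FiftyOne sample or a plain list of tags; a short sketch using a tag list (assuming "train" is one of the splits in ACCEPTED_SPLITS):

```python
from utils.dataset_loader import get_split

# A plain list of tags works...
print(get_split(["train", "camera_3"]))  # -> "train"

# ...as does a sample taken from a loaded dataset:
# sample = dataset.first()
# print(get_split(sample))
```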
load_dataset_info(dataset_name, config_path="./config/datasets.yaml")
    Loads dataset information from a YAML configuration file.
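The returned dict is one entry of `datasets.yaml`; its exact schema is defined by that file. The keys probed below are the ones the loaders in this module access, and the dataset name is illustrative.

```python
from utils.dataset_loader import load_dataset_info

info = load_dataset_info("mcity_fisheye_2000")  # illustrative name
if info is None:
    print("Unknown dataset name.")
else:
    # Keys read by the loaders; their presence depends on the YAML entry
    print(info["name"], info["loader_fct"])
    print(info.get("local_path"), info.get("hf_dataset_name"))
    print(info.get("v51_type"), info.get("v51_splits"))
```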
load_annarbor_rolling(dataset_info)
    Loads the Ann Arbor rolling dataset from local storage into FiftyOne, creating a new dataset if it doesn't exist.
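This loader is normally invoked indirectly through `load_dataset()`, but it can be driven directly with a hand-built `dataset_info` dict; the path and importer type below are placeholders, not the real configuration.

```python
from utils.dataset_loader import load_annarbor_rolling

dataset_info = {
    "name": "annarbor_rolling",                  # placeholder dataset name
    "local_path": "/datasets/annarbor_rolling",  # placeholder directory
    "v51_type": "ImageDirectory",                # resolved via getattr(fo.types, ...)
}
dataset = load_annarbor_rolling(dataset_info)
print(dataset.name, len(dataset))
```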
load_mcity_fisheye_2000(dataset_info)
    Loads the MCityFisheye2000 dataset from a local path or Hugging Face, creating or loading a FiftyOne dataset.
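When loading from the Hub, the function reads the token from a `.secret` file in the working directory (a single `HF_TOKEN=` line). A sketch with placeholder values; the real entry lives in `config/datasets.yaml`.

```python
from utils.dataset_loader import load_mcity_fisheye_2000

# Expected .secret file in the working directory (placeholder token):
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxx

dataset_info = {
    "name": "mcity_fisheye_2000",
    "local_path": "/datasets/mcity_fisheye_2000",      # used only by the local branch
    "hf_dataset_name": "some-org/mcity-fisheye-2000",  # placeholder Hub repo
    "v51_type": "FiftyOneDataset",                     # placeholder importer type
    "v51_splits": ["train", "val"],
}
dataset = load_mcity_fisheye_2000(dataset_info)
```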
load_dataset_from_hf_hub(dataset_info)
    Loads a dataset from the Hugging Face Hub, or locally if it already exists.
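For public Hub datasets, a missing token only triggers an error log and the download still proceeds with `token=None`; a sketch with a placeholder repository name:

```python
from utils.dataset_loader import load_dataset_from_hf_hub

# Only "name" and "hf_dataset_name" are read by this loader
dataset_info = {
    "name": "my_hub_dataset",                        # placeholder FiftyOne name
    "hf_dataset_name": "some-org/some-v51-dataset",  # placeholder Hub repo
}
dataset = load_dataset_from_hf_hub(dataset_info)
print(len(dataset))
```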
load_mcity_fisheye_3_months(dataset_info)
    Loads or creates a FiftyOne dataset for the Mcity fisheye 3-month dataset using the provided dataset info.
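After loading, every sample carries `location`, `name`, and `timestamp` fields parsed from its filename, so the dataset can be filtered and sorted on them. A sketch, assuming the dataset has already been created under an illustrative name:

```python
import fiftyone as fo
from fiftyone import ViewField as F

# Assumes load_mcity_fisheye_3_months() has already created this dataset
dataset = fo.load_dataset("mcity_fisheye_3_months")  # illustrative dataset name

# Samples from one camera location, newest first
view = dataset.match(F("location") == "gs_Plymouth_Bishop").sort_by(
    "timestamp", reverse=True
)
print(len(view))
```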
load_fisheye_8k(dataset_info)
    Loads the Fisheye8K dataset from FiftyOne, downloading it from Hugging Face if it doesn't exist locally.
load_mars_multiagent(dataset_info)
    Loads the MARS multi-agent dataset from Hugging Face. Loading is not yet implemented (see the TODO in the source).
load_mars_multitraversal(dataset_info)
    Loads and post-processes the multi-traversal MARS dataset from the specified location. Loading is not yet implemented (see the TODO in the source).
load_sunrgbd(dataset_info)
    Loads the SUN RGB-D dataset from local storage, pairing each RGB image with its depth map stored as a FiftyOne heatmap.
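The loader expects a SUN RGB-D style layout with `image/` and `depth_bfx/` subfolders under each scene directory matched by the `k*/*/*` glob; the path and name below are placeholders.

```python
from utils.dataset_loader import load_sunrgbd

# Expected layout under local_path (matched by the k*/*/* glob), for example:
#   <local_path>/kv1/NYUdata/<scene>/image/*.jpg
#   <local_path>/kv1/NYUdata/<scene>/depth_bfx/*.png
dataset_info = {
    "name": "sunrgbd",                  # placeholder FiftyOne dataset name
    "local_path": "/datasets/SUNRGBD",  # placeholder path to the extracted dataset
}
dataset = load_sunrgbd(dataset_info)
print(dataset)
```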