utils.dataset_loader

import datetime
import logging
import os
import re
from typing import List, Union
from glob import glob

import numpy as np
from PIL import Image

import fiftyone as fo
import yaml
from fiftyone.utils.huggingface import load_from_hub
from nuscenes.nuscenes import NuScenes

from config.config import ACCEPTED_SPLITS, GLOBAL_SEED, NUM_WORKERS, PERSISTENT
from utils.custom_view import max_detections, subset_splits, vru_mcity_fisheye
from utils.sample_field_operations import rename_sample_field


def get_supported_datasets(config_path="config/datasets.yaml"):
    """Returns a list of supported dataset names from the config file."""
    try:
        with open(config_path, "r") as file:
            config = yaml.safe_load(file)

        return [dataset["name"] for dataset in config["datasets"]]
    except Exception as e:
        logging.error(f"Available datasets could not be retrieved: {e}")


def load_dataset(selected_dataset: dict, n_iteration=0):
    """Loads a dataset by name, optionally reducing it to a requested number of samples while maintaining original split distributions."""
    dataset = None
    dataset_info = load_dataset_info(selected_dataset["name"])

    if dataset_info:
        loader_function = dataset_info.get("loader_fct")
        dataset = globals()[loader_function](dataset_info)
        n_samples_original = len(dataset)
        n_samples_requested = selected_dataset["n_samples"]
        custom_view_requested = selected_dataset["custom_view"]

        if (
            n_samples_requested is not None
            and n_samples_requested <= n_samples_original
        ):
            logging.info("Dataset reduction in process.")
            # Make sure that the reduced dataset has samples from every available split
            split_views = []

            # Get split distribution
            tags_count_dataset_dict = dataset.count_sample_tags()
            for tag in tags_count_dataset_dict:
                if tag in ACCEPTED_SPLITS:
                    count = tags_count_dataset_dict[tag]
                    percentage = count / n_samples_original
                    n_split_samples = int(n_samples_requested * percentage)
                    logging.info(f"Split {tag}: {n_split_samples} samples")

                    split_view = dataset.match_tags(tag).limit(n_split_samples)
                    split_views.append(split_view)

            # Concatenate views properly
            if split_views:
                combined_view = split_views[0]
                for view in split_views[1:]:
                    combined_view = combined_view.concat(view)

                # Fill dataset if smaller than requested
                if len(combined_view) < n_samples_requested:
                    n_samples_needed = n_samples_requested - len(combined_view)
                    view_random = dataset.take(n_samples_needed, seed=GLOBAL_SEED)
                    combined_view = combined_view.concat(view_random)

                logging.warning(
                    f"Dataset size was reduced from {len(dataset)} to {len(combined_view)} samples."
                )
                return combined_view, dataset_info

        elif custom_view_requested is not None:
            try:
                logging.warning(f"Applying custom view {custom_view_requested}.")
                dataset_view = globals()[custom_view_requested](dataset, n_iteration)
                return dataset_view, dataset_info
            except Exception as e:
                logging.error(
                    f"Calling the custom view {custom_view_requested} failed: {e}"
                )

    else:
        logging.error(
            str(selected_dataset["name"])
            + " is not a valid dataset name. Check supported datasets in datasets.yaml."
        )

    return dataset, dataset_info


def get_split(v51_sample: Union[fo.core.sample.Sample, List[str]]) -> str:
    """Gets dataset split (train, val, test) from a sample's tags or list of tags."""
    if isinstance(v51_sample, fo.core.sample.Sample):
        sample_tags = v51_sample.tags
    elif isinstance(v51_sample, list):
        sample_tags = v51_sample
    else:
        logging.error(
            f"Type {type(v51_sample)} is not supported for split retrieval."
        )
        return None

    found_splits = [split for split in ACCEPTED_SPLITS if split in sample_tags]

    if len(found_splits) == 0:
        logging.warning(f"No split found in sample tags: {sample_tags}")
        return None
    elif len(found_splits) > 1:
        logging.warning(f"Multiple splits found in sample tags: '{found_splits}'")
        return None
    else:
        split = found_splits[0]
        return split


def _separate_split(dataset, current_split, new_split, split_ratio=2):
    """Separates a portion of samples from the current split in the dataset and assigns them to a new split."""
    # Select samples for split change
    view_current_split = dataset.match_tags(current_split)
    n_samples_current_split = len(view_current_split)
    view_new_split = view_current_split.take(
        int(n_samples_current_split / split_ratio), seed=GLOBAL_SEED
    )
    view_new_split.tag_samples(new_split)
    view_new_split.untag_samples(current_split)

    # Get number of samples in each split
    view_current_split_changed = dataset.match_tags(current_split)
    n_samples_current_split_changed = len(view_current_split_changed)
    view_new_split = dataset.match_tags(new_split)
    n_samples_new_split = len(view_new_split)

    return n_samples_current_split, n_samples_current_split_changed, n_samples_new_split


def _align_splits(dataset):
    """Standardize dataset splits by renaming and creating missing splits (train/val/test) as needed."""
    SUPPORTED_SPLITS = ["train", "training", "val", "validation", "test", "testing"]
    tags = dataset.distinct("tags")
    splits = [tag for tag in tags if tag in SUPPORTED_SPLITS]

    # Rename splits if necessary
    rename_mapping = {"training": "train", "validation": "val", "testing": "test"}

    for old_tag, new_tag in rename_mapping.items():
        if old_tag in splits:
            dataset.rename_tag(old_tag, new_tag)
            splits = [tag if tag != old_tag else new_tag for tag in splits]

    # If only val or only test, create val and test splits
    if "val" in splits and "test" not in splits:
        (
            n_samples_current_split,
            n_samples_current_split_changed,
            n_samples_new_split,
        ) = _separate_split(dataset, current_split="val", new_split="test")
        logging.warning(
            f"Dataset had no 'test' split. Split {n_samples_current_split} 'val' into {n_samples_current_split_changed} 'val' and {n_samples_new_split} 'test'."
        )

    elif "test" in splits and "val" not in splits:
        (
            n_samples_current_split,
            n_samples_current_split_changed,
            n_samples_new_split,
        ) = _separate_split(dataset, current_split="test", new_split="val")
        logging.warning(
            f"Dataset had no 'val' split. Split {n_samples_current_split} 'test' into {n_samples_current_split_changed} 'val' and {n_samples_new_split} 'test'."
        )
    if "train" in splits and "test" not in splits and "val" not in splits:
        logging.warning(
            "Found 'train' split, but 'test' and 'val' splits are missing. Training might fail."
        )

    # Logging of available splits
    tags = dataset.distinct("tags")
    splits = [tag for tag in tags if tag in ACCEPTED_SPLITS]
    logging.info(f"Available splits: {splits}")

    return splits


def _align_ground_truth(dataset, gt_field="ground_truth"):
    """Ensures dataset has ground truth field named correctly, renaming single label field if found."""

    dataset_fields = dataset.get_field_schema()
    if gt_field not in dataset_fields:
        FIFTYONE_DEFAULT_FIELDS = [
            "id",
            "filepath",
            "tags",
            "metadata",
            "created_at",
            "last_modified_at",
        ]
        non_default_fields = {
            k: v for k, v in dataset_fields.items() if k not in FIFTYONE_DEFAULT_FIELDS
        }
        label_fields = {
            k: v
            for k, v in non_default_fields.items()
            if isinstance(v, fo.EmbeddedDocumentField)
            and issubclass(v.document_type, fo.core.labels.Label)
        }
        if len(label_fields) == 1:
            gt_label_old = next(iter(label_fields))
            rename_sample_field(dataset, gt_label_old, gt_field)
            logging.warning(
                f"Label field '{gt_label_old}' renamed to '{gt_field}' for training."
            )
        elif len(label_fields) > 1:
            logging.warning(
                f"The dataset has {len(label_fields)} fields with detections: {label_fields}. Rename one to {gt_field} with the command 'dataset.rename_sample_field(<your_field>, {gt_field})' to use it for training."
            )


def _post_process_dataset(dataset):
    """Post-processes the dataset by setting persistence, computing metadata, aligning splits, and aligning ground truth."""
    logging.info("Running dataset post-processing.")
    # Set persistence
    # https://docs.voxel51.com/user_guide/using_datasets.html#dataset-persistence
    dataset.persistent = PERSISTENT

    # Compute metadata
    dataset.compute_metadata(num_workers=NUM_WORKERS, overwrite=False, progress=True)

    # Align split names
    splits = _align_splits(dataset)

    # Align ground truth field
    _align_ground_truth(dataset)

    return dataset


def load_dataset_info(dataset_name, config_path="./config/datasets.yaml"):
    """Load dataset information from a YAML configuration file."""
    logging.info(f"Currently active V51 datasets: {fo.list_datasets()}")
    with open(config_path) as f:
        datasets_config = yaml.safe_load(f)

    datasets = datasets_config["datasets"]
    dataset_info = next((ds for ds in datasets if ds["name"] == dataset_name), None)

    if dataset_info:
        return dataset_info
    else:
        return None


def load_annarbor_rolling(dataset_info):
    """Loads the Ann Arbor rolling dataset from local storage into FiftyOne, creating a new dataset if it doesn't exist."""
    dataset_name = dataset_info["name"]
    dataset_dir = dataset_info["local_path"]
    dataset_type = getattr(fo.types, dataset_info["v51_type"])

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    else:
        dataset = fo.Dataset(dataset_name)
        dataset.add_dir(
            dataset_dir=dataset_dir,
            dataset_type=dataset_type,
        )
        _post_process_dataset(dataset)

    return dataset


def load_mcity_fisheye_2000(dataset_info):
    """Loads the MCityFisheye2000 dataset from local path or Hugging Face, creating or loading a FiftyOne dataset."""
    dataset_name = dataset_info["name"]
    dataset_dir = dataset_info["local_path"]
    hf_dataset_name = dataset_info.get("hf_dataset_name", None)
    dataset_type = getattr(fo.types, dataset_info["v51_type"])
    dataset_splits = dataset_info["v51_splits"]

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    elif hf_dataset_name is not None:
        # Read API key for HF access
        hf_token = None
        try:
            with open(".secret", "r") as f:
                for line in f:
                    if line.startswith("HF_TOKEN="):
                        hf_token = line.split("=", 1)[1].strip()
        except FileNotFoundError:
            logging.error(
                "'.secret' file not found. Please create it to load private datasets."
            )
            hf_token = None

        if hf_token is None:
            logging.error(
                "Provide your Hugging Face 'HF_TOKEN' in the .secret file to load private datasets."
            )
        dataset = load_from_hub(hf_dataset_name, name=dataset_name, token=hf_token)
        _post_process_dataset(dataset)
    else:
        dataset = fo.Dataset(dataset_name)
        for split in dataset_splits:
            dataset.add_dir(
                dataset_dir=dataset_dir,
                dataset_type=dataset_type,
                split=split,
                tags=split,
            )

        # Add dataset specific metadata based on filename
        for sample in dataset.iter_samples(progress=True, autosave=True):
            metadata = _process_mcity_fisheye_filename(sample["filepath"])
            sample["location"] = metadata["location"]
            sample["name"] = metadata["name"]
            sample["timestamp"] = metadata["timestamp"]

        _post_process_dataset(dataset)

    return dataset


def load_dataset_from_hf_hub(dataset_info):
    """Loads a dataset from HuggingFace Hub or locally if it exists."""
    dataset_name = dataset_info["name"]
    hf_dataset_name = dataset_info["hf_dataset_name"]

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    else:
        # Read API key for HF access
        hf_token = None
        try:
            with open(".secret", "r") as f:
                for line in f:
                    if line.startswith("HF_TOKEN="):
                        hf_token = line.split("=", 1)[1].strip()
        except FileNotFoundError:
            logging.error(
                "'.secret' file not found. Please create it to load private datasets."
            )
            hf_token = None

        if hf_token is None:
            logging.error(
                "Provide your Hugging Face 'HF_TOKEN' in the .secret file to load private datasets."
            )
        dataset = load_from_hub(hf_dataset_name, name=dataset_name, token=hf_token)
        _post_process_dataset(dataset)

    return dataset


def _process_mcity_fisheye_filename(filename):
    """Processes a Mcity fisheye camera filename to extract location, name, and timestamp information."""

    filename = os.path.basename(filename)
    results = {"filename": filename, "location": None, "name": None, "timestamp": None}

    # TODO Check if some locations are duplicated (e.g. beal vs gs_Plymouth_Beal)
    available_locations = [
        "beal",
        "bishop",
        "georgetown",
        "gridsmart_ne",
        "gridsmart_nw",
        "gridsmart_se",
        "gridsmart_sw",
        "Huron_Plymouth-Geddes",
        "Main_stadium",
        "gs_Geddes_Huron",
        "gs_Huron_Plymouth",
        "gs_Plymouth_Beal",
        "gs_Plymouth_Georgetown",
        "gs_Plymouth_Bishop",
        "gs_Plymouth_EPA",
    ]

    for location in available_locations:
        if location in filename:
            results["location"] = location
            break

    if results["location"] is None:
        logging.error(f"Filename {filename} could not be assigned to a known location")

    # Split string into first and second part based on first 4 digit year number
    match = re.search(r"\d{4}", filename)
    if match:
        year_index = match.start()
        part1 = filename[:year_index]
        part2 = filename[year_index:]
    else:
        logging.error(f"No 4-digit year found in filename: {filename}")
        return results

    # Cleanup first part
    results["name"] = re.sub(r"[-_]+$", "", part1)

    # Extract timestamp from second part
    match = re.search(r"\d{8}T\d{6}|\d{4}-\d{2}-\d{2}[_ ]\d{2}-\d{2}-\d{2}", part2)
    if match:
        extracted_timestamp = match.group(0)

        if re.match(r"\d{8}T\d{6}", extracted_timestamp):
            results["timestamp"] = datetime.datetime.strptime(
                extracted_timestamp, "%Y%m%dT%H%M%S"
            )
        elif re.match(r"\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2}", extracted_timestamp):
            results["timestamp"] = datetime.datetime.strptime(
                extracted_timestamp, "%Y-%m-%d_%H-%M-%S"
            )
        elif re.match(r"\d{4}-\d{2}-\d{2} \d{2}-\d{2}-\d{2}", extracted_timestamp):
            results["timestamp"] = datetime.datetime.strptime(
                extracted_timestamp, "%Y-%m-%d %H-%M-%S"
            )
        else:
            logging.error(f"Unknown timestamp format: {match}")
    else:
        logging.error(f"No valid timestamp found in string: {part2}")

    return results


def load_mcity_fisheye_3_months(dataset_info):
    """Loads or creates a FiftyOne dataset for the Mcity fisheye 3-month dataset using the provided dataset info."""

    dataset_name = dataset_info["name"]
    dataset_dir = dataset_info["local_path"]
    dataset_type = getattr(fo.types, dataset_info["v51_type"])
    dataset_splits = dataset_info["v51_splits"]  # Use all available splits

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    else:
        dataset = fo.Dataset(dataset_name)
        for split in dataset_splits:
            dataset.add_dir(
                dataset_dir=dataset_dir,
                dataset_type=dataset_type,
                split=split,
                tags=split,
            )

        # Add dataset specific metadata based on filename
        for sample in dataset.iter_samples(progress=True, autosave=True):
            metadata = _process_mcity_fisheye_filename(sample["filepath"])
            sample["location"] = metadata["location"]
            sample["name"] = metadata["name"]
            sample["timestamp"] = metadata["timestamp"]

        _post_process_dataset(dataset)

    return dataset


def load_fisheye_8k(dataset_info):
    """Loads a fisheye 8k dataset from FiftyOne, creating it from HuggingFace if it doesn't exist locally."""

    dataset_name = dataset_info["name"]
    hf_dataset_name = dataset_info["hf_dataset_name"]

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info("Existing dataset " + dataset_name + " was loaded.")
    else:
        dataset = load_from_hub(hf_dataset_name, name=dataset_name)
        _post_process_dataset(dataset)

    return dataset


def load_mars_multiagent(dataset_info):
    """Load the MARS multi-agent dataset from Hugging Face."""
    hugging_face_id = "ai4ce/MARS/Multiagent_53scene"

    dataset = None  # TODO Implement loading
    _post_process_dataset(dataset)

    return dataset


def load_mars_multitraversal(dataset_info):
    """Loads and post-processes multi-traversal MARS dataset from specified location."""
    location = 10
    data_root = "./datasets/MARS/Multitraversal_2023_10_04-2024_03_08"
    nusc = NuScenes(version="v1.0", dataroot=f"{data_root}/{location}", verbose=True)

    dataset = None  # TODO Implement loading
    _post_process_dataset(dataset)

    return dataset


def load_sunrgbd(dataset_info):
    """Loads the SUN RGB-D dataset from local storage, creating FiftyOne samples with depth heatmaps built from paired image and depth_bfx files."""
    dataset_name = dataset_info["name"]
    dataset_root = dataset_info["local_path"]

    if dataset_name in fo.list_datasets():
        dataset = fo.load_dataset(dataset_name)
        logging.info(f"Existing dataset {dataset_name} was loaded.")

    else:
        dataset = fo.Dataset(dataset_name)

        scene_dirs = glob(os.path.join(dataset_root, "k*/*/*"))
        samples = []
        for scene_dir in scene_dirs:
            image_files = glob(os.path.join(scene_dir, "image", "*"))
            depth_files = glob(os.path.join(scene_dir, "depth_bfx", "*"))

            if not image_files or not depth_files:
                continue

            image_path = image_files[0]
            depth_path = depth_files[0]

            depth_map = np.array(Image.open(depth_path))
            if depth_map.max() > 0:
                depth_map = (depth_map * 255 / depth_map.max()).astype("uint8")

            sample = fo.Sample(
                filepath=image_path,
                gt_depth=fo.Heatmap(map=depth_map),
            )
            samples.append(sample)

        dataset.add_samples(samples)
        dataset = _post_process_dataset(dataset)

    return dataset
def get_supported_datasets(config_path='config/datasets.yaml'):

Returns a list of supported dataset names from the config file.
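
A minimal usage sketch (assuming the repository root is the working directory so that the default config/datasets.yaml path resolves):

    from utils.dataset_loader import get_supported_datasets

    names = get_supported_datasets()
    print(names)  # list of the 'name' entries defined in config/datasets.yaml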

def load_dataset(selected_dataset: dict, n_iteration=0):

Loads a dataset by name, optionally reducing it to a requested number of samples while maintaining original split distributions.
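
A usage sketch; the keys shown ('name', 'n_samples', 'custom_view') are the ones this function reads, and the dataset name is a placeholder that must match an entry in config/datasets.yaml. The function returns the (possibly reduced) dataset or view together with its dataset_info entry:

    from utils.dataset_loader import load_dataset

    selected_dataset = {
        "name": "fisheye8k",     # placeholder; must match a 'name' in datasets.yaml
        "n_samples": 100,        # reduce to ~100 samples, preserving split proportions
        "custom_view": None,     # or the name of a custom view function
    }
    dataset_view, dataset_info = load_dataset(selected_dataset)
    print(len(dataset_view), dataset_info["name"])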

def get_split(v51_sample: Union[fiftyone.core.sample.Sample, List[str]]) -> str:

Gets dataset split (train, val, test) from a sample's tags or list of tags.
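
For example, passing a list of tags that contains exactly one accepted split (ACCEPTED_SPLITS comes from config.config and is assumed to include 'train', 'val', and 'test'):

    from utils.dataset_loader import get_split

    get_split(["train", "location_xyz"])  # returns "train"
    get_split(["location_xyz"])           # logs a warning and returns None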

def load_dataset_info(dataset_name, config_path='./config/datasets.yaml'):

Load dataset information from a YAML configuration file.
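
The returned value is the matching entry of config/datasets.yaml parsed into a dict. A sketch of the fields the loaders in this module read (the values below are illustrative, not actual config entries):

    info = load_dataset_info("fisheye8k")  # placeholder name
    # info could look like:
    # {
    #     "name": "fisheye8k",
    #     "loader_fct": "load_fisheye_8k",          # loader function called by load_dataset()
    #     "hf_dataset_name": "Voxel51/fisheye8k",   # used by Hugging Face loaders
    #     "local_path": "./datasets/fisheye8k",     # used by local loaders
    #     "v51_type": "FiftyOneDataset",            # attribute name looked up on fo.types
    #     "v51_splits": ["train", "val"],
    # }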

def load_annarbor_rolling(dataset_info):

Loads the Ann Arbor rolling dataset from local storage into FiftyOne, creating a new dataset if it doesn't exist.

def load_mcity_fisheye_2000(dataset_info):

Loads the MCityFisheye2000 dataset from local path or Hugging Face, creating or loading a FiftyOne dataset.
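
When the dataset is built from the local directory, per-sample 'location', 'name', and 'timestamp' fields are derived from each filename by _process_mcity_fisheye_filename (shown in the module source above). A sketch with a hypothetical filename:

    from utils.dataset_loader import _process_mcity_fisheye_filename

    meta = _process_mcity_fisheye_filename("gs_Plymouth_Bishop_2023-05-01_12-30-00.jpg")
    # meta["location"]   -> "gs_Plymouth_Bishop"
    # meta["name"]       -> "gs_Plymouth_Bishop"
    # meta["timestamp"]  -> datetime.datetime(2023, 5, 1, 12, 30)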

def load_dataset_from_hf_hub(dataset_info):

Loads a dataset from HuggingFace Hub or locally if it exists.
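
Loading a private dataset assumes a '.secret' file in the working directory holding a Hugging Face token; the token value and dataset name below are placeholders:

    # .secret (one line, read by this loader):
    #     HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxxxxxx

    dataset = load_dataset_from_hf_hub(load_dataset_info("my_private_dataset"))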

def load_mcity_fisheye_3_months(dataset_info):

Loads or creates a FiftyOne dataset for the Mcity fisheye 3-month dataset using the provided dataset info.

def load_fisheye_8k(dataset_info):

Loads a fisheye 8k dataset from FiftyOne, creating it from HuggingFace if it doesn't exist locally.

def load_mars_multiagent(dataset_info):

Load the MARS multi-agent dataset from Hugging Face.

def load_mars_multitraversal(dataset_info):

Loads and post-processes multi-traversal MARS dataset from specified location.

def load_sunrgbd(dataset_info):

Loads the SUN RGB-D dataset from local storage, creating FiftyOne samples with depth heatmaps built from paired image and depth_bfx files.
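
The loader expects scene directories matching 'k*/*/*' under the configured local_path, each containing an 'image' and a 'depth_bfx' folder; the first file found in each becomes the sample image and its normalized gt_depth heatmap. A sketch of the assumed layout (directory and file names are illustrative):

    # <local_path>/
    #     kv1/
    #         b3dodata/
    #             img_0063/
    #                 image/0063.jpg       # becomes the sample filepath
    #                 depth_bfx/0063.png   # becomes the gt_depth heatmap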