utils.data_loader

  1# https://docs.python.org/2/library/multiprocessing.html#sharing-state-between-processes
  2# https://pytorch.org/docs/stable/multiprocessing.html
  3import logging
  4
  5import fiftyone.utils.coco as fouc
  6import torch
  7from datasets import Dataset, Split
  8from torch.multiprocessing import Manager
  9from torchvision.io import decode_image
 10from tqdm import tqdm
 11
 12from utils.dataset_loader import get_split
 13
 14
 15class FiftyOneTorchDatasetCOCO(torch.utils.data.Dataset):
 16    """
 17    A PyTorch Dataset class for loading and processing a FiftyOne dataset in COCO format.
 18    This class handles multiprocessing to allow loading data with num_workers > 0 and
 19    converts the dataset into a format compatible with PyTorch's DataLoader.
 20
 21    References:
 22        - https://github.com/voxel51/fiftyone-examples/blob/master/examples/pytorch_detection_training.ipynb
 23        - https://github.com/voxel51/fiftyone/issues/1302
 24        - https://github.com/pytorch/pytorch/issues/13246#issuecomment-905703662
 25        - https://github.com/pytorch/pytorch/issues/13246
 26        - https://github.com/pytorch/pytorch/issues/13246#issuecomment-715050814
 27    """
 28
 29    def __init__(self, fiftyone_dataset, transforms=None, gt_field="ground_truth"):
 30        """Initialize dataset from Voxel51 (fiftyone) dataset with optional transforms and ground truth field name."""
 31        logging.info(f"Collecting data for torch dataset conversion.")
 32        self.transforms = transforms
 33        try:
 34            self.classes = fiftyone_dataset.distinct(f"{gt_field}.detections.label")
 35        except Exception as e:
 36            logging.debug(f"Classes could not be found in dataset: {e}.")
 37            self.classes = []
 38        self.labels_map_rev = {c: i for i, c in enumerate(self.classes)}
 39        self.dataset_length = len(fiftyone_dataset)
 40
 41        # Multiprocessing for data loader with num_workers > 0
 42        # https://docs.python.org/3/library/multiprocessing.html
 43        # https://github.com/pytorch/pytorch/issues/13246#issuecomment-612396143
 44        manager = Manager()
 45        self.img_paths = manager.list()
 46        self.ids = manager.list()
 47        self.metadata = manager.list()
 48
 49        self.labels = manager.dict()
 50        self.splits = manager.dict()
 51
 52        # Use values() to directly get the required fields from the dataset
 53        img_paths = fiftyone_dataset.values("filepath")
 54        ids = fiftyone_dataset.values("id")
 55        metadata = fiftyone_dataset.values("metadata")
 56
 57        try:
 58            ground_truths = fiftyone_dataset.values(gt_field)
 59        except:
 60            logging.info(f"Voxel51 dataset has no field named '{gt_field}'")
 61            ground_truths = None
 62        tags = fiftyone_dataset.values("tags")
 63
 64        # Process all samples with values() in place of the loop
 65        for i, sample_id in tqdm(
 66            enumerate(ids),
 67            total=len(ids),
 68            desc="Generating torch dataset from Voxel51 dataset",
 69        ):
 70            self.img_paths.append(img_paths[i])
 71            self.ids.append(sample_id)  # Store the sample ID
 72            self.metadata.append(metadata[i])
 73
 74            # Extract labels and splits for each sample
 75            if (
 76                ground_truths and ground_truths[i]
 77            ):  # Check if the ground truth exists for the sample
 78                # Store detections as (top_left_x, top_left_y, width, height) in rel. coordinates between [0,1]
 79                self.labels[sample_id] = ground_truths[i].detections
 80            if tags[i]:  # Check if the tags exist for the sample
 81                self.splits[sample_id] = get_split(tags[i])
 82
 83    def __getitem__(self, idx):
 84        """Returns transformed image and its target dictionary containing bounding boxes, category IDs, image ID, areas and crowd flags."""
 85        img_path = self.img_paths[idx]
 86        img_id = self.ids[idx]
 87        metadata = self.metadata[idx]
 88        detections = self.labels.get(img_id, [])
 89        img = decode_image(img_path, mode="RGB")
 90        boxes = []
 91        labels = []
 92        area = []
 93        iscrowd = []
 94
 95        for det in detections:
 96            category_id = self.labels_map_rev[det.label]
 97            # https://docs.voxel51.com/api/fiftyone.utils.coco.html#fiftyone.utils.coco.COCOObject
 98            coco_obj = fouc.COCOObject.from_label(
 99                det,
100                metadata,
101                category_id=category_id,
102            )
103            x_min, y_min, w, h = coco_obj.bbox  # Absolute coordinates
104            boxes.append([x_min, y_min, w, h])
105            labels.append(coco_obj.category_id)
106            area.append(coco_obj.area)
107            iscrowd.append(coco_obj.iscrowd)
108
109        target = {
110            "bbox": torch.as_tensor(boxes, dtype=torch.float32),
111            "category_id": torch.as_tensor(labels, dtype=torch.int64),
112            "image_id": img_id,
113            "area": torch.as_tensor(area, dtype=torch.float32),
114            "iscrowd": torch.as_tensor(iscrowd, dtype=torch.int64),
115        }
116        if self.transforms:
117            img = self.transforms(img)
118        return img, target
119
120    def __getitems__(self, indices):
121        """Returns a list of items at the specified indices using __getitem__ for each index."""
122        return [self.__getitem__(idx) for idx in indices]
123
124    def __len__(self):
125        """Returns the total number of samples in the dataset."""
126        return self.dataset_length
127
128    def get_classes(self):
129        """Return the list of classes available in the dataset."""
130        return self.classes
131
132    def get_splits(self):
133        """Returns a set of all unique split labels in the dataset."""
134        return set(self.splits.values())
135
136
class TorchToHFDatasetCOCO:
    """Convert PyTorch COCO-style dataset to Hugging Face dataset format.

    This class facilitates the conversion of PyTorch COCO-style datasets to the Hugging Face
    dataset format, handling split management and data generation.
    """

    # Split names found on the torch dataset -> Hugging Face split constants.
    # "val" and "validation" intentionally resolve to the same constant.
    split_mapping = {
        "train": Split.TRAIN,
        "test": Split.TEST,
        "validation": Split.VALIDATION,
        "val": Split.VALIDATION,
    }

    def __init__(self, torch_dataset):
        """Initialize a data loader wrapper around a PyTorch dataset."""
        self.torch_dataset = torch_dataset

    def convert(self):
        """Converts a PyTorch dataset to a Hugging Face dataset dictionary with mapped splits.

        Returns:
            A dict mapping ``split_mapping`` values to the datasets built by
            ``Dataset.from_generator``, or ``None`` if any step raised. The
            error is logged together with its traceback.

        Note:
            Because "val" and "validation" map to the same key, a dataset
            containing both split names keeps only the entry built last.
        """
        default_split_hf = "test"
        try:
            splits = self.torch_dataset.get_splits()
            if not splits:
                logging.warning(
                    f"Hugging Face Datasets expects splits, but none are provided. Setting '{default_split_hf}' as the default split."
                )
                splits = [default_split_hf]
            return {
                self.split_mapping[split]: Dataset.from_generator(
                    gen_factory(self.torch_dataset, split, default_split_hf),
                    split=self.split_mapping[split],
                )
                for split in splits
            }
        except Exception as e:
            # Deliberately best-effort: report the failure (with traceback)
            # and signal it to the caller through an explicit None.
            logging.error(
                f"Error in dataset conversion from Torch to Hugging Face: {e}",
                exc_info=True,
            )
            return None
177
178
def gen_factory(torch_dataset, split_name, default_split_hf):
    """
    Build a zero-argument generator function for one Hugging Face split.

    The containers of ``torch_dataset`` are bound to local names here so the
    returned generator closes over them rather than over the dataset object.

    Args:
        torch_dataset: Object exposing ``img_paths``, ``ids``, ``splits``,
            ``metadata``, ``labels`` and ``labels_map_rev``.
        split_name: Only samples belonging to this split are yielded.
        default_split_hf: Split assumed for samples without a split entry.

    Returns:
        A function that, when called, yields one dict per selected sample with
        the keys "image_path", "objects" and "split".
    """
    paths = torch_dataset.img_paths
    sample_ids = torch_dataset.ids
    split_by_id = torch_dataset.splits
    metadata_by_index = torch_dataset.metadata
    detections_by_id = torch_dataset.labels
    label_to_index = torch_dataset.labels_map_rev

    def _gen():
        """Yield the records of every sample assigned to ``split_name``."""
        for position, (path, sample_id) in enumerate(zip(paths, sample_ids)):
            assigned = split_by_id.get(sample_id, None)
            if assigned is None:
                # Samples without a split entry fall back to the default split
                assigned = default_split_hf

            if assigned == split_name:
                objects = create_target(
                    {
                        "metadata": metadata_by_index[position],
                        "detections": detections_by_id.get(sample_id, None),
                    },
                    label_to_index,
                    position,
                )
                yield {
                    "image_path": path,
                    "objects": objects,
                    "split": assigned,
                }

    return _gen
218
219
def create_target(sample_data, labels_map_rev, idx, convert_to_coco=True):
    """Convert detection data to COCO format, transforming relative coordinates to absolute if specified.

    Args:
        sample_data: Mapping with a "detections" entry (iterable of objects
            exposing ``bounding_box`` and ``label``, or None/empty) and a
            "metadata" entry providing "width" and "height". The metadata is
            only read when boxes are actually converted.
        labels_map_rev: Mapping from label to category id.
        idx: Value stored as "image_id" in the result.
        convert_to_coco: If True, scale the boxes by the image width/height;
            otherwise keep the boxes as stored.

    Returns:
        Dict with the keys "bbox", "category_id", "image_id", "area" and
        "iscrowd"; all lists are empty when there are no detections.
    """
    detections = sample_data.get("detections", [])

    # Handle empty or missing detections first: the image size is not needed
    # here, so samples without metadata must not fail on this path.
    if not detections:
        logging.warning(f"No detections found for sample {idx}")
        return {
            "bbox": [],
            "category_id": [],
            "image_id": idx,
            "area": [],
            "iscrowd": [],
        }

    if convert_to_coco:
        # From rel. coordinates between [0,1] to abs. coordinates (COCO)
        metadata = sample_data["metadata"]
        img_width = metadata["width"]
        img_height = metadata["height"]
        boxes = []
        areas = []
        for det in detections:
            box = det.bounding_box
            width = box[2] * img_width
            height = box[3] * img_height
            boxes.append([box[0] * img_width, box[1] * img_height, width, height])
            areas.append(width * height)
    else:
        boxes = [det.bounding_box for det in detections]
        areas = [det.bounding_box[2] * det.bounding_box[3] for det in detections]

    return {
        "bbox": boxes,
        "category_id": [labels_map_rev[det.label] for det in detections],
        "image_id": idx,
        "area": areas,
        "iscrowd": [0 for _ in detections],
    }
class FiftyOneTorchDatasetCOCO(typing.Generic[+_T_co]):
 16class FiftyOneTorchDatasetCOCO(torch.utils.data.Dataset):
 17    """
 18    A PyTorch Dataset class for loading and processing a FiftyOne dataset in COCO format.
 19    This class handles multiprocessing to allow loading data with num_workers > 0 and
 20    converts the dataset into a format compatible with PyTorch's DataLoader.
 21
 22    References:
 23        - https://github.com/voxel51/fiftyone-examples/blob/master/examples/pytorch_detection_training.ipynb
 24        - https://github.com/voxel51/fiftyone/issues/1302
 25        - https://github.com/pytorch/pytorch/issues/13246#issuecomment-905703662
 26        - https://github.com/pytorch/pytorch/issues/13246
 27        - https://github.com/pytorch/pytorch/issues/13246#issuecomment-715050814
 28    """
 29
 30    def __init__(self, fiftyone_dataset, transforms=None, gt_field="ground_truth"):
 31        """Initialize dataset from Voxel51 (fiftyone) dataset with optional transforms and ground truth field name."""
 32        logging.info(f"Collecting data for torch dataset conversion.")
 33        self.transforms = transforms
 34        try:
 35            self.classes = fiftyone_dataset.distinct(f"{gt_field}.detections.label")
 36        except Exception as e:
 37            logging.debug(f"Classes could not be found in dataset: {e}.")
 38            self.classes = []
 39        self.labels_map_rev = {c: i for i, c in enumerate(self.classes)}
 40        self.dataset_length = len(fiftyone_dataset)
 41
 42        # Multiprocessing for data loader with num_workers > 0
 43        # https://docs.python.org/3/library/multiprocessing.html
 44        # https://github.com/pytorch/pytorch/issues/13246#issuecomment-612396143
 45        manager = Manager()
 46        self.img_paths = manager.list()
 47        self.ids = manager.list()
 48        self.metadata = manager.list()
 49
 50        self.labels = manager.dict()
 51        self.splits = manager.dict()
 52
 53        # Use values() to directly get the required fields from the dataset
 54        img_paths = fiftyone_dataset.values("filepath")
 55        ids = fiftyone_dataset.values("id")
 56        metadata = fiftyone_dataset.values("metadata")
 57
 58        try:
 59            ground_truths = fiftyone_dataset.values(gt_field)
 60        except:
 61            logging.info(f"Voxel51 dataset has no field named '{gt_field}'")
 62            ground_truths = None
 63        tags = fiftyone_dataset.values("tags")
 64
 65        # Process all samples with values() in place of the loop
 66        for i, sample_id in tqdm(
 67            enumerate(ids),
 68            total=len(ids),
 69            desc="Generating torch dataset from Voxel51 dataset",
 70        ):
 71            self.img_paths.append(img_paths[i])
 72            self.ids.append(sample_id)  # Store the sample ID
 73            self.metadata.append(metadata[i])
 74
 75            # Extract labels and splits for each sample
 76            if (
 77                ground_truths and ground_truths[i]
 78            ):  # Check if the ground truth exists for the sample
 79                # Store detections as (top_left_x, top_left_y, width, height) in rel. coordinates between [0,1]
 80                self.labels[sample_id] = ground_truths[i].detections
 81            if tags[i]:  # Check if the tags exist for the sample
 82                self.splits[sample_id] = get_split(tags[i])
 83
 84    def __getitem__(self, idx):
 85        """Returns transformed image and its target dictionary containing bounding boxes, category IDs, image ID, areas and crowd flags."""
 86        img_path = self.img_paths[idx]
 87        img_id = self.ids[idx]
 88        metadata = self.metadata[idx]
 89        detections = self.labels.get(img_id, [])
 90        img = decode_image(img_path, mode="RGB")
 91        boxes = []
 92        labels = []
 93        area = []
 94        iscrowd = []
 95
 96        for det in detections:
 97            category_id = self.labels_map_rev[det.label]
 98            # https://docs.voxel51.com/api/fiftyone.utils.coco.html#fiftyone.utils.coco.COCOObject
 99            coco_obj = fouc.COCOObject.from_label(
100                det,
101                metadata,
102                category_id=category_id,
103            )
104            x_min, y_min, w, h = coco_obj.bbox  # Absolute coordinates
105            boxes.append([x_min, y_min, w, h])
106            labels.append(coco_obj.category_id)
107            area.append(coco_obj.area)
108            iscrowd.append(coco_obj.iscrowd)
109
110        target = {
111            "bbox": torch.as_tensor(boxes, dtype=torch.float32),
112            "category_id": torch.as_tensor(labels, dtype=torch.int64),
113            "image_id": img_id,
114            "area": torch.as_tensor(area, dtype=torch.float32),
115            "iscrowd": torch.as_tensor(iscrowd, dtype=torch.int64),
116        }
117        if self.transforms:
118            img = self.transforms(img)
119        return img, target
120
121    def __getitems__(self, indices):
122        """Returns a list of items at the specified indices using __getitem__ for each index."""
123        return [self.__getitem__(idx) for idx in indices]
124
125    def __len__(self):
126        """Returns the total number of samples in the dataset."""
127        return self.dataset_length
128
129    def get_classes(self):
130        """Return the list of classes available in the dataset."""
131        return self.classes
132
133    def get_splits(self):
134        """Returns a set of all unique split labels in the dataset."""
135        return set(self.splits.values())

A PyTorch Dataset class for loading and processing a FiftyOne dataset in COCO format. This class handles multiprocessing to allow loading data with num_workers > 0 and converts the dataset into a format compatible with PyTorch's DataLoader.

References:
- https://github.com/voxel51/fiftyone-examples/blob/master/examples/pytorch_detection_training.ipynb
- https://github.com/voxel51/fiftyone/issues/1302
- https://github.com/pytorch/pytorch/issues/13246#issuecomment-905703662
- https://github.com/pytorch/pytorch/issues/13246
- https://github.com/pytorch/pytorch/issues/13246#issuecomment-715050814

FiftyOneTorchDatasetCOCO(fiftyone_dataset, transforms=None, gt_field='ground_truth')
30    def __init__(self, fiftyone_dataset, transforms=None, gt_field="ground_truth"):
31        """Initialize dataset from Voxel51 (fiftyone) dataset with optional transforms and ground truth field name."""
32        logging.info(f"Collecting data for torch dataset conversion.")
33        self.transforms = transforms
34        try:
35            self.classes = fiftyone_dataset.distinct(f"{gt_field}.detections.label")
36        except Exception as e:
37            logging.debug(f"Classes could not be found in dataset: {e}.")
38            self.classes = []
39        self.labels_map_rev = {c: i for i, c in enumerate(self.classes)}
40        self.dataset_length = len(fiftyone_dataset)
41
42        # Multiprocessing for data loader with num_workers > 0
43        # https://docs.python.org/3/library/multiprocessing.html
44        # https://github.com/pytorch/pytorch/issues/13246#issuecomment-612396143
45        manager = Manager()
46        self.img_paths = manager.list()
47        self.ids = manager.list()
48        self.metadata = manager.list()
49
50        self.labels = manager.dict()
51        self.splits = manager.dict()
52
53        # Use values() to directly get the required fields from the dataset
54        img_paths = fiftyone_dataset.values("filepath")
55        ids = fiftyone_dataset.values("id")
56        metadata = fiftyone_dataset.values("metadata")
57
58        try:
59            ground_truths = fiftyone_dataset.values(gt_field)
60        except:
61            logging.info(f"Voxel51 dataset has no field named '{gt_field}'")
62            ground_truths = None
63        tags = fiftyone_dataset.values("tags")
64
65        # Process all samples with values() in place of the loop
66        for i, sample_id in tqdm(
67            enumerate(ids),
68            total=len(ids),
69            desc="Generating torch dataset from Voxel51 dataset",
70        ):
71            self.img_paths.append(img_paths[i])
72            self.ids.append(sample_id)  # Store the sample ID
73            self.metadata.append(metadata[i])
74
75            # Extract labels and splits for each sample
76            if (
77                ground_truths and ground_truths[i]
78            ):  # Check if the ground truth exists for the sample
79                # Store detections as (top_left_x, top_left_y, width, height) in rel. coordinates between [0,1]
80                self.labels[sample_id] = ground_truths[i].detections
81            if tags[i]:  # Check if the tags exist for the sample
82                self.splits[sample_id] = get_split(tags[i])

Initialize dataset from Voxel51 (fiftyone) dataset with optional transforms and ground truth field name.

transforms
labels_map_rev
dataset_length
img_paths
ids
metadata
labels
splits
def get_classes(self):
129    def get_classes(self):
130        """Return the list of classes available in the dataset."""
131        return self.classes

Return the list of classes available in the dataset.

def get_splits(self):
133    def get_splits(self):
134        """Returns a set of all unique split labels in the dataset."""
135        return set(self.splits.values())

Returns a set of all unique split labels in the dataset.

class TorchToHFDatasetCOCO:
class TorchToHFDatasetCOCO:
    """Convert PyTorch COCO-style dataset to Hugging Face dataset format.

    This class facilitates the conversion of PyTorch COCO-style datasets to the Hugging Face
    dataset format, handling split management and data generation.
    """

    # Split names found on the torch dataset -> Hugging Face split constants.
    # "val" and "validation" intentionally resolve to the same constant.
    split_mapping = {
        "train": Split.TRAIN,
        "test": Split.TEST,
        "validation": Split.VALIDATION,
        "val": Split.VALIDATION,
    }

    def __init__(self, torch_dataset):
        """Initialize a data loader wrapper around a PyTorch dataset."""
        self.torch_dataset = torch_dataset

    def convert(self):
        """Converts a PyTorch dataset to a Hugging Face dataset dictionary with mapped splits.

        Returns:
            A dict mapping ``split_mapping`` values to the datasets built by
            ``Dataset.from_generator``, or ``None`` if any step raised. The
            error is logged together with its traceback.

        Note:
            Because "val" and "validation" map to the same key, a dataset
            containing both split names keeps only the entry built last.
        """
        default_split_hf = "test"
        try:
            splits = self.torch_dataset.get_splits()
            if not splits:
                logging.warning(
                    f"Hugging Face Datasets expects splits, but none are provided. Setting '{default_split_hf}' as the default split."
                )
                splits = [default_split_hf]
            return {
                self.split_mapping[split]: Dataset.from_generator(
                    gen_factory(self.torch_dataset, split, default_split_hf),
                    split=self.split_mapping[split],
                )
                for split in splits
            }
        except Exception as e:
            # Deliberately best-effort: report the failure (with traceback)
            # and signal it to the caller through an explicit None.
            logging.error(
                f"Error in dataset conversion from Torch to Hugging Face: {e}",
                exc_info=True,
            )
            return None

Convert PyTorch COCO-style dataset to Hugging Face dataset format.

This class facilitates the conversion of PyTorch COCO-style datasets to the Hugging Face dataset format, handling split management and data generation.

TorchToHFDatasetCOCO(torch_dataset)
152    def __init__(self, torch_dataset):
153        """Initialize a data loader wrapper around a PyTorch dataset."""
154        self.torch_dataset = torch_dataset

Initialize a data loader wrapper around a PyTorch dataset.

split_mapping = {'train': NamedSplit('train'), 'test': NamedSplit('test'), 'validation': NamedSplit('validation'), 'val': NamedSplit('validation')}
torch_dataset
def convert(self):
156    def convert(self):
157        """Converts a PyTorch dataset to a Hugging Face dataset dictionary with mapped splits."""
158        try:
159            default_split_hf = "test"
160            splits = self.torch_dataset.get_splits()
161            if len(splits) == 0:
162                logging.warning(
163                    f"Hugging Face Datasets expects splits, but none are provided. Setting '{default_split_hf}' as the default split."
164                )
165                splits = [default_split_hf]
166            hf_dataset = {
167                self.split_mapping[split]: Dataset.from_generator(
168                    gen_factory(self.torch_dataset, split, default_split_hf),
169                    split=self.split_mapping[split],
170                )
171                for split in splits
172            }
173            return hf_dataset
174        except Exception as e:
175            logging.error(
176                f"Error in dataset conversion from Torch to Hugging Face: {e}"
177            )

Converts a PyTorch dataset to a Hugging Face dataset dictionary with mapped splits.

def gen_factory(torch_dataset, split_name, default_split_hf):
def gen_factory(torch_dataset, split_name, default_split_hf):
    """
    Build a zero-argument generator function for one Hugging Face split.

    The containers of ``torch_dataset`` are bound to local names here so the
    returned generator closes over them rather than over the dataset object.

    Args:
        torch_dataset: Object exposing ``img_paths``, ``ids``, ``splits``,
            ``metadata``, ``labels`` and ``labels_map_rev``.
        split_name: Only samples belonging to this split are yielded.
        default_split_hf: Split assumed for samples without a split entry.

    Returns:
        A function that, when called, yields one dict per selected sample with
        the keys "image_path", "objects" and "split".
    """
    paths = torch_dataset.img_paths
    sample_ids = torch_dataset.ids
    split_by_id = torch_dataset.splits
    metadata_by_index = torch_dataset.metadata
    detections_by_id = torch_dataset.labels
    label_to_index = torch_dataset.labels_map_rev

    def _gen():
        """Yield the records of every sample assigned to ``split_name``."""
        for position, (path, sample_id) in enumerate(zip(paths, sample_ids)):
            assigned = split_by_id.get(sample_id, None)
            if assigned is None:
                # Samples without a split entry fall back to the default split
                assigned = default_split_hf

            if assigned == split_name:
                objects = create_target(
                    {
                        "metadata": metadata_by_index[position],
                        "detections": detections_by_id.get(sample_id, None),
                    },
                    label_to_index,
                    position,
                )
                yield {
                    "image_path": path,
                    "objects": objects,
                    "split": assigned,
                }

    return _gen

Factory function to create a generator function for the Hugging Face dataset.

This function ensures that all objects used within the generator function are picklable. The FiftyOne dataset is iterated to collect sample data, which is then used within the generator function.

def create_target(sample_data, labels_map_rev, idx, convert_to_coco=True):
def create_target(sample_data, labels_map_rev, idx, convert_to_coco=True):
    """Convert detection data to COCO format, transforming relative coordinates to absolute if specified.

    Args:
        sample_data: Mapping with a "detections" entry (iterable of objects
            exposing ``bounding_box`` and ``label``, or None/empty) and a
            "metadata" entry providing "width" and "height". The metadata is
            only read when boxes are actually converted.
        labels_map_rev: Mapping from label to category id.
        idx: Value stored as "image_id" in the result.
        convert_to_coco: If True, scale the boxes by the image width/height;
            otherwise keep the boxes as stored.

    Returns:
        Dict with the keys "bbox", "category_id", "image_id", "area" and
        "iscrowd"; all lists are empty when there are no detections.
    """
    detections = sample_data.get("detections", [])

    # Handle empty or missing detections first: the image size is not needed
    # here, so samples without metadata must not fail on this path.
    if not detections:
        logging.warning(f"No detections found for sample {idx}")
        return {
            "bbox": [],
            "category_id": [],
            "image_id": idx,
            "area": [],
            "iscrowd": [],
        }

    if convert_to_coco:
        # From rel. coordinates between [0,1] to abs. coordinates (COCO)
        metadata = sample_data["metadata"]
        img_width = metadata["width"]
        img_height = metadata["height"]
        boxes = []
        areas = []
        for det in detections:
            box = det.bounding_box
            width = box[2] * img_width
            height = box[3] * img_height
            boxes.append([box[0] * img_width, box[1] * img_height, width, height])
            areas.append(width * height)
    else:
        boxes = [det.bounding_box for det in detections]
        areas = [det.bounding_box[2] * det.bounding_box[3] for det in detections]

    return {
        "bbox": boxes,
        "category_id": [labels_map_rev[det.label] for det in detections],
        "image_id": idx,
        "area": areas,
        "iscrowd": [0 for _ in detections],
    }

Convert detection data to COCO format, transforming relative coordinates to absolute if specified.