config.config

import os
import psutil

#: Select workflow list from 'WORKFLOWS = {...}' dictionary
SELECTED_WORKFLOW = ["auto_labeling"]  # Choose from WORKFLOWS keys
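# Example (illustrative, not part of the original config): multiple workflow keys
# can be listed, e.g.
# SELECTED_WORKFLOW = ["embedding_selection", "auto_labeling"]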
  6
  7#: Select dataset from config/datasets.yaml
  8SELECTED_DATASET = {
  9    "name": "fisheye8k",
 10    "n_samples": None,  # 'None' (full dataset) or 'int' (subset of the dataset)
 11    "custom_view": None,  # 'None' (full dataset) or select function from utils/custom_view
 12}
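# Example (illustrative, not part of the original config): to run on a 1,000-sample
# subset of the dataset instead of the full set, e.g.
# SELECTED_DATASET = {"name": "fisheye8k", "n_samples": 1000, "custom_view": None}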

#: Workflows and associated parameters
WORKFLOWS = {
    "aws_download": {
        "mcity": {
            "bucket": "mcity-data-engine",
            "prefix": "",
            "download_path": "output/datasets/annarbor_rolling",
            "test_run": True,
            "selected_dataset_overwrite": True,
        }
    },
    "embedding_selection": {
        "mode": "compute",  # "compute" or "load"
        "parameters": {
            "compute_representativeness": 0.99,
            "compute_unique_images_greedy": 0.01,
            "compute_unique_images_deterministic": 0.99,
            "compute_similar_images": 0.03,
            "neighbour_count": 3,
        },
        "embedding_models": [  # Select from V51 "Embeddings" models https://docs.voxel51.com/model_zoo/models.html
            "clip-vit-base32-torch",
            # "open-clip-torch",
            # "dinov2-vits14-torch",
            # "dinov2-vits14-reg-torch",
            # "mobilenet-v2-imagenet-torch",
            # "resnet152-imagenet-torch",
            # "vgg19-imagenet-torch",
            # "classification-transformer-torch",
            # "detection-transformer-torch",
            # "zero-shot-detection-transformer-torch",
            # "zero-shot-classification-transformer-torch",
        ],
    },
    "anomaly_detection": {
        "mode": ["train", "inference"],  # "train" and "inference" supported
        "epochs": 12,
        "early_stop_patience": 5,
        "anomalib_image_models": {  # Choose from https://anomalib.readthedocs.io/en/v1.2.0/markdown/guides/reference/models/image/index.html
            "Padim": {},
            # "EfficientAd": {},
            # "Draem": {},
            # "Cfa": {},
        },
        "anomalib_eval_metrics": [  # Choose from https://anomalib.readthedocs.io/en/v1.2.0/markdown/guides/reference/metrics/index.html. Focus on standard metrics; computing others can be expensive
            "AUPR",
            "AUROC",
            "F1Max",
        ],
        "data_preparation": {"fisheye8k": {"location": "cam1", "rare_class": "Truck"}},
    },
    "auto_labeling": {
        "mode": ["train", "inference"],  # "train" and "inference" supported
        "model_source": [
            # "hf_models_objectdetection",
            # "ultralytics",
            # "custom_codetr",
            "roboflow",
        ],
        "n_worker_dataloader": 8,
        "epochs": 1,
        "early_stop_patience": 0,
        "early_stop_threshold": 0,
        "learning_rate": 5e-05,
        "weight_decay": 0.0001,
        "max_grad_norm": 0.01,
        "inference_settings": {
            "do_eval": True,
            "inference_on_test": True,
            "model_hf": None,  # None (automatic selection) or override with a Hugging Face model ID. Assumes the same model type as selected below.
            "detection_threshold": 0.2,
        },
        "hf_models_objectdetection": {  # HF Leaderboard: https://huggingface.co/spaces/hf-vision/object_detection_leaderboard
            # "microsoft/conditional-detr-resnet-50": {"batch_size": 4},
            # "Omnifact/conditional-detr-resnet-101-dc5": {"batch_size": 1},
            # "facebook/detr-resnet-50": {"batch_size": 1},
            # "facebook/detr-resnet-50-dc5": {"batch_size": 1, "image_size": [960, 960]},
            # "facebook/detr-resnet-101": {"batch_size": 4, "image_size": [960, 960]},
            # "facebook/detr-resnet-101-dc5": {"batch_size": 1, "image_size": [960, 960]},
            # "facebook/deformable-detr-detic": {
            #    "batch_size": 4,
            #    "image_size": [960, 960],
            # },
            # "facebook/deformable-detr-box-supervised": {
            #   "batch_size": 1,
            #   "image_size": [960, 960],
            # },
            # "SenseTime/deformable-detr": {"batch_size": 4, "image_size": [960, 960]},
            # "SenseTime/deformable-detr-with-box-refine": {
            #   "batch_size": 1,
            #   "image_size": [960, 960],
            # },
            # "jozhang97/deta-swin-large": {
            #   "batch_size": 1,
            #   "image_size": [960, 960],
            # },
            # "jozhang97/deta-swin-large-o365": {
            #    "batch_size": 4,
            #    "image_size": [960, 960],
            # },
            # "hustvl/yolos-base": {"batch_size": 4},
            "IDEA-Research/dab-detr-resnet-50": {
                "batch_size": 4,
                "image_size": [960, 960],
            },
            # "PekingU/rtdetr_v2_r18vd": {
            #    "batch_size": 4,
            #    "image_size": [960, 960],
            # },
        },
        "custom_codetr": {
            "export_dataset_root": "output/datasets/codetr_data/",
            "configs": [
                "projects/configs/co_deformable_detr/co_deformable_detr_r50_1x_coco.py",
                "projects/configs/co_dino_vit/co_dino_5scale_vit_large_coco.py",
            ],
            "n_gpus": "1",
            "container_tool": "docker",
        },
        "roboflow": {  # Roboflow RF-DETR configuration
            "export_dataset_root": "output/datasets/roboflow_data/",
            "configs": [
                "rfdetr_nano",
                "rfdetr_small",
                # "rfdetr_medium",
                # "rfdetr_large",
            ],
            # RF-DETR-specific parameters only
            "batch_size": 4,                     # Override default batch size
            "grad_accum_steps": 4,                # Gradient accumulation steps
            "lr_encoder": None,                   # Encoder-specific learning rate (optional)
            "resolution": None,                   # Image resolution, must be divisible by 56 (optional)
            "use_ema": True,                      # Exponential moving average
            "gradient_checkpointing": False,      # Memory optimization
            "early_stopping_min_delta": 0.001,    # Minimum improvement for early stopping
            "early_stopping_use_ema": True,       # Use EMA model for early stopping
        },
        "ultralytics": {
            "export_dataset_root": "output/datasets/ultralytics_data/",
            "multi_scale": False,
            "cos_lr": True,
            "models": {  # Pick from https://docs.ultralytics.com/models/
                # "yolo11n": {"batch_size": 8, "img_size": 1280},
                # "yolo11x": {"batch_size": 1, "img_size": 960},
                "yolo12n": {"batch_size": 8, "img_size": 1280},
                # "yolo12x": {"batch_size": 1, "img_size": 960},
            },
        },
    },
    "auto_labeling_zero_shot": {
        "n_post_processing_worker_per_inference_worker": 5,
        "n_worker_dataloader": 3,
        "prefetch_factor_dataloader": 2,
        "hf_models_zeroshot_objectdetection": {
            "omlab/omdet-turbo-swin-tiny-hf": {  # https://huggingface.co/models?pipeline_tag=zero-shot-object-detection&sort=trending&search=omlab%2Fomdet
                "batch_size": 1,
                "n_dataset_chunks": 1,  # Number of chunks to split the dataset into for parallel processing
            },
            "IDEA-Research/grounding-dino-tiny": {  # https://huggingface.co/models?pipeline_tag=zero-shot-object-detection&sort=trending&search=IDEA-Research%2Fgrounding
                "batch_size": 1,
                "n_dataset_chunks": 1,
            },
            "google/owlvit-large-patch14": {  # https://huggingface.co/models?pipeline_tag=zero-shot-object-detection&sort=trending&search=google%2Fowlvit
                "batch_size": 1,
                "n_dataset_chunks": 1,
            },
            "google/owlv2-base-patch16-finetuned": {  # https://huggingface.co/models?pipeline_tag=zero-shot-object-detection&sort=trending&search=google%2Fowlv2
                "batch_size": 1,
                "n_dataset_chunks": 1,
            },
            "google/owlv2-large-patch14-ensemble": {
                "batch_size": 1,
                "n_dataset_chunks": 1,
            },
        },
        "detection_threshold": 0.2,
        "object_classes": [
            "skater",
            "child",
            "bicycle",
            "bicyclist",
            "cyclist",
            "bike",
            "rider",
            "motorcycle",
            "motorcyclist",
            "pedestrian",
            "person",
            "walker",
            "jogger",
            "runner",
            "skateboarder",
            "scooter",
            "vehicle",
            "car",
            "bus",
            "truck",
            "taxi",
            "van",
            "pickup truck",
            "trailer",
            "emergency vehicle",
            "delivery driver",
        ],
    },
    "auto_label_mask": {
        "semantic_segmentation": {
            "sam2": {
                "prompt_field": None,
                "models": [
                    "segment-anything-2-hiera-tiny-image-torch",
                    "segment-anything-2-hiera-small-image-torch",
                    "segment-anything-2-hiera-base-plus-image-torch",
                    "segment-anything-2.1-hiera-tiny-image-torch",
                    "segment-anything-2.1-hiera-small-image-torch",
                    "segment-anything-2.1-hiera-base-plus-image-torch",
                    "segment-anything-2.1-hiera-large-image-torch",
                ],
            },
        },
        "depth_estimation": {
            "dpt": {
                "models": {
                    "Intel/dpt-swinv2-tiny-256",
                    "Intel/dpt-swinv2-large-384",
                    "Intel/dpt-beit-large-384",
                    "Intel/dpt-beit-large-512",
                    "Intel/dpt-large-ade",
                    "Intel/dpt-large",
                    "Intel/dpt-hybrid-midas",
                    "Intel/dpt-swinv2-base-384",
                    "Intel/dpt-beit-base-384",
                },
            },
            "depth_anything": {
                "models": {
                    "LiheYoung/depth-anything-base-hf",
                    "LiheYoung/depth-anything-large-hf",
                    "LiheYoung/depth-anything-small-hf",
                },
            },
            "depth_pro": {
                "models": {
                    "apple/DepthPro-hf",
                },
            },
            "glpn": {
                "models": {
                    "vinvino02/glpn-nyu",
                    "vinvino02/glpn-kitti",
                },
            },
            "zoe_depth": {
                "models": {
                    "Intel/zoedepth-nyu-kitti",
                    "Intel/zoedepth-nyu",
                    "Intel/zoedepth-kitti",
                },
            },
        },
    },
    "ensemble_selection": {
        "field_includes": "pred_zsod_",  # V51 field used for detections; "pred_zsod_" is the default for zero-shot object detection models
        "agreement_threshold": 3,  # Minimum number of models that must agree on a detection
        "iou_threshold": 0.5,  # IoU threshold for considering bboxes as overlapping
        "max_bbox_size": 0.01,  # Value in [0, 1] for the maximum size of considered bboxes
        "positive_classes": [  # Classes to consider; must be a subset of the classes available in the detections. Example for Vulnerable Road Users.
            "skater",
            "child",
            "bicycle",
            "bicyclist",
            "cyclist",
            "bike",
            "rider",
            "motorcycle",
            "motorcyclist",
            "pedestrian",
            "person",
            "walker",
            "jogger",
            "runner",
            "skateboarder",
            "scooter",
            "delivery driver",
        ],
    },
    "class_mapping": {
        # Get the source and target dataset names from datasets.yaml
        "dataset_source": "fisheye8k",
        "dataset_target": "mcity_fisheye_2000",
        # Set to True to change detection labels in the dataset; set to False to only add tags without changing labels.
        "change_labels": False,

        # Choose any number of models from 'hf_models_zeroshot_classification' below; to exclude a model from class mapping, comment it out.
        # https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForZeroShotImageClassification
        "hf_models_zeroshot_classification": [
            "Salesforce/blip2-itm-vit-g",
            "openai/clip-vit-large-patch14",
            "google/siglip-so400m-patch14-384",
            # "google/siglip2-base-patch16-224",
            "kakaobrain/align-base",
            "BAAI/AltCLIP",
            "CIDAS/clipseg-rd64-refined",
        ],
        "thresholds": {"confidence": 0.2},
        "candidate_labels": {
            # Target class (generalized class): source classes (specific categories)
            "Car": ["car", "van", "pickup"],
            "Truck": ["truck", "pickup"],
            # One-to-one mapping
            "Bike": ["motorbike/cycler"],
            # Additional class mappings can be added here
        },
    },
    "data_ingest": {
        "dataset_name": "custom_data",
        "annotation_format": "auto",  # Options: "auto", "coco", "voc", "yolo", "image_only", "video"
        "dataset_dir": "/home/dataengine/Downloads/vid",
        "split_percentages": [0.7, 0.15, 0.15],  # Optional train/val/test
        "fps": 2,  # Frames per second when converting a video dataset to a FiftyOne image dataset
    },
}

"""Global settings"""
#: Non-persistent datasets are deleted from the database each time the database is shut down
PERSISTENT = True
#: Accepted splits for data processing
ACCEPTED_SPLITS = ["train", "val", "test"]
cpu_count = len(psutil.Process().cpu_affinity())
#: Max. number of CPU workers
NUM_WORKERS_MAX = 32
NUM_WORKERS = NUM_WORKERS_MAX if cpu_count > NUM_WORKERS_MAX else cpu_count
#: Seed for reproducibility
GLOBAL_SEED = 0

"""Hugging Face Config"""
#: Hugging Face user or organization name
HF_ROOT = "mcity-data-engine"  # https://huggingface.co/mcity-data-engine
#: Determines whether model weights should be uploaded to Hugging Face
HF_DO_UPLOAD = False

"""Weights and Biases Config"""
#: Determines if tracking with Weights and Biases is activated
WANDB_ACTIVE = True

"""Voxel51 Config"""
#: Address for Voxel51 connection
V51_ADDRESS = "localhost"
#: Port for Voxel51 connection
V51_PORT = 5151
#: Remote app sessions will listen to any connection to their ports
V51_REMOTE = True
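
For orientation, a minimal sketch of how downstream code might read these settings. The loop and print call are illustrative assumptions and not part of this module; only the imported names come from the config above.

from config.config import NUM_WORKERS, SELECTED_DATASET, SELECTED_WORKFLOW, WORKFLOWS

# Look up the parameter block of each selected workflow (illustrative consumption only)
for workflow_name in SELECTED_WORKFLOW:
    params = WORKFLOWS[workflow_name]
    print(
        f"Workflow '{workflow_name}' on dataset '{SELECTED_DATASET['name']}' "
        f"with {NUM_WORKERS} workers; parameter keys: {sorted(params)}"
    )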