19. Data loader API
This page documents the data loader API used to list, inspect, and load datasets. For usage examples, see the dataset how-to.
19.1 What it is for
The data loader brick resolves dataset identifiers, downloads raw data, caches processed datasets, and returns canonical splits. [1][2]
19.2 Examples
Load a curated dataset:
from modssc.data_loader import load_dataset
ds = load_dataset("toy", download=True)
print(ds.train.X.shape, ds.train.y.shape)
Inspect the catalog and provider list:
from modssc.data_loader import available_datasets, available_providers, dataset_info
print(available_datasets())
print(available_providers())
print(dataset_info("toy").as_dict())
The public API is exported from src/modssc/data_loader/__init__.py. [3]
19.3 API reference
Dataset download, caching and loading (canonical datasets only).
This module is responsible for:
- resolving dataset identifiers (catalog keys or provider URIs)
- downloading raw data into a local cache
- materializing a canonical dataset (official splits only when provided)
- storing processed data + manifests with stable fingerprints
It does NOT implement experimental splits (holdout, kfold, label fraction).
Those belong to a dedicated sampling/splitting component.
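Experimental splitting therefore happens downstream of the loader. The following sketch shows the hand-off, using scikit-learn's train_test_split as a stand-in for the dedicated sampling/splitting component (scikit-learn is not part of this API, and the split parameters are only illustrative):
from sklearn.model_selection import train_test_split
from modssc.data_loader import load_dataset

ds = load_dataset("toy", download=True)
# Holdout split created outside the data loader, e.g. for an experiment.
X_tr, X_val, y_tr, y_val = train_test_split(
    ds.train.X, ds.train.y, test_size=0.2, random_state=0
)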
19.4 DataLoaderError
Bases: RuntimeError
Base error for modssc.data_loader.
Source code in src/modssc/data_loader/errors.py
class DataLoaderError(RuntimeError):
    """Base error for modssc.data_loader."""
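Because this is the base class for all loader errors, a caller can treat it as a single failure mode. A minimal sketch, assuming the more specific errors (such as DatasetNotCachedError raised by load_dataset) derive from it:
from modssc.data_loader import load_dataset
from modssc.data_loader.errors import DataLoaderError

try:
    ds = load_dataset("toy", download=False)
except DataLoaderError as e:
    # Catches any loader failure, e.g. a missing cache entry when download=False.
    print(f"Could not load dataset: {e}")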
19.5 DatasetIdentity (dataclass)
Resolved dataset identity (provider level).
Source code in src/modssc/data_loader/types.py
@dataclass(frozen=True)
class DatasetIdentity:
    """Resolved dataset identity (provider level)."""

    provider: str
    canonical_uri: str
    dataset_id: str
    version: str | None
    modality: str
    task: str
    required_extra: str | None = None
    resolved_kwargs: Mapping[str, Any] = field(default_factory=dict)

    def as_dict(self) -> dict[str, Any]:
        return {
            "provider": self.provider,
            "canonical_uri": self.canonical_uri,
            "dataset_id": self.dataset_id,
            "version": self.version,
            "modality": self.modality,
            "task": self.task,
            "required_extra": self.required_extra,
            "resolved_kwargs": dict(self.resolved_kwargs),
        }

    def fingerprint_payload(self, *, schema_version: int) -> dict[str, Any]:
        return {
            "schema_version": int(schema_version),
            "provider": self.provider,
            "canonical_uri": self.canonical_uri,
            "dataset_id": self.dataset_id,
            "version": self.version,
            "modality": self.modality,
            "task": self.task,
            "resolved_kwargs": dict(self.resolved_kwargs),
        }

    def fingerprint(self, *, schema_version: int) -> str:
        payload = self.fingerprint_payload(schema_version=schema_version)
        try:
            blob = json.dumps(
                payload, sort_keys=True, separators=(",", ":"), ensure_ascii=True
            ).encode("utf-8")
        except TypeError as e:
            raise ValueError("DatasetIdentity.resolved_kwargs must be JSON serializable.") from e
        return hashlib.sha256(blob).hexdigest()
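The fingerprint is a SHA-256 hash of a canonical JSON payload, so it is deterministic across runs, and fields outside the payload (such as required_extra) never affect it. A small sketch; all field values below are made up for illustration:
from modssc.data_loader.types import DatasetIdentity

identity = DatasetIdentity(
    provider="toy",
    canonical_uri="toy://blobs",
    dataset_id="blobs",
    version=None,
    modality="tabular",
    task="classification",
)
# Deterministic 64-character hex digest; schema_version=1 is only illustrative here.
print(identity.fingerprint(schema_version=1))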
19.6 DatasetRequest (dataclass)
A dataset request.
- id can be a curated key or a provider URI
- options can override or extend catalog source_kwargs
Source code in src/modssc/data_loader/types.py
@dataclass(frozen=True)
class DatasetRequest:
    """A dataset request.

    - id can be a curated key or a provider URI
    - options can override or extend catalog source_kwargs
    """

    id: str
    options: Mapping[str, Any] = field(default_factory=dict)
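In normal use a DatasetRequest is built internally by load_dataset from its dataset_id and options arguments; constructing one directly is mainly relevant for resolver-level code. A sketch; the option name n_samples is hypothetical and not part of any documented catalog entry:
from modssc.data_loader.types import DatasetRequest

# Curated catalog key plus an option that would override or extend source_kwargs.
req = DatasetRequest(id="toy", options={"n_samples": 500})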
19.7 DatasetSpec (dataclass)
Curated dataset spec (catalog entry).
The fingerprint used for caching intentionally ignores documentation-only fields.
Source code in src/modssc/data_loader/types.py
@dataclass(frozen=True)
class DatasetSpec:
    """Curated dataset spec (catalog entry).

    The fingerprint used for caching intentionally ignores documentation-only fields.
    """

    key: str
    provider: str
    uri: str
    modality: str
    task: str
    description: str
    required_extra: str | None = None
    source_kwargs: Mapping[str, Any] = field(default_factory=dict)
    homepage: str | None = None
    license: str | None = None
    citation: str | None = None

    def as_dict(self) -> dict[str, Any]:
        return {
            "key": self.key,
            "provider": self.provider,
            "uri": self.uri,
            "modality": self.modality,
            "task": self.task,
            "description": self.description,
            "required_extra": self.required_extra,
            "source_kwargs": dict(self.source_kwargs),
            "homepage": self.homepage,
            "license": self.license,
            "citation": self.citation,
        }

    def fingerprint_payload(self, *, schema_version: int) -> dict[str, Any]:
        """Payload used to compute the cache fingerprint.

        Only fields that can change dataset bytes are included.
        """
        return {
            "schema_version": int(schema_version),
            "provider": self.provider,
            "uri": self.uri,
            "source_kwargs": dict(self.source_kwargs),
        }

    def fingerprint(self, *, schema_version: int) -> str:
        payload = self.fingerprint_payload(schema_version=schema_version)
        try:
            blob = json.dumps(
                payload, sort_keys=True, separators=(",", ":"), ensure_ascii=True
            ).encode("utf-8")
        except TypeError as e:
            raise ValueError("DatasetSpec.source_kwargs must be JSON serializable.") from e
        return hashlib.sha256(blob).hexdigest()
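Because the fingerprint payload contains only provider, uri, and source_kwargs, editing documentation-only fields such as description or homepage never invalidates cached data. A sketch with made-up catalog values:
from dataclasses import replace
from modssc.data_loader.types import DatasetSpec

spec = DatasetSpec(
    key="toy",
    provider="toy",
    uri="toy://blobs",
    modality="tabular",
    task="classification",
    description="Synthetic blobs",
)
# Rewording the description leaves the cache fingerprint unchanged.
edited = replace(spec, description="Synthetic blobs (reworded docs)")
assert spec.fingerprint(schema_version=1) == edited.fingerprint(schema_version=1)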
19.7.1 fingerprint_payload(*, schema_version)
Payload used to compute the cache fingerprint.
Only fields that can change dataset bytes are included.
Source code in src/modssc/data_loader/types.py
def fingerprint_payload(self, *, schema_version: int) -> dict[str, Any]:
    """Payload used to compute the cache fingerprint.

    Only fields that can change dataset bytes are included.
    """
    return {
        "schema_version": int(schema_version),
        "provider": self.provider,
        "uri": self.uri,
        "source_kwargs": dict(self.source_kwargs),
    }
19.8 DownloadReport (dataclass)
Report returned by download_all_datasets.
Source code in src/modssc/data_loader/types.py
@dataclass(frozen=True)
class DownloadReport:
    """Report returned by download_all_datasets."""

    downloaded: Sequence[str] = ()
    skipped_already_cached: Sequence[str] = ()
    skipped_missing_extras: Sequence[str] = ()
    missing_extras: Mapping[str, Sequence[str]] = field(default_factory=dict)
    failed: Mapping[str, str] = field(default_factory=dict)

    def has_failures(self) -> bool:
        return bool(self.failed)

    def summary(self) -> str:
        lines: list[str] = []
        lines.append(f"Downloaded: {len(self.downloaded)}")
        lines.append(f"Skipped (already cached): {len(self.skipped_already_cached)}")
        lines.append(f"Skipped (missing extras): {len(self.skipped_missing_extras)}")
        lines.append(f"Failed: {len(self.failed)}")
        return "\n".join(lines)
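The report aggregates per-dataset outcomes from a bulk download. A sketch of how it might be consumed; the import location and zero-argument call of download_all_datasets are assumptions, not documented here:
from modssc.data_loader import download_all_datasets  # assumed export

report = download_all_datasets()  # assumed signature
print(report.summary())
if report.has_failures():
    # failed maps dataset key -> error message
    for key, reason in report.failed.items():
        print(f"{key}: {reason}")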
19.9 LoadedDataset (dataclass)
Canonical dataset container.
If the provider supplies official splits, test may be present.
If not, test is None.
This module does not create custom splits.
Source code in src/modssc/data_loader/types.py
@dataclass(frozen=True)
class LoadedDataset:
    """Canonical dataset container.

    If the provider supplies official splits, test may be present.
    If not, test is None.
    This module does not create custom splits.
    """

    train: Split
    test: Split | None = None
    meta: Mapping[str, Any] = field(default_factory=dict)
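Since test is only populated when the provider ships an official split, callers should treat it as optional. A minimal sketch using the curated "toy" key from the earlier examples:
from modssc.data_loader import load_dataset

ds = load_dataset("toy", download=True)
print(ds.train.X.shape, ds.train.y.shape)
if ds.test is not None:
    print("official test split:", ds.test.X.shape)
else:
    print("no official test split; create one with the sampling/splitting component")
print(dict(ds.meta))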
19.10 OptionalDependencyError (dataclass)
Bases: DataLoaderError
Raised when an optional dependency (extra) required by a provider is missing.
Source code in src/modssc/data_loader/errors.py
@dataclass(frozen=True)
class OptionalDependencyError(DataLoaderError):
    """Raised when an optional dependency (extra) required by a provider is missing."""

    extra: str
    purpose: str | None = None

    def __str__(self) -> str:
        msg = f"Missing optional dependency extra: {self.extra!r}."
        if self.purpose:
            msg += f" Required for: {self.purpose}."
        msg += f' Install with: pip install "modssc[{self.extra}]"'
        return msg
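The error message already embeds the pip command, so callers can simply relay it to the user. A sketch; the dataset key below is hypothetical and may not correspond to any catalog entry:
from modssc.data_loader import load_dataset
from modssc.data_loader.errors import OptionalDependencyError

try:
    ds = load_dataset("a-dataset-that-needs-an-extra")  # hypothetical key
except OptionalDependencyError as e:
    # e.g. Missing optional dependency extra: 'vision'. Install with: pip install "modssc[vision]"
    print(e)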
19.11 Split (dataclass)
A canonical dataset split.
X and y are backend-agnostic containers (often numpy arrays).
edges and masks are used for graph datasets.
Source code in src/modssc/data_loader/types.py
@dataclass(frozen=True)
class Split:
    """A canonical dataset split.

    X and y are backend-agnostic containers (often numpy arrays).
    edges and masks are used for graph datasets.
    """

    X: Any
    y: Any
    edges: Any | None = None
    masks: Mapping[str, Any] | None = None
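Tabular datasets typically fill only X and y; graph datasets additionally carry an edge container and named masks. A short sketch of defensive access using the curated "toy" key (whether it populates the graph fields is not stated here):
from modssc.data_loader import load_dataset

split = load_dataset("toy", download=True).train
X, y = split.X, split.y
if split.edges is not None:
    print("edge container:", type(split.edges))
if split.masks:
    print("available masks:", list(split.masks))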
19.12 available_providers()
Public helper: list provider names.
Source code in src/modssc/data_loader/api.py
def available_providers() -> list[str]:
    """Public helper: list provider names."""
    return get_provider_names()
19.13 load_dataset(dataset_id, *, cache_dir=None, download=True, force=False, options=None, as_numpy=False, allow_object=True)
Load a dataset from processed cache, optionally downloading if missing.
Source code in src/modssc/data_loader/api.py
def load_dataset(
    dataset_id: str,
    *,
    cache_dir: Path | None = None,
    download: bool = True,
    force: bool = False,
    options: Mapping[str, Any] | None = None,
    as_numpy: bool = False,
    allow_object: bool = True,
) -> LoadedDataset:
    """Load a dataset from processed cache, optionally downloading if missing."""
    start = perf_counter()
    layout = _layout(cache_dir)
    req = DatasetRequest(id=dataset_id, options=options or {})
    identity = _resolve_identity(req)
    fp = identity.fingerprint(schema_version=SCHEMA_VERSION)
    logger.info(
        "Dataset load: id=%s provider=%s version=%s fingerprint=%s download=%s force=%s cache_dir=%s",
        dataset_id,
        identity.provider,
        identity.version,
        fp,
        bool(download),
        bool(force),
        str(layout.root),
    )
    logger.debug("Dataset resolved_kwargs: %s", dict(identity.resolved_kwargs))
    if not force and cache.is_cached(layout, fp):
        ds = _load_processed(layout, fp)
        n_train, n_classes = _split_stats(ds.train)
        n_test, _ = _split_stats(ds.test)
        logger.info(
            "Dataset cached: id=%s train=%s test=%s n_classes=%s duration_s=%.3f",
            dataset_id,
            n_train,
            n_test,
            n_classes,
            perf_counter() - start,
        )
        return dataset_to_numpy(ds, allow_object=allow_object) if as_numpy else ds
    if not download:
        raise DatasetNotCachedError(dataset_id)
    ds = _download_and_store(layout, identity, force=force)
    n_train, n_classes = _split_stats(ds.train)
    n_test, _ = _split_stats(ds.test)
    logger.info(
        "Dataset ready: id=%s train=%s test=%s n_classes=%s duration_s=%.3f",
        dataset_id,
        n_train,
        n_test,
        n_classes,
        perf_counter() - start,
    )
    return dataset_to_numpy(ds, allow_object=allow_object) if as_numpy else ds
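The flags control cache behaviour: force=True re-downloads even when a processed copy exists, while download=False raises DatasetNotCachedError instead of fetching. A short sketch with the curated "toy" key; the cache directory path is only illustrative:
from pathlib import Path
from modssc.data_loader import load_dataset

# Offline-only load from an explicit cache directory;
# raises DatasetNotCachedError if nothing is cached there.
ds = load_dataset("toy", cache_dir=Path("~/.cache/modssc").expanduser(), download=False)

# Force a fresh download and return numpy-backed splits.
ds_np = load_dataset("toy", force=True, as_numpy=True)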
19.14 to_numpy(value, *, dtype=None, allow_object=True)
Best effort conversion to numpy without importing heavy frameworks.
Source code in src/modssc/data_loader/numpy_adapter.py
def to_numpy(value: Any, *, dtype: Any | None = None, allow_object: bool = True) -> np.ndarray:
    """Best effort conversion to numpy without importing heavy frameworks."""
    if isinstance(value, np.ndarray):
        return value.astype(dtype, copy=False) if dtype is not None else value
    if hasattr(value, "to_numpy"):
        arr = value.to_numpy()
        return np.asarray(arr, dtype=dtype) if dtype is not None else np.asarray(arr)
    obj = value
    if hasattr(obj, "detach"):
        try:
            obj = obj.detach()
        except Exception:
            obj = value
    if hasattr(obj, "cpu"):
        try:
            obj = obj.cpu()
        except Exception:
            obj = obj
    if hasattr(obj, "numpy"):
        try:
            arr = obj.numpy()
            return np.asarray(arr, dtype=dtype) if dtype is not None else np.asarray(arr)
        except Exception:
            pass
    try:
        return np.asarray(obj, dtype=dtype)
    except Exception:
        if allow_object:
            arr = np.empty((1,), dtype=object)
            arr[0] = obj
            return arr
        raise
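The duck-typed checks let pandas objects (via to_numpy) and torch tensors (via detach/cpu/numpy) convert without those libraries being imported here. A small sketch with plain Python inputs:
import numpy as np
from modssc.data_loader.numpy_adapter import to_numpy

print(to_numpy([1, 2, 3], dtype=np.float32))          # array([1., 2., 3.], dtype=float32)
print(to_numpy(np.arange(4), dtype=np.int64).dtype)   # int64, avoiding a copy when possible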
Sources
[1] src/modssc/data_loader/api.py
[2] src/modssc/data_loader/types.py
[3] src/modssc/data_loader/__init__.py