Creating a Custom Dataset
Defining a SymbolLibrary
The SymbolLibrary defines which tokens an approach may use. Start from the curated default set and restrict it to what's relevant for your problem:
from SRToolkit.utils import SymbolLibrary
# Build a library limited to the tokens relevant for a 2-variable problem
allowed_tokens = ["+", "-", "*", "/", "sin", "cos", "exp", "sqrt", "^2", "^3", "C"]
sl = SymbolLibrary.from_symbol_list(allowed_tokens, num_variables=2)
The full list of supported default tokens is documented in SymbolLibrary.default_symbols.
Creating a standalone SR_dataset
SR_dataset wraps input data and evaluation settings for one problem:
import numpy as np
from SRToolkit.dataset import SR_dataset
from SRToolkit.utils import SymbolLibrary
# Sample 1000 two-variable inputs from a seeded generator so the data is reproducible
rng = np.random.default_rng(0)
X = rng.uniform(0.5, 5.0, size=(1000, 2))

# Target values follow the known equation x0^2 + sin(x1)
x0, x1 = X.T
y = x0 ** 2 + np.sin(x1)

# Tokens the SR approach may use for this problem
sl = SymbolLibrary.from_symbol_list(["+", "*", "sin", "^2", "C"], num_variables=2)

# Known solution, written as a token list
ground_truth_tokens = ["X_0", "^2", "+", "sin", "(", "X_1", ")"]

dataset = SR_dataset(
    # Data and symbols
    X=X,
    y=y,
    symbol_library=sl,
    # Problem description
    dataset_name="my_equation",
    original_equation="x0^2 + sin(x1)",
    ground_truth=ground_truth_tokens,
    # Evaluation settings
    max_evaluations=50000,
    success_threshold=1e-6,
    constant_bounds=(-10.0, 10.0),
    max_expr_length=20,
)
Key parameters:
| Parameter | Purpose |
|---|---|
| ground_truth | Token list of the known solution; used to compute BED and check success |
| success_threshold | Error below which an expression counts as solved |
| constant_bounds | Search range for free constants during parameter fitting |
| max_evaluations | Budget passed to the SR approach |
| max_expr_length | Maximum token list length the evaluator will accept |
Building a benchmark with SR_benchmark
Group multiple datasets into a reusable benchmark by subclassing SR_benchmark:
import os
import numpy as np
from platformdirs import user_data_dir
from SRToolkit.dataset import SR_benchmark
from SRToolkit.utils import SymbolLibrary
_SYMBOL_LIST = ["+", "-", "*", "/", "sin", "cos", "^2", "^3", "C"]


class MyBenchmark(SR_benchmark):
    """Example benchmark that groups two datasets, ``eq1`` and ``eq2``."""

    def __init__(
        self,
        dataset_directory=os.path.join(user_data_dir("SRToolkit"), "my_benchmark"),
    ):
        """Register the benchmark as "MyBenchmark" and add its datasets.

        Args:
            dataset_directory: Directory handed to ``SR_benchmark``; defaults
                to a ``my_benchmark`` folder inside the SRToolkit user data
                directory.
        """
        super().__init__("MyBenchmark", dataset_directory)
        self._populate()

    def _populate(self):
        """Create the data directory if needed and register ``eq1`` and ``eq2``."""
        os.makedirs(self.base_dir, exist_ok=True)

        # Evaluation settings that are identical for every dataset here.
        shared_settings = {
            "ranking_function": "rmse",
            "max_evaluations": 50000,
            "success_threshold": 1e-6,
            "constant_bounds": (-10.0, 10.0),
            "max_expr_length": 20,
            "seed": None,
        }

        # Same token set, differing only in the number of input variables.
        sl_1v = SymbolLibrary.from_symbol_list(_SYMBOL_LIST, num_variables=1)
        sl_2v = SymbolLibrary.from_symbol_list(_SYMBOL_LIST, num_variables=2)

        self.add_dataset(
            "",  # "" → load from base_dir/dataset_name.npz
            sl_1v,
            dataset_name="eq1",
            ground_truth=["X_0", "^2", "+", "C"],
            original_equation="x^2 + c",
            **shared_settings,
        )
        self.add_dataset(
            "",  # "" → load from base_dir/dataset_name.npz
            sl_2v,
            dataset_name="eq2",
            ground_truth=["sin", "(", "X_0", ")", "+", "X_1", "^2"],
            original_equation="sin(x0) + x1^2",
            **shared_settings,
        )

    def resample(self, dataset_name: str, n: int, seed=None):
        """Draw fresh inputs for a dataset and evaluate its ground truth on them.

        Args:
            dataset_name: Key of the dataset in ``self.datasets``.
            n: Number of rows to sample.
            seed: Seed forwarded to ``np.random.default_rng``.

        Returns:
            The pair ``(X, y)``. ``X`` has shape ``(n, num_variables)`` with
            values drawn uniformly from [0.5, 5.0); ``y`` is the result of
            calling the compiled ground-truth expression on ``X``.
        """
        # Imported here because this helper is not among the module-level imports.
        from SRToolkit.utils.expression_compiler import compile_expr

        info = self.datasets[dataset_name]
        sl = SymbolLibrary.from_dict(info["symbol_library"])

        rng = np.random.default_rng(seed)
        X = rng.uniform(0.5, 5.0, size=(n, sl.num_variables))

        f = compile_expr(info["ground_truth"], sl)
        # NOTE(review): an empty constants array is passed, so this presumably
        # only works for ground truths without a free constant "C" (eq1 has
        # one) — confirm against compile_expr before relying on it.
        y = f(X, np.array([]))
        return X, y
The add_dataset data argument
The first positional argument to add_dataset controls where data comes from:
| Value | Behaviour |
|---|---|
| "" | Load {base_dir}/{dataset_name}.npz from disk |
| np.ndarray (X only) | Compute y from ground_truth, then save .npz |
| (X, y) tuple | Use directly and save .npz |
Pass a numpy array on first run to generate and cache the data, then switch to "" for subsequent loads — or generate the .npz files externally and always use "".