Skip to content

Commit cb1f50b

Browse files
Aishwarya-TonpeUbuntu
authored and committed
Lint fixes
1 parent 575859b commit cb1f50b

17 files changed

Lines changed: 940 additions & 107 deletions

File tree

docs/user-tutorial/benchmarks/model-benchmarks.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,33 @@ For inference, supported percentiles include
3434

3535
**New: Support fp8_hybrid and fp8_e4m3 precision for BERT models.**
3636

37+
**New: Deterministic Training Support**
38+
SuperBench now supports deterministic training to ensure reproducibility across runs. This includes fixed seeds and deterministic algorithms. To enable deterministic training, use the following flags:
39+
40+
- **Flags:**
41+
- `--enable_determinism`: Enables deterministic computation for reproducible results.
42+
- `--deterministic_seed <seed>`: Sets the seed for reproducibility (default: 42).
43+
- `--check_frequency <steps>`: How often to record deterministic metrics (default: 100).
44+
45+
- **Environment Variables (set automatically by SuperBench when `--enable_determinism` is used):**
46+
- `CUBLAS_WORKSPACE_CONFIG=:4096:8`: Ensures deterministic behavior in cuBLAS. This can be overridden by setting it manually before running SuperBench.
47+
48+
**Comparing Deterministic Results**
49+
50+
To compare deterministic results between runs, use the standard result analysis workflow:
51+
52+
1. Run benchmark with `--enable_determinism` flag
53+
2. Generate baseline: `sb result generate-baseline --data-file results.jsonl --summary-rule-file rules.yaml`
54+
3. Compare future runs: `sb result diagnosis --data-file new-results.jsonl --rule-file rules.yaml --baseline-file baseline.json`
55+
56+
This allows configurable tolerance for floating-point differences via YAML rules.
57+
58+
**Configuration Parameter Validation**
59+
60+
When determinism is enabled, benchmark configuration parameters (batch_size, num_steps, deterministic_seed, etc.) are automatically recorded in the results file as `deterministic_config_*` metrics. The diagnosis rules enforce exact matching of these parameters between runs to ensure valid comparisons:
61+
62+
If any configuration parameter differs between runs, the diagnosis will flag it as a failure, ensuring you only compare runs with identical configurations.
63+
3764
#### Metrics
3865

3966
| Name | Unit | Description |
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
# Copyright (c) Microsoft Corporation.
2+
# Licensed under the MIT license.
3+
4+
"""Unified PyTorch deterministic training example for all supported models.
5+
6+
Deterministic metrics (loss, activation mean) are automatically stored in results
7+
when --enable_determinism flag is enabled.
8+
9+
To compare deterministic results between runs, use the `sb result diagnosis` command
10+
with a baseline file and comparison rules. See the SuperBench documentation for details.
11+
12+
Example workflow:
13+
1. Run first benchmark (creates outputs/<timestamp>/results-summary.jsonl):
14+
python3 examples/benchmarks/pytorch_deterministic_example.py \
15+
--model resnet101 --enable_determinism --deterministic_seed 42
16+
17+
2. Generate baseline from results:
18+
sb result generate-baseline --data-file outputs/<timestamp>/results-summary.jsonl \
19+
--summary-rule-file summary-rules.yaml --output-dir outputs/<timestamp>
20+
21+
3. Run second benchmark:
22+
python3 examples/benchmarks/pytorch_deterministic_example.py \
23+
--model resnet101 --enable_determinism --deterministic_seed 42
24+
25+
4. Compare runs with diagnosis:
26+
sb result diagnosis --data-file outputs/<run2-timestamp>/results-summary.jsonl \
27+
--rule-file rules.yaml --baseline-file outputs/<run1-timestamp>/baseline.json
28+
29+
Note: CUBLAS_WORKSPACE_CONFIG is now automatically set by the code when determinism is enabled.
30+
"""
31+
32+
import argparse
33+
import json
34+
import socket
35+
from datetime import datetime
36+
from pathlib import Path
37+
from superbench.benchmarks import BenchmarkRegistry, Framework
38+
from superbench.common.utils import logger
39+
40+
# Models supported by this deterministic-training example.
MODEL_CHOICES = [
    'bert-large',
    'gpt2-small',
    'llama2-7b',
    'mixtral-8x7b',
    'resnet101',
    'lstm',
]

# Default benchmark parameter string per model. Each entry is passed verbatim
# to BenchmarkRegistry.create_benchmark_context(); determinism flags are
# appended at runtime in main() based on the CLI arguments.
DEFAULT_PARAMS = {
    'bert-large':
        '--batch_size 1 --seq_len 64 --num_warmup 1 --num_steps 200 --precision float32 '
        '--model_action train --check_frequency 20',
    'gpt2-small':
        '--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 128 --precision float32 '
        '--model_action train --check_frequency 20',
    'llama2-7b':
        '--batch_size 1 --num_steps 300 --num_warmup 1 --seq_len 512 --precision float32 --model_action train '
        '--check_frequency 20',
    'mixtral-8x7b':
        '--hidden_size 4096 --num_hidden_layers 32 --num_attention_heads 32 --intermediate_size 14336 '
        '--num_key_value_heads 8 --max_position_embeddings 32768 --router_aux_loss_coef 0.02 '
        '--check_frequency 20',
    'resnet101':
        '--batch_size 1 --precision float32 --num_warmup 1 --num_steps 120 --sample_count 8192 '
        '--pin_memory --model_action train --check_frequency 20',
    'lstm':
        '--batch_size 1 --num_steps 100 --num_warmup 2 --seq_len 64 --precision float32 '
        '--model_action train --check_frequency 30',
}
70+
71+
72+
def main():
    """Run one deterministic-training benchmark and write summary results.

    Parses the CLI arguments, builds the benchmark parameter string from
    DEFAULT_PARAMS plus the optional determinism flags, launches the benchmark
    through BenchmarkRegistry, and writes two files into a timestamped
    ``outputs/<timestamp>/`` directory:

    * ``results-summary.jsonl`` — flattened metrics in the format expected by
      ``sb result`` commands (``model-benchmarks:<category>/<benchmark>/<metric>``).
    * ``results-full.json`` — the complete serialized benchmark result.
    """
    parser = argparse.ArgumentParser(description='Unified PyTorch deterministic training example.')
    parser.add_argument('--model', type=str, choices=MODEL_CHOICES, required=True, help='Model to run.')
    parser.add_argument(
        '--enable_determinism',
        action='store_true',
        help='Enable deterministic mode for reproducible results.',
    )
    parser.add_argument(
        '--deterministic_seed',
        type=int,
        default=None,
        help='Seed for deterministic training.',
    )
    args = parser.parse_args()

    parameters = DEFAULT_PARAMS[args.model]
    if args.enable_determinism:
        parameters += ' --enable_determinism'
    # NOTE(review): the original diff is ambiguous on whether the seed flag was
    # nested under --enable_determinism; here it is forwarded whenever given.
    if args.deterministic_seed is not None:
        parameters += f' --deterministic_seed {args.deterministic_seed}'

    context = BenchmarkRegistry.create_benchmark_context(args.model, parameters=parameters, framework=Framework.PYTORCH)
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    logger.info(f'Benchmark finished. Return code: {benchmark.return_code}')

    # Create timestamped output directory.
    timestamp = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    output_dir = Path('outputs') / timestamp
    output_dir.mkdir(parents=True, exist_ok=True)

    # Parse benchmark results.
    benchmark_results = json.loads(benchmark.serialized_result)
    benchmark_name = benchmark_results.get('name', f'pytorch-{args.model}')

    # Convert to results-summary.jsonl format (flattened keys) compatible with
    # sb result commands: model-benchmarks:<category>/<benchmark>/<metric>.
    summary = {}
    prefix = f'model-benchmarks:example:determinism/{benchmark_name}'
    if 'result' in benchmark_results:
        for metric, values in benchmark_results['result'].items():
            # Use first value if it's a list.
            val = values[0] if isinstance(values, list) else values
            # Add _rank0 suffix to deterministic metrics for compatibility with rules.
            if metric.startswith('deterministic_'):
                metric_key = f'{prefix}/{metric}_rank0'
            else:
                metric_key = f'{prefix}/{metric}'
            summary[metric_key] = val

    # Add node identifier so diagnosis can group results per host.
    summary['node'] = socket.gethostname()

    # Write results-summary.jsonl (single JSON object per line).
    summary_file = output_dir / 'results-summary.jsonl'
    with open(summary_file, 'w') as f:
        f.write(json.dumps(summary))
    logger.info(f'Results saved to {summary_file}')

    # Also save full results for reference.
    full_results_file = output_dir / 'results-full.json'
    with open(full_results_file, 'w') as f:
        json.dump(benchmark_results, f, indent=2)

    if 'raw_data' in benchmark_results and 'deterministic_loss' in benchmark_results['raw_data']:
        num_checkpoints = len(benchmark_results['raw_data']['deterministic_loss'][0])
        logger.info(f'Periodic fingerprints collected at {num_checkpoints} checkpoints')

    logger.info(
        f'To generate baseline: sb result generate-baseline '
        f'--data-file {summary_file} --summary-rule-file summary-rules.yaml '
        f'--output-dir {output_dir}'
    )
    logger.info('To compare results between runs, use `sb result diagnosis` command.')
147+
148+
149+
# Script entry point: only run the benchmark when executed directly.
if __name__ == '__main__':
    main()

superbench/analyzer/baseline_generation.py

Lines changed: 29 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,33 @@ def generate_baseline(self, algo, aggregated_df, diagnosis_rule_file, baseline):
150150
aggregated_df[metrics[index]] = out[1]
151151
return baseline
152152

153+
def _format_metric_value(self, metric, val, digit):
154+
"""Format a single baseline metric value based on its type.
155+
156+
Args:
157+
metric (str): the metric name.
158+
val: the metric value.
159+
digit (int): the number of digits after the decimal point.
160+
161+
Returns:
162+
The formatted metric value.
163+
"""
164+
if metric not in self._raw_data_df:
165+
return val
166+
sample = self._raw_data_df[metric].iloc[0]
167+
if isinstance(sample, float):
168+
# Keep full precision for deterministic metrics to avoid false positives in diagnosis
169+
if 'deterministic' in metric:
170+
return float(val)
171+
return f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
172+
if isinstance(sample, int):
173+
return int(val)
174+
try:
175+
return float(val)
176+
except Exception as e:
177+
logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
178+
return val
179+
153180
def run(
154181
self, raw_data_file, summary_rule_file, diagnosis_rule_file, pre_baseline_file, algorithm, output_dir, digit=2
155182
):
@@ -174,19 +201,9 @@ def run(
174201
# generate baseline accordint to rules in diagnosis and fix threshold outlier detection method
175202
baseline = self.generate_baseline(algorithm, self._raw_data_df, diagnosis_rule_file, baseline)
176203
for metric in baseline:
177-
val = baseline[metric]
178-
if metric in self._raw_data_df:
179-
if isinstance(self._raw_data_df[metric].iloc[0], float):
180-
baseline[metric] = f'%.{digit}g' % val if abs(val) < 1 else f'%.{digit}f' % val
181-
elif isinstance(self._raw_data_df[metric].iloc[0], int):
182-
baseline[metric] = int(val)
183-
else:
184-
try:
185-
baseline[metric] = float(val)
186-
except Exception as e:
187-
logger.error('Analyzer: {} baseline is not numeric, msg: {}'.format(metric, str(e)))
204+
baseline[metric] = self._format_metric_value(metric, baseline[metric], digit)
188205
baseline = json.dumps(baseline, indent=2, sort_keys=True)
189-
baseline = re.sub(r': \"(\d+.?\d*)\"', r': \1', baseline)
206+
baseline = re.sub(r': \"(-?\d+\.?\d*)\"', r': \1', baseline)
190207
with (Path(output_dir) / 'baseline.json').open('w') as f:
191208
f.write(baseline)
192209

superbench/analyzer/data_diagnosis.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,10 @@ def output_all_nodes_results(self, raw_data_df, data_not_accept_df):
238238
'Category','Defective Details']
239239
"""
240240
append_columns = ['Accept', 'Number Of Issues', 'Category', 'Defective Details']
241-
all_data_df = (raw_data_df).astype('float64')
241+
# Preserve all columns, but only convert numeric columns to float64
242+
all_data_df = raw_data_df.copy()
243+
numeric_cols = all_data_df.select_dtypes(include=['number']).columns
244+
all_data_df[numeric_cols] = all_data_df[numeric_cols].astype('float64')
242245

243246
if data_not_accept_df.shape[0] == 0:
244247
all_data_df['Accept'] = [True for i in range(len(all_data_df))]

superbench/benchmarks/base.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,14 +110,25 @@ def parse_args(self, ignore_invalid=False):
110110
logger.error('Invalid argument - benchmark: {}, message: {}.'.format(self._name, str(e)))
111111
return False, None, []
112112

113-
ret = True
113+
ret = self._check_unknown_args(unknown)
114+
115+
return ret, args, unknown
116+
117+
def _check_unknown_args(self, unknown):
118+
"""Check for unknown arguments and log an error if any are found.
119+
120+
Args:
121+
unknown (list): List of unknown arguments.
122+
123+
Returns:
124+
bool: False if unknown arguments are found, True otherwise.
125+
"""
114126
if len(unknown) > 0:
115127
logger.error(
116128
'Unknown arguments - benchmark: {}, unknown arguments: {}'.format(self._name, ' '.join(unknown))
117129
)
118-
ret = False
119-
120-
return ret, args, unknown
130+
return False
131+
return True
121132

122133
def _preprocess(self):
123134
"""Preprocess/preparation operations before the benchmarking.

superbench/benchmarks/model_benchmarks/model_base.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,17 @@ def _generate_dataset(self):
186186
"""
187187
pass
188188

189+
def set_deterministic_seed(self):
    """Hook to set deterministic RNG state before dataset generation.

    Framework-specific subclasses may override this to apply deterministic
    RNG settings (for example, PyTorch benchmarks implement this to call
    their deterministic setup when requested). This is called from
    _preprocess() before _generate_dataset(). The base implementation is a
    deliberate no-op.

    Returns:
        None.
    """
    return None
199+
189200
@abstractmethod
190201
def _init_dataloader(self):
191202
"""Initialize the dataloader.
@@ -221,6 +232,12 @@ def _preprocess(self):
221232
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
222233
return False
223234

235+
# Invoke model-specific deterministic seeding hook before dataset generation
236+
try:
237+
self.set_deterministic_seed()
238+
except Exception:
239+
logger.info('set_deterministic_seed() hook failed or not implemented for model: %s', self._name)
240+
224241
# Set sample_count aligned with batch_size.
225242
self._args.sample_count = math.ceil(self._args.sample_count / self._args.batch_size) * self._args.batch_size
226243

0 commit comments

Comments
 (0)