# Makefile for evalscope (modelscope/evalscope)

# default rule: a bare `make` builds this target, which simply runs `install`.
# Declared phony so that a file or directory named `default` can never make
# the target look up to date and silently skip the install.
.PHONY: default
default: install

# ============================================================================
# Documentation Generation
# ============================================================================
#
# WORKFLOW (full pipeline):
#   docs-update → docs-translate → docs-generate → docs-en / docs-zh
#
# STEP-BY-STEP:
#   Step 1  docs-update[-stats]  Read adapter metadata → write _meta/<name>.json
#   Step 2  docs-translate       Translate readme.en → readme.zh via LLM API
#   Step 3  docs-generate        Read all _meta/*.json → write docs/*/benchmarks/*.md
#   Step 4  docs-en / docs-zh    Sphinx build → docs/*/build/html/
#
# WHAT IS AFFECTED:
#   docs-update        Writes evalscope/benchmarks/_meta/<name>.json (metadata only)
#   docs-update-stats  Same as above + downloads dataset to compute sample statistics
#   docs-translate     Updates readme.zh field inside each _meta/<name>.json
#   docs-generate      Overwrites docs/en/benchmarks/*.md + docs/zh/benchmarks/*.md
#
# PARAMETERS:
#   BENCHMARK  Specific benchmark name (e.g. gsm8k, mmlu).
#              Omit to process ALL registered benchmarks.
#   FORCE=1    Force recompute/re-translate even when data already exists.
#              Any non-empty value enables it, including FORCE=0; leave unset to disable.
#              Applies to docs-update, docs-update-stats, docs-translate, and docs-pipeline.
#   WORKERS    Parallel worker count for update / translate (default: 4).
#
# COMMON USAGE:
#   make docs                               # translate → generate → build HTML (does NOT run docs-update)
#   make docs-update                        # Update metadata for ALL benchmarks
#   make docs-update BENCHMARK=gsm8k        # Update metadata for ONE benchmark
#   make docs-update BENCHMARK="gsm8k mmlu"  # Update metadata for MULTIPLE benchmarks
#   make docs-update-stats                  # Update metadata + stats for ALL benchmarks
#   make docs-update-stats BENCHMARK=gsm8k  # Update metadata + stats for ONE benchmark
#   make docs-update-stats BENCHMARK="gsm8k mmlu"  # Update metadata + stats for MULTIPLE benchmarks
#   make docs-translate                     # Translate only untranslated benchmarks (ALL)
#   make docs-translate BENCHMARK=gsm8k     # Translate ONE benchmark (skip if done)
#   make docs-translate BENCHMARK="gsm8k mmlu"  # Translate MULTIPLE benchmarks
#   make docs-translate FORCE=1             # Force re-translate ALL benchmarks
#   make docs-translate BENCHMARK=gsm8k FORCE=1  # Force re-translate ONE benchmark
#   make docs-pipeline BENCHMARK=gsm8k     # update-stats + translate + generate for ONE benchmark
#   make docs-pipeline BENCHMARK="gsm8k mmlu"  # update-stats + translate + generate for MULTIPLE
#   make docs-pipeline BENCHMARK=gsm8k FORCE=1  # Force update-stats + translate + generate
#   make docs-generate                      # Regenerate .md files from persisted JSON data
#   make docs-en                            # Build English HTML docs only
#   make docs-zh                            # Build Chinese HTML docs only
#
# ============================================================================

# Parameters (override on the command line, e.g. `make docs-update BENCHMARK=gsm8k FORCE=1`)
# BENCHMARK: one or more benchmark names, space-separated (e.g. BENCHMARK="gsm8k mmlu")
# FORCE:     any non-blank value adds --force
# WORKERS:   value passed to --workers
BENCHMARK ?=
FORCE     ?=
WORKERS   ?= 4

# Internal helpers
# Both helpers stay recursively expanded (`=`) so they always reflect the final
# values of BENCHMARK / FORCE, and both $(strip ...) their input so that a
# whitespace-only value (e.g. BENCHMARK=" ") counts as "not set" instead of
# producing a command line with neither benchmark names nor --all.
#
# When BENCHMARK is set: pass name(s) as positional args; otherwise use --all flag
_BENCH_ARGS = $(if $(strip $(BENCHMARK)),$(strip $(BENCHMARK)),--all)
# When FORCE is non-blank (e.g. FORCE=1): append --force flag
_FORCE_FLAG = $(if $(strip $(FORCE)),--force)

# Translate, regenerate the benchmark pages, then build both HTML sites.
# Does not run docs-update; run that first if the metadata needs refreshing.
#
# The four steps are run as sequential sub-makes rather than listing
# docs-translate and docs-generate as prerequisites: sibling prerequisites may
# be built concurrently under `make -j`, but docs-generate must only start once
# docs-translate has finished writing the _meta JSON it reads.
# BENCHMARK / FORCE / WORKERS given on the command line reach the sub-makes
# automatically via MAKEFLAGS.
.PHONY: docs
docs:
	$(MAKE) docs-translate
	$(MAKE) docs-generate
	$(MAKE) docs-en
	$(MAKE) docs-zh

# Step 1 (metadata only): run `benchmark-info --update` for the benchmarks named
# in BENCHMARK, or for all of them (--all) when BENCHMARK is unset.
# FORCE adds --force; WORKERS is passed to --workers.
.PHONY: docs-update
docs-update:
	python -m evalscope.cli.cli benchmark-info \
		$(_BENCH_ARGS) \
		--update $(_FORCE_FLAG) \
		--workers $(WORKERS)

# Step 1 (metadata + statistics): same invocation as docs-update with
# --compute-stats added. Honors BENCHMARK, FORCE and WORKERS in the same way.
.PHONY: docs-update-stats
docs-update-stats:
	python -m evalscope.cli.cli benchmark-info \
		$(_BENCH_ARGS) \
		--update --compute-stats $(_FORCE_FLAG) \
		--workers $(WORKERS)

# Step 2: run `benchmark-info --translate` for the benchmarks named in
# BENCHMARK, or for all of them (--all) when BENCHMARK is unset.
# FORCE adds --force; WORKERS is passed to --workers.
.PHONY: docs-translate
docs-translate:
	python -m evalscope.cli.cli benchmark-info \
		$(_BENCH_ARGS) \
		--translate $(_FORCE_FLAG) \
		--workers $(WORKERS)

# Steps 1-3 in one go: update metadata + stats, translate, then regenerate the
# benchmark pages. Does not build HTML (use docs-en / docs-zh for that).
#
# Delegates to the individual targets instead of repeating their command lines,
# so the pipeline can never drift from what docs-update-stats, docs-translate
# and docs-generate actually run. Each step is a separate recipe line, so the
# steps execute strictly in order and a failure stops the pipeline.
# BENCHMARK / FORCE / WORKERS given on the command line reach the sub-makes
# automatically via MAKEFLAGS.
.PHONY: docs-pipeline
docs-pipeline:
	$(MAKE) docs-update-stats
	$(MAKE) docs-translate
	$(MAKE) docs-generate

# Step 3: regenerate the benchmark .md pages (docs/en and docs/zh) from the
# persisted _meta JSON by running `benchmark-info --generate-docs`.
# Takes no BENCHMARK / FORCE / WORKERS arguments.
.PHONY: docs-generate
docs-generate:
	python -m evalscope.cli.cli benchmark-info --generate-docs

# Step 4 (English): clean and rebuild the HTML docs using the Makefile in docs/en.
# Uses $(MAKE) rather than a literal `make` so the sub-make is the same make
# binary, takes part in the -j jobserver, and is still traversed under `make -n`.
# clean and html stay chained with && (not passed as two goals to one sub-make)
# so that html can never start before clean has finished.
.PHONY: docs-en
docs-en:
	cd docs/en && $(MAKE) clean && $(MAKE) html

# Step 4 (Chinese): clean and rebuild the HTML docs using the Makefile in docs/zh.
# Uses $(MAKE) rather than a literal `make` so the sub-make is the same make
# binary, takes part in the -j jobserver, and is still traversed under `make -n`.
# clean and html stay chained with && (not passed as two goals to one sub-make)
# so that html can never start before clean has finished.
.PHONY: docs-zh
docs-zh:
	cd docs/zh && $(MAKE) clean && $(MAKE) html

# ============================================================================
# Frontend (evalscope/web)
# ============================================================================

# Install the frontend's npm dependencies (runs `npm install` inside evalscope/web).
.PHONY: web-install
web-install:
	cd evalscope/web && npm install

# Build the frontend (`npm run build` inside evalscope/web).
# Dependency installation is delegated to the web-install prerequisite instead
# of repeating `npm install` inline; make finishes web-install before this
# recipe starts, and runs it only once even if several web-* goals are requested
# in the same invocation.
.PHONY: web-build
web-build: web-install
	cd evalscope/web && npm run build

# Run the frontend's `dev` npm script (`npm run dev` inside evalscope/web).
# Dependency installation is delegated to the web-install prerequisite instead
# of repeating `npm install` inline; make finishes web-install before this
# recipe starts, and runs it only once even if several web-* goals are requested
# in the same invocation.
.PHONY: web-dev
web-dev: web-install
	cd evalscope/web && npm run dev

# ============================================================================
# Development
# ============================================================================

# Run the pre-commit hooks against every file in the repository.
# Needs the `pre-commit` tool on PATH (`make dev` installs it).
.PHONY: lint
lint:
	pre-commit run --all-files

# Set up a development environment: editable install of the package with the
# dev, perf and docs extras, followed by the pre-commit tool used by `make lint`.
.PHONY: dev
dev:
	pip install --editable ".[dev,perf,docs]"
	pip install pre-commit

# Editable install of the package from the current directory, without extras.
# This is what a bare `make` ends up running (see the `default` target).
.PHONY: install
install:
	pip install --editable .