Commit 8734df7

remove --multi-token-probs

1 parent: 75fe775
7 files changed: +0, -24 lines

common/arg.cpp (-10)

@@ -1085,16 +1085,6 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.sampling.grammar = json_schema_to_grammar(json::parse(value));
         }
     ).set_sparam());
-    add_opt(common_arg(
-        {"-mtp", "--multi-token-probs"},
-        string_format(
-            "allow getting probabilities for multiple tokens. note: this will slow down the generation speed (default: %s)",
-            params.sampling.multi_token_probs ? "enabled" : "disabled"
-        ),
-        [](common_params & params) {
-            params.sampling.multi_token_probs = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MULTI_TOKEN_PROBS"));
     add_opt(common_arg(
         {"--pooling"}, "{none,mean,cls,last,rank}",
         "pooling type for embeddings, use model default if unspecified",

common/common.h (-1)

@@ -134,7 +134,6 @@ struct common_params_sampling
     bool ignore_eos = false;
     bool no_perf = false; // disable performance metrics
     bool timing_per_token = false;
-    bool multi_token_probs = false; // output probabilities for multiple tokens (when n_probs > 0)
 
     std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
 
examples/server/README.md (-1)

@@ -139,7 +139,6 @@ The project is under active development, and we are [looking for feedback and co
 | `-sp, --special` | special tokens output enabled (default: false) |
 | `--no-warmup` | skip warming up the model with an empty run |
 | `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
-| `-mtp, --multi-token-probs` | allow getting probabilities for multiple tokens. note: this will slow down the generation speed (default: disabled)<br/>(env: LLAMA_ARG_MULTI_TOKEN_PROBS) |
 | `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
 | `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
 | `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |

examples/server/server.cpp (-4)

@@ -243,10 +243,6 @@ struct server_task
         params.speculative.n_min = std::max(params.speculative.n_min, 2);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
-        if (!params_base.sampling.multi_token_probs && params.n_predict > 1 && params.sampling.n_probs > 0) {
-            throw std::runtime_error("For performance reason, n_probs with n_predict > 1 is not allowed. To enable this, start the server with --multi-token-probs");
-        }
-
         // TODO: add more sanity checks for the input parameters
 
         if (params.sampling.penalty_last_n < -1) {
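
With this check gone, the server accepts completion requests that combine `n_probs` with multi-token generation without any extra startup flag. As a rough illustration of the effect (not part of this commit), the sketch below sends such a request to a llama.cpp server assumed to be running locally on port 8080, using only the Python standard library; the prompt, host, and port are placeholders, and `completion_probabilities` is the field the `/completion` endpoint has used for per-token probabilities.

```python
# Sketch only: assumes a llama.cpp server (llama-server) listening on
# localhost:8080. Before this commit, n_probs > 0 together with
# n_predict > 1 was rejected unless the server was started with
# --multi-token-probs; after it, the request below is accepted as-is.
import json
import urllib.request

payload = {
    "prompt": "I believe the meaning of life is",  # placeholder prompt
    "n_predict": 8,  # generate more than one token
    "n_probs": 5,    # ask for top-5 probabilities per generated token
}

req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    result = json.load(resp)

# One entry per generated token when probabilities are returned.
print(result.get("completion_probabilities", []))
```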

examples/server/tests/unit/test_chat_completion.py (-2)

@@ -166,7 +166,6 @@ def test_chat_completion_with_timings_per_token():
 
 def test_logprobs():
     global server
-    server.multi_token_probs = True
     server.start()
     client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
     res = client.chat.completions.create(

@@ -194,7 +193,6 @@ def test_logprobs():
 
 def test_logprobs_stream():
     global server
-    server.multi_token_probs = True
     server.start()
     client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}")
     res = client.chat.completions.create(

examples/server/tests/unit/test_completion.py (-3)

@@ -259,7 +259,6 @@ def check_slots_status():
 
 def test_n_probs():
     global server
-    server.multi_token_probs = True
     server.start()
     res = server.make_request("POST", "/completion", data={
         "prompt": "I believe the meaning of life is",

@@ -285,7 +284,6 @@ def test_n_probs():
 
 def test_n_probs_stream():
     global server
-    server.multi_token_probs = True
     server.start()
     res = server.make_stream_request("POST", "/completion", data={
         "prompt": "I believe the meaning of life is",

@@ -313,7 +311,6 @@ def test_n_probs_stream():
 
 def test_n_probs_post_sampling():
     global server
-    server.multi_token_probs = True
     server.start()
     res = server.make_request("POST", "/completion", data={
         "prompt": "I believe the meaning of life is",

examples/server/tests/utils.py (-3)

@@ -74,7 +74,6 @@ class ServerProcess:
     draft_min: int | None = None
     draft_max: int | None = None
     no_webui: bool | None = None
-    multi_token_probs: bool | None = None
 
     # session variables
     process: subprocess.Popen | None = None

@@ -165,8 +164,6 @@ def start(self, timeout_seconds: int = 10) -> None:
             server_args.extend(["--draft-min", self.draft_min])
         if self.no_webui:
             server_args.append("--no-webui")
-        if self.multi_token_probs:
-            server_args.append("--multi-token-probs")
 
         args = [str(arg) for arg in [server_path, *server_args]]
         print(f"bench: starting server with: {' '.join(args)}")
