1 change: 1 addition & 0 deletions litellm/__init__.py
@@ -305,6 +305,7 @@
####################
logging: bool = True
enable_loadbalancing_on_batch_endpoints: Optional[bool] = None
skip_batch_token_counting_providers: Optional[List[str]] = None
enable_caching_on_provider_specific_optional_params: bool = (
False # feature-flag for caching on optional params - e.g. 'top_k'
)
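For context, a minimal sketch of how a deployment could enable the new flag at the module level, mirroring how the tests added in this PR configure it (the provider name shown is illustrative):

```python
import litellm

# Skip batch token counting for providers whose batch input files are not
# practical to download (e.g. vertex_ai, where files live in GCS).
litellm.skip_batch_token_counting_providers = ["vertex_ai"]
```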
14 changes: 12 additions & 2 deletions litellm/proxy/hooks/batch_rate_limiter.py
@@ -244,14 +244,24 @@ async def count_input_file_usage(
) -> BatchFileUsage:
"""
Count number of requests and tokens in a batch input file.

Args:
file_id: The file ID to read
custom_llm_provider: The custom LLM provider to use for token encoding

Returns:
BatchFileUsage with total_tokens and request_count
"""
skip_providers = litellm.skip_batch_token_counting_providers or []
if custom_llm_provider in skip_providers:
verbose_proxy_logger.debug(
f"Skipping batch token counting for provider: {custom_llm_provider}"
)
return BatchFileUsage(
total_tokens=0,
request_count=0,
)

try:
# Read file content
file_content = await litellm.afile_content(
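A hedged sketch of how a caller would exercise the new skip path in `count_input_file_usage`; the constructor arguments mirror the tests below, and the GCS file id is illustrative:

```python
import asyncio

import litellm
from litellm.proxy.hooks.batch_rate_limiter import _PROXY_BatchRateLimiter


async def main() -> None:
    # With the provider in the skip list, the hook short-circuits and
    # returns zero usage without attempting to download the input file.
    litellm.skip_batch_token_counting_providers = ["vertex_ai"]
    limiter = _PROXY_BatchRateLimiter(
        internal_usage_cache=None,
        parallel_request_limiter=None,
    )
    usage = await limiter.count_input_file_usage(
        file_id="gs://example-bucket/batch_input.jsonl",  # illustrative file id
        custom_llm_provider="vertex_ai",
    )
    print(usage.total_tokens, usage.request_count)  # 0 0


asyncio.run(main())
```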
70 changes: 70 additions & 0 deletions tests/batches_tests/test_batch_rate_limits.py
@@ -389,3 +389,73 @@ async def test_batch_rate_limit_multiple_requests():
print(f" Error: {exc_info.value.detail}")
finally:
os.unlink(file_path_2)


@pytest.mark.asyncio()
async def test_skip_batch_token_counting_for_providers():
"""
Test that batch token counting can be skipped for configured providers.

When skip_batch_token_counting_providers includes a provider, the batch rate limiter
should return zero tokens and requests without attempting to download the file.
This is useful for providers like vertex_ai where batch files are stored in GCS
and downloading large files for token counting is impractical.
"""
import litellm

original_value = litellm.skip_batch_token_counting_providers

try:
litellm.skip_batch_token_counting_providers = ["vertex_ai"]

batch_limiter = _PROXY_BatchRateLimiter(
internal_usage_cache=None,
parallel_request_limiter=None,
)

result = await batch_limiter.count_input_file_usage(
file_id="gs://test-bucket/test.jsonl",
custom_llm_provider="vertex_ai",
)

assert result.total_tokens == 0, "Should return 0 tokens when provider is in skip list"
assert result.request_count == 0, "Should return 0 requests when provider is in skip list"
print("✓ Token counting skipped for vertex_ai provider")
finally:
litellm.skip_batch_token_counting_providers = original_value


@pytest.mark.asyncio()
async def test_skip_batch_token_counting_multiple_providers():
"""
    Test that multiple providers can be configured in the skip list.
"""
import litellm

original_value = litellm.skip_batch_token_counting_providers

try:
litellm.skip_batch_token_counting_providers = ["vertex_ai", "azure"]

batch_limiter = _PROXY_BatchRateLimiter(
internal_usage_cache=None,
parallel_request_limiter=None,
)

result_vertex = await batch_limiter.count_input_file_usage(
file_id="gs://test-bucket/test.jsonl",
custom_llm_provider="vertex_ai",
)
assert result_vertex.total_tokens == 0
assert result_vertex.request_count == 0

result_azure = await batch_limiter.count_input_file_usage(
file_id="azure-file-id",
custom_llm_provider="azure",
)
assert result_azure.total_tokens == 0
assert result_azure.request_count == 0

print("✓ Token counting skipped for multiple providers")
finally:
litellm.skip_batch_token_counting_providers = original_value