對話完成
主要端點。與 OpenAI Chat Completions API 相容。
/api/v1/chat/completions請求內文
請求結構 (TypeScript)
// Request body for POST /api/v1/chat/completions (OpenAI-compatible).
// Unsupported parameters are silently dropped before forwarding upstream.
type Request = {
  // Required
  model: string; // "provider/model-name"
  messages: Message[];
  // Common
  stream?: boolean; // Default: true
  temperature?: number; // Range: [0, 2], default: 0.7
  max_tokens?: number; // Range: [1, context_length)
  n?: number; // Default: 1
  seed?: number; // Integer seed for deterministic sampling
  stop?: string | string[];
  // Sampling
  top_p?: number; // Range: (0, 1]
  top_k?: number; // Integer. Default: 0 (disabled)
  frequency_penalty?: number; // Range: [-2, 2]
  presence_penalty?: number; // Range: [-2, 2]
  repetition_penalty?: number; // Range: (0, 2], default: 1
  min_p?: number; // Range: [0, 1]
  top_a?: number; // Range: [0, 1]
  // Logprobs
  logit_bias?: Record<number, number>; // Token ID → bias [-100, 100]
  logprobs?: boolean;
  top_logprobs?: number; // Range: [0, 20], requires logprobs: true
  // Tools & output
  tools?: Tool[];
  tool_choice?: ToolChoice;
  parallel_tool_calls?: boolean; // Default: true
  response_format?: ResponseFormat;
  // BazaarLink-only
  transforms?: string[]; // e.g. ["middle-out"]
  models?: string[]; // Fallback model list
  route?: "fallback";
  provider?: ProviderPreferences;
  debug?: {
    echo_upstream_body?: boolean; // Streaming only
  };
};
// A single conversation turn. A "tool" message carries a function result
// and must reference the tool call it answers via tool_call_id.
type Message =
| { role: "system" | "user" | "assistant"; content: string | ContentPart[] }
| { role: "tool"; content: string; tool_call_id: string };
// Multimodal content item for system/user/assistant messages.
type ContentPart =
| { type: "text"; text: string }
| { type: "image_url"; image_url: { url: string; detail?: string } };
// A callable function exposed to the model via the "tools" request field.
type Tool = {
type: "function";
function: {
name: string;
description?: string;
parameters: object; // JSON Schema
};
};
// Controls whether and which tool the model may call:
// "none" forbids calls, "auto" lets the model decide, "required" forces
// a call, and the object form forces one specific function.
type ToolChoice =
| "none" | "auto" | "required"
| { type: "function"; function: { name: string } };
// Output-format constraint: free-form JSON or a strict JSON Schema.
type ResponseFormat =
| { type: "json_object" }
| { type: "json_schema"; json_schema: { name: string; strict?: boolean; schema: object } };
// Upstream routing preferences, passed as the "provider" request field.
type ProviderPreferences = {
order?: string[]; // Provider slugs to try in order
only?: string[]; // Only allow these provider slugs
ignore?: string[]; // Skip these provider slugs
allow_fallbacks?: boolean; // Allow providers outside order/only (default: true)
sort?: "price" | "latency" | "throughput";
};範例請求
curl https://bazaarlink.ai/api/v1/chat/completions \
-H "Authorization: Bearer $BAZAARLINK_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "openai/gpt-4.1",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Explain quantum computing in one paragraph."}
],
"temperature": 0.7,
"max_tokens": 512
}'回應
{
"id": "chatcmpl-abc123",
"object": "chat.completion",
"created": 1740000000,
"model": "openai/gpt-4.1",
"choices": [
{
"index": 0,
"message": {
"role": "assistant",
"content": "Quantum computing leverages quantum mechanics..."
},
"finish_reason": "stop"
}
],
"usage": {
"prompt_tokens": 28,
"completion_tokens": 74,
"total_tokens": 102,
"cost": 0.0006480,
"prompt_tokens_details": {
"cached_tokens": 0
},
"completion_tokens_details": {
"reasoning_tokens": 0
}
}
}回應結構 (TypeScript)
// Top-level response object. "chat.completion" for non-streaming responses,
// "chat.completion.chunk" for each SSE event when stream: true.
type Response = {
id: string;
object: "chat.completion" | "chat.completion.chunk";
created: number; // Unix timestamp
model: string;
choices: (NonStreamingChoice | StreamingChoice)[];
usage?: ResponseUsage;
cost?: number; // Total cost in USD (also reported as usage.cost)
};
// One completion candidate in a non-streaming response.
type NonStreamingChoice = {
index: number;
finish_reason: "stop" | "length" | "tool_calls" | "content_filter" | null;
native_finish_reason: string | null; // Provider's original finish reason
message: {
role: "assistant";
content: string | null;
tool_calls?: ToolCall[]; // Present when the model invokes tools
};
};
// One incremental choice in an SSE stream; partial fields arrive in "delta".
type StreamingChoice = {
index: number;
finish_reason: string | null; // null until the final chunk
native_finish_reason: string | null; // Provider's original finish reason
delta: {
role?: string;
content?: string | null;
tool_calls?: ToolCall[];
};
};
// Token accounting for a request. The optional detail blocks are present
// when the upstream provider reports them.
type ResponseUsage = {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
cost: number; // Total cost for this request in USD
prompt_tokens_details?: {
cached_tokens: number; // Tokens served from prompt cache (reduced cost)
cache_write_tokens?: number; // Tokens written to cache in this request
audio_tokens?: number;
};
completion_tokens_details?: {
reasoning_tokens?: number; // Thinking/reasoning tokens (e.g. o3, Qwen3, DeepSeek R1)
image_tokens?: number;
};
};
// A tool invocation emitted by the model. "arguments" is a JSON-encoded
// string — callers must decode it (e.g. json.loads) before use.
type ToolCall = {
id: string;
type: "function";
function: { name: string; arguments: string };
};Responses API
相容 OpenAI Responses API 格式的端點,支援無狀態多輪對話、工具呼叫與多模態輸入。適用於使用 OpenAI Python SDK ≥ 1.x 的 client.responses.create() 的 Agent 框架。
/api/v1/responses請求內文
請求結構 (TypeScript)
// Request body for POST /api/v1/responses (OpenAI Responses API compatible).
type ResponsesRequest = {
model: string; // "provider/model-name"
input: string | InputItem[]; // string or multi-turn array
// Optional
instructions?: string; // System-level message
stream?: boolean; // Default: false
max_output_tokens?: number;
temperature?: number; // Range: [0, 2], default: 0.7
top_p?: number;
tools?: Tool[];
tool_choice?: "auto" | "none" | "required" | object;
parallel_tool_calls?: boolean; // Default: true
previous_response_id?: string; // Not supported — use full input array
provider?: ProviderPreferences; // Same as Chat Completions
};
// One item in a multi-turn "input" array: a message, a tool result, or a
// prior tool call replayed for stateless multi-turn conversations.
type InputItem =
| { type?: "message"; role: "user" | "assistant" | "system" | "developer"; content: string | ContentBlock[] }
| { type: "function_call_output"; call_id: string; output: string } // tool result
| { type: "function_call"; call_id: string; name: string; arguments: string };
// Multimodal content for Responses API input messages.
type ContentBlock =
| { type: "input_text"; text: string }
| { type: "input_image"; image_url: string; detail?: "auto" | "low" | "high" };範例請求
curl https://bazaarlink.ai/api/v1/responses \
-H "Authorization: Bearer $BAZAARLINK_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"model": "openai/gpt-4o-mini",
"instructions": "You are a helpful assistant.",
"input": "What is the capital of Taiwan?"
}'回應格式
// Non-streaming response object for POST /api/v1/responses.
type ResponsesResponse = {
id: string; // "resp_..."
object: "response";
created_at: number;
completed_at: number;
status: "completed" | "failed" | "incomplete";
model: string;
output: OutputItem[];
usage: {
input_tokens: number; // equivalent to prompt_tokens
output_tokens: number; // equivalent to completion_tokens
total_tokens: number;
cost?: number; // actual cost in credits
} | null;
error: null | { code: string; message: string };
};
// One output entry: an assistant message or a function call to execute.
type OutputItem =
| {
type: "message";
id: string;
role: "assistant";
status: "completed";
// NOTE(review): "annotations: []" types an always-empty tuple — confirm
// annotations can never be non-empty, otherwise widen the type.
content: Array<{ type: "output_text"; text: string; annotations: [] }>;
}
| { type: "function_call"; id: string; call_id: string; name: string; arguments: string; status: "completed" };從 Chat Completions 遷移
將 messages 改為 input(字串或陣列),以 instructions 取代 system 角色訊息,並從 output[0].content[0].text 讀取回應內容(原為 choices[0].message.content)。
# Chat Completions (before)
response = client.chat.completions.create(
model="openai/gpt-4o-mini",
messages=[
{"role": "system", "content": "You are helpful."},
{"role": "user", "content": "Hello"},
]
)
text = response.choices[0].message.content
# Responses API (after)
response = client.responses.create(
model="openai/gpt-4o-mini",
instructions="You are helpful.",
input="Hello"
)
text = response.output[0].content[0].text限制事項
- previous_response_id 會被接受但忽略 — 請使用無狀態模式(傳入完整 input 陣列)。
- 不支援內建工具(web_search_preview、file_search、computer_use_preview)。
- 不支援 background: true(非同步執行)。
模型
列出所有可用模型及其定價和能力資訊。 此端點不需要身份驗證。
/api/v1/modelscurl https://bazaarlink.ai/api/v1/models
回應
{
"data": [
{
"id": "openai/gpt-4.1",
"name": "GPT 4.1",
"context_length": 1047576,
"modality": "text+image+file->text",
"pricing": {
"prompt": "2.00",
"completion": "8.00"
}
}
]
}// /v1/models — Response Schema
// Response schema for GET /api/v1/models (no authentication required).
type ModelsResponse = {
data: Model[];
};
// Metadata, pricing and capabilities for one routable model.
type Model = {
id: string; // Model ID (e.g. "openai/gpt-4.1")
name: string; // Human-readable name
context_length: number | null; // Max context window in tokens
modality: string | null; // e.g. "text->text", "text+image->text"
pricing: {
prompt: string; // Input price per 1M tokens (USD)
completion: string; // Output price per 1M tokens (USD)
};
description?: string | null; // Model description
top_provider?: {
max_completion_tokens?: number;
};
supported_parameters?: string[]; // e.g. ["tools", "response_format", "reasoning"]
};串流
設定 stream: true 以接收 Server-Sent Events (SSE) 串流。每個事件包含一個回應片段。
from openai import OpenAI
client = OpenAI(
base_url="https://bazaarlink.ai/api/v1",
api_key="sk-bl-YOUR_API_KEY",
)
stream = client.chat.completions.create(
model="anthropic/claude-sonnet-4.6",
messages=[{"role": "user", "content": "Count to 10 slowly."}],
stream=True,
)
for chunk in stream:
content = chunk.choices[0].delta.content
if content:
print(content, end="", flush=True)SSE 格式
data: {"id":"chatcmpl-abc","choices":[{"delta":{"content":"Hello"},"index":0}]}
data: {"id":"chatcmpl-abc","choices":[{"delta":{"content":" world"},"index":0}]}
data: {"id":"chatcmpl-abc","choices":[{"delta":{},"finish_reason":"stop","index":0}],"usage":{"prompt_tokens":10,"completion_tokens":4,"total_tokens":14}}
data: [DONE]嵌入向量
生成與 OpenAI Embeddings API 相容的文字嵌入向量。
/api/v1/embeddingsfrom openai import OpenAI
client = OpenAI(
base_url="https://bazaarlink.ai/api/v1",
api_key="sk-bl-YOUR_API_KEY",
)
response = client.embeddings.create(
model="openai/text-embedding-3-small",
input="The quick brown fox jumps over the lazy dog",
)
print(response.data[0].embedding) # 1536-dimensional vector專用參數
取樣參數影響 token 產生過程。BazaarLink 會將支援的參數傳遞給上游 provider;不支援的參數會被靜默忽略。
取樣參數
BazaarLink 專屬參數
工具呼叫
工具呼叫(也稱為函式呼叫)讓模型可以呼叫您定義的外部函式。模型會決定何時呼叫工具並產生結構化參數 — 您的程式碼負責執行函式並將結果回傳以繼續對話。
支援的模型
大多數前沿模型都支援工具呼叫。以下是一些熱門選擇:
定義工具
每個工具是一個描述模型可呼叫函式的 JSON 物件。parameters 欄位使用 JSON Schema。
tool_choice 選項
完整流程
工具呼叫是一個多輪流程:(1) 帶工具發送請求 → (2) 模型回傳 tool_calls → (3) 執行函式 → (4) 回傳結果 → (5) 模型生成最終回應。
import json
from openai import OpenAI

client = OpenAI(
    base_url="https://bazaarlink.ai/api/v1",
    api_key="sk-bl-YOUR_API_KEY",
)

# Step 1: Define tools and send request.
# Each tool describes one callable function; "parameters" is JSON Schema.
tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string", "description": "City name"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
            },
            "required": ["city"]
        }
    }
}]
response = client.chat.completions.create(
    model="openai/gpt-4.1",
    messages=[{"role": "user", "content": "What's the weather in Taipei?"}],
    tools=tools,
    tool_choice="auto",  # let the model decide whether to call a tool
)

# Step 2: Check for tool calls
message = response.choices[0].message
if message.tool_calls:
    tool_call = message.tool_calls[0]
    # "arguments" is a JSON-encoded string — decode it before use.
    args = json.loads(tool_call.function.arguments)

    # Step 3: Execute your function
    result = {"temperature": 28, "unit": "celsius", "condition": "Partly cloudy"}

    # Step 4: Send the result back. Echo the assistant message that carried
    # tool_calls, then append one "tool" message referencing tool_call.id.
    final = client.chat.completions.create(
        model="openai/gpt-4.1",
        messages=[
            {"role": "user", "content": "What's the weather in Taipei?"},
            message,
            {"role": "tool", "tool_call_id": tool_call.id, "content": json.dumps(result)},
        ],
        tools=tools,
    )

    # Step 5: Get final response
    print(final.choices[0].message.content)
    # "The weather in Taipei is 28°C and partly cloudy."平行工具呼叫
某些模型可以在單一回應中呼叫多個工具。處理每個工具呼叫並回傳所有結果:
# Model may return multiple tool_calls
if message.tool_calls:
messages = [
{"role": "user", "content": "Weather and time in Tokyo?"},
message,
]
for tool_call in message.tool_calls:
# Execute each function
if tool_call.function.name == "get_weather":
result = {"temperature": 22, "condition": "Clear"}
elif tool_call.function.name == "get_time":
result = {"time": "2026-02-23T15:30:00+09:00"}
messages.append({
"role": "tool",
"tool_call_id": tool_call.id,
"content": json.dumps(result),
})
# Send all results back at once
final = client.chat.completions.create(
model="openai/gpt-4.1",
messages=messages,
tools=tools,
)
print(final.choices[0].message.content)結構化輸出
強制模型返回符合 Schema 的有效 JSON。這對於建立需要程式化解析模型輸出的可靠應用程式至關重要。
方法 1:response_format(JSON Schema)
以強制嚴格的 JSON Schema 合規性:
# Enforce a strict JSON Schema on the model output via response_format.
import json

response = client.chat.completions.create(
    model="openai/gpt-4.1",
    messages=[{"role": "user", "content": "Review the movie Inception"}],
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "movie_review",
            "strict": True,  # guarantee schema compliance
            "schema": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "rating": {"type": "integer", "description": "Rating 1-10"},
                    "summary": {"type": "string"},
                    "pros": {"type": "array", "items": {"type": "string"}},
                    "cons": {"type": "array", "items": {"type": "string"}},
                },
                "required": ["title", "rating", "summary", "pros", "cons"],
                "additionalProperties": False,
            },
        },
    },
)
# With strict mode the content is schema-valid JSON — parse it directly.
review = json.loads(response.choices[0].message.content)
print(review["title"]) # "Inception"
print(review["rating"]) # 9提示
- 使用清晰、描述性的屬性名稱 — 模型會將其作為上下文。
- 為 Schema 屬性添加描述來引導模型。
- 設定 strict: true 以保證 Schema 合規(可能略微增加延遲)。
- 保持 Schema 簡單 — 深度巢狀的 Schema 可能降低輸出品質。
- 使用不同模型測試 — 某些模型處理複雜 Schema 的能力更強。
助手預填
透過在訊息陣列末尾加入部分 assistant 訊息,引導模型以特定方式回應。模型會從您中斷的地方繼續。
# Prefill: end the messages array with a partial assistant turn;
# the model continues from where the text left off.
response = client.chat.completions.create(
model="anthropic/claude-sonnet-4.6",
messages=[
{"role": "user", "content": "What is the capital of France?"},
{"role": "assistant", "content": "The capital of France is"},
],
)
# Model continues: " Paris, known for the Eiffel Tower..."
print(response.choices[0].message.content)模型路由
BazaarLink 使用 provider/model-name 格式將請求路由到正確的上游供應商。這讓您可以透過單一 API 端點存取 200+ 個模型。
模型 ID 格式
{provider}/{model-name}
# Examples:
openai/gpt-4.1
anthropic/claude-sonnet-4.6
google/gemini-2.5-flash
deepseek/deepseek-chat
meta-llama/llama-4-maverick路由優先順序
當您發送請求時,BazaarLink 依以下順序解析上游供應商:
- 精確匹配 — 尋找與完整模型 ID 匹配的模型路由
- 供應商萬用字元 — 回退至 provider/* 路由(例如 openai/*)
- 全域萬用字元 — 回退至 * 萬用字元路由
- 預設供應商 — 使用標記為預設的供應商金鑰
- 環境回退 — 使用配置的 API 金鑰作為最後手段
在 模型頁面瀏覽所有可用模型。
故障轉移
當供應商發生故障或返回錯誤時,BazaarLink 可以自動使用替代模型重試您的請求。這確保了高可用性,無需對您的程式碼做任何更改。
運作方式
- 您的請求會發送到主要模型。
- 如果主要模型失敗(5xx 錯誤、逾時或速率限制),BazaarLink 會自動使用列表中的下一個模型重試。
- 這個過程會持續到某個模型成功或所有模型都已嘗試。
- 回應會包含一個標頭,指示實際服務請求的模型。
最佳實踐
- 按偏好順序排列模型 — 第一個模型始終最先被嘗試。
- 混合不同供應商以獲得最大彈性(例如 OpenAI → Anthropic → Google)。
- 使用能力相似的模型以確保一致的結果。
- 設定合理的逾時時間以避免在備用觸發前等待過久。
- 監控 X-Fallback-Used 標頭以追蹤供應商可靠性。
供應商選擇
控制路由請求時使用的供應商。在請求主體中加入 provider 物件來自訂路由行為。BazaarLink 會根據上游供應商的支援情況,在本地或原生應用您的偏好設定。
支援欄位
| Field | Type | Description |
|---|---|---|
| order | string[] | Provider slugs to try in order |
| allow_fallbacks | boolean | Allow providers outside order/only as fallbacks (default: true) |
| only | string[] | Only allow these provider slugs |
| ignore | string[] | Skip these provider slugs |
| sort | string | object | "price" | "throughput" | "latency" (or { by, partition }) |
| quantizations | string[] | 按量化等級過濾 |
| data_collection | string | "allow" | "deny" |
| require_parameters | boolean | 僅使用支援所有參數的供應商 |
| max_price | object | { prompt, completion } 每百萬 token 最高價格 |
| zdr | boolean | 僅路由至零資料保留端點 |
| enforce_distillable_text | boolean | 僅路由至允許文字蒸餾的模型 |
| preferred_min_throughput | number | object | 首選最小吞吐量(tokens/秒) |
| preferred_max_latency | number | object | 首選最大延遲(秒) |
排序供應商
使用 order 欄位指定優先嘗試的供應商。不在列表中的供應商將作為回退選項(除非 allow_fallbacks 為 false)。
{
"model": "meta-llama/llama-4-maverick",
"messages": [{"role": "user", "content": "Hello"}],
"provider": {
"order": ["together", "fireworks"]
}
}過濾供應商
使用 only 來白名單指定供應商,或使用 ignore 來黑名單排除。這些過濾器會在排序之前應用。
// Only use specific providers
{
"model": "openai/gpt-4o",
"provider": { "only": ["openai"] }
}
// Skip a provider
{
"model": "openai/gpt-4o",
"provider": { "ignore": ["deepinfra"] }
}按價格排序
設定 sort 為 'price' 以自動路由到最便宜的可用供應商。throughput 和 latency 排序也會傳遞至支援的上游。
{
"model": "meta-llama/llama-4-maverick",
"messages": [{"role": "user", "content": "Hello"}],
"provider": {
"sort": "price"
}
}停用回退
將 allow_fallbacks 設為 false 以嚴格限制路由到您指定的或白名單中的供應商。若無可用供應商,請求將失敗而非回退。
{
"model": "openai/gpt-4o",
"messages": [{"role": "user", "content": "Hello"}],
"provider": {
"order": ["openai", "azure"],
"allow_fallbacks": false
}
}進階參數透傳
進階供應商參數(包括 quantizations、data_collection、require_parameters、max_price 等)會傳遞至支援的上游供應商原生處理。
// Advanced provider selection example
{
"model": "deepseek/deepseek-v3.2",
"messages": [{"role": "user", "content": "Hello"}],
"provider": {
"order": ["deepinfra", "together"],
"sort": "throughput",
"quantizations": ["fp8"],
"data_collection": "deny",
"require_parameters": true,
"allow_fallbacks": true
}
}模型變體
在任何模型 ID 後加上後綴來改變路由行為。BazaarLink 支援 7 種變體類型。
變體類型
有兩類變體:獨立模型 ID(帶後綴的模型是獨立的端點)和路由捷徑(後綴改變 BazaarLink 選擇供應商的方式,但不改變模型本身)。
獨立模型 ID
這些變體作為獨立模型存在,各自擁有定價和功能。BazaarLink 優先嘗試完整模型 ID(含後綴),若無匹配則回退至基礎模型。
| Suffix | Description | Example |
|---|---|---|
| :free | Free-tier version (rate-limited) | deepseek/deepseek-r1:free |
| :extended | Extended context window | anthropic/claude-sonnet-4.5:extended |
| :thinking | Extended reasoning / chain-of-thought | deepseek/deepseek-r1:thinking |
| :exacto | Curated providers for tool-calling accuracy | moonshotai/kimi-k2-0905:exacto |
路由捷徑
這些後綴修改供應商選擇方式,不改變模型身份。路由匹配前會先去除後綴。
| Suffix | Equivalent | Behaviour |
|---|---|---|
| :nitro | provider.sort="throughput" | Prioritise highest throughput providers |
| :floor | provider.sort="price" | Sort candidates by price ASC (cheapest first) |
| :online | plugins: { web: {} } | 啟用即時網路搜尋 |
多供應商行為
對於支援變體的上游供應商,後綴會原樣傳遞。對於直連供應商(如直連 OpenAI、Fireworks),後綴會被去除,由 BazaarLink 在本地處理路由。
範例
// Independent variant — use free tier
{
"model": "deepseek/deepseek-r1:free",
"messages": [{"role": "user", "content": "Hello"}]
}
// Routing shortcut — cheapest provider first
{
"model": "meta-llama/llama-4-maverick:floor",
"messages": [{"role": "user", "content": "Hello"}]
}
// Routing shortcut — highest throughput
{
"model": "openai/gpt-4o:nitro",
"messages": [{"role": "user", "content": "Hello"}]
}
// Web search
{
"model": "openai/gpt-4o:online",
"messages": [{"role": "user", "content": "What happened today?"}]
}訊息轉換
自動轉換訊息以符合模型上下文限制。當您的訊息超過模型的上下文窗口時,轉換會從對話中間移除訊息,以智慧地壓縮對話。
用法
// Enable middle-out on any model
{
"model": "openai/gpt-4.1",
"transforms": ["middle-out"],
"messages": [
{ "role": "system", "content": "You are a helpful assistant." },
... // long conversation — middle will be trimmed to fit context
]
}
// Disable auto-trimming for small-context models
{ "transforms": [] }轉換類型
預設行為
上下文 ≤ 8k 的模型預設啟用 middle-out。較大上下文的模型需明確傳入 `transforms: ["middle-out"]` 才會啟用。Anthropic Claude 模型無論 transforms 設定為何,均自動強制執行 1,000 則訊息上限。
零完成保險 BETA
針對請求完全失敗(上游無法建立連線)的情況提供計費保護。串流從未開始即失敗時不會收費。
已保障情境
- 上游拒絕連線 / 回傳空 body — 全額退款
- 串流從未開始即失敗 — 全額退款
未涵蓋情境
串流已開始後中途中斷:收取預留金的 10% 最低費用。模型回傳 0 output tokens(空內容):仍依 input token 計費。
安全護欄 BETA
為您的 API 請求新增安全護欄,過濾有害內容、執行合規政策並保護您的應用程式。
規劃中功能
目前行為
所有上游供應商都有自己的內容安全系統。觸發內容過濾的模型回應將返回 finish_reason: "content_filter"。自訂護欄配置將在未來更新中提供。
零資料保留
BazaarLink 預設不儲存您的訊息內容。本頁說明您的資料處理方式,適用於處理敏感資料的應用程式。
目前的資料處理方式
- 訊息內容:預設不儲存,在記憶體中處理後立即丟棄
- 計費元資料:token 數量、時間戳記、模型 ID(保留 90 天)
- 使用日誌:請求統計,不含訊息內容
- 上游轉發:訊息轉發至上游供應商,受其隱私政策約束
提示快取
提示快取可以重用之前計算過的 prompt tokens,顯著降低成本並減少延遲,特別適合有大量重複系統提示的應用程式。
運作方式
快取由各模型供應商在後端自動處理,無需額外設定。BazaarLink 透明代理快取相關參數,並在使用量回應中回報結果。支援快取的模型在相同前綴重複出現時,讀取 tokens 的費用通常為正常費率的 10–50%。
# Repeated long prefixes (e.g. a stable system prompt) are cached by the
# upstream provider; cache hits are reported in prompt_tokens_details.
response = client.chat.completions.create(
model="anthropic/claude-3-7-sonnet",
messages=[
{"role": "system", "content": "You are an expert..."}, # Long system prompt cached
{"role": "user", "content": "Question here"},
],
)
# Check cache savings in the response usage
usage = response.usage
print(f"Prompt tokens: {usage.prompt_tokens}")
print(f"Cached tokens: {usage.prompt_tokens_details.cached_tokens}")
print(f"Cache savings: {usage.prompt_tokens_details.cached_tokens / usage.prompt_tokens * 100:.1f}%")推理 Tokens
推理模型(如 DeepSeek R1、o1 系列)在生成最終答案之前,會先在內部進行思考。這些思考過程消耗的 tokens 稱為推理 tokens,會分開計費。
在回應中讀取推理 Tokens
response = client.chat.completions.create(
model="deepseek/deepseek-r1",
messages=[{"role": "user", "content": "Solve: if f(x) = x^2 + 3x, what is f(5)?"}],
)
# Read reasoning tokens from usage
usage = response.usage
print(f"Completion tokens: {usage.completion_tokens}")
if hasattr(usage, "completion_tokens_details"):
details = usage.completion_tokens_details
print(f"Reasoning tokens: {details.reasoning_tokens}")
print(f"Output tokens: {details.accepted_prediction_tokens}")const response = await client.chat.completions.create({
model: "openai/o3-mini",
messages: [{ role: "user", content: "Prove that sqrt(2) is irrational." }],
// @ts-ignore - BazaarLink extension
reasoning_effort: "high", // low | medium | high
});
const usage = response.usage;
console.log("Reasoning tokens:", usage?.completion_tokens_details?.reasoning_tokens);Thinking Mode Control
Some models support toggling their "thinking" mode. Thinking mode generates internal reasoning tokens before producing the final answer, improving quality at the cost of more tokens.
| Model Family | Parameter | Default |
|---|---|---|
| qwen3-* | enable_thinking: boolean | false (platform default) |
| openai/o1, o3, o4-mini | reasoning_effort: "low" | "medium" | "high" | medium |
| deepseek/deepseek-r1 | — | Always enabled (cannot disable) |
# Qwen3: explicitly enable thinking mode
response = client.chat.completions.create(
model="qwen/qwen3-32b",
messages=[{"role": "user", "content": "Prove the Pythagorean theorem"}],
extra_body={"enable_thinking": True}, # opt-in to thinking
)
# usage.completion_tokens_details.reasoning_tokens shows thinking token countUnified reasoning Object (New Format)
BazaarLink also supports the unified reasoning object, which works across all model families with a single consistent API:
| Field | Values | Applies to |
|---|---|---|
| reasoning.effort | "xhigh" | "high" | "medium" | "low" | "none" | OpenAI o-series, Grok |
| reasoning.max_tokens | integer | Anthropic Claude, Gemini |
| reasoning.exclude | boolean | Hide thinking from response (model still reasons) |
// Claude extended thinking — specify thinking budget in tokens
const response = await client.chat.completions.create({
model: "anthropic/claude-sonnet-4-5",
messages: [{ role: "user", content: "Prove the Pythagorean theorem" }],
// @ts-ignore - BazaarLink extension
reasoning: { max_tokens: 5000 },
});
// OpenAI o3 — specify effort level
const response2 = await client.chat.completions.create({
model: "openai/o3",
messages: [{ role: "user", content: "Solve this math problem..." }],
// @ts-ignore - BazaarLink extension
reasoning: { effort: "high" },
});
// Hide thinking content from response (model still thinks)
const response3 = await client.chat.completions.create({
model: "anthropic/claude-sonnet-4-5",
messages: [{ role: "user", content: "What is 2+2?" }],
// @ts-ignore - BazaarLink extension
reasoning: { max_tokens: 2000, exclude: true },
});延遲與效能
優化 AI API 的回應延遲對用戶體驗至關重要。以下是 BazaarLink 架構中影響延遲的關鍵因素及最佳化建議。
影響延遲的因素
- 模型大小:較大的模型(70B+)通常生成速度較慢
- 提供商負載:不同時段不同供應商的負載有所差異
- Token 數量:max_tokens 越大,完成時間越長
- 串流 vs 非串流:串流(stream: true)可更快取得第一個 token
- 上下文長度:超長 context 會增加前置處理時間
最佳化建議
- 優先使用串流(stream: true)以改善感知延遲
- 使用 :nitro 變體選擇高吞吐量供應商
- 對延遲敏感的場景選擇較小的模型(flash/mini/haiku)
- 使用 provider.sort: "latency" 自動選擇最低延遲供應商
- 啟用提示快取以降低重複請求的延遲
import time
# Measure time to first token with streaming
start = time.time()
first_token_time = None
stream = client.chat.completions.create(
model="google/gemini-2.0-flash-001", # Fast model
messages=[{"role": "user", "content": "Hello!"}],
stream=True,
)
for chunk in stream:
if chunk.choices and chunk.choices[0].delta.content and not first_token_time:
first_token_time = time.time() - start
print(f"Time to first token: {first_token_time:.3f}s")
# Check latency in usage logs via /api/v1/usage
# Each log entry includes: latency_ms, throughput (tokens/sec)# Use provider.sort for automatic latency optimization
response = client.chat.completions.create(
model="openai/gpt-4o",
messages=[{"role": "user", "content": "Hello!"}],
extra_body={
"provider": {
"sort": "latency", # Always pick lowest-latency provider
}
},
)可用性優化
BazaarLink 透過多層機制最大化 API 可用性,包括自動故障轉移、熔斷器和供應商健康監控。
可用性機制
- 熔斷器:自動偵測並隔離故障供應商
- 自動故障轉移:無縫切換至備用供應商,無需修改程式碼
- 供應商健康監控:持續追蹤各供應商的錯誤率和延遲
- 重試邏輯:暫時性錯誤(5xx)自動重試
熔斷器配置
# BazaarLink handles failover automatically — no code changes needed.
# Configure fallback models for maximum resilience:
response = client.chat.completions.create(
model="openai/gpt-4o", # Primary model
messages=[{"role": "user", "content": "Hello!"}],
extra_body={
"models": [ # Fallback chain
"openai/gpt-4o",
"anthropic/claude-3.5-sonnet",
"google/gemini-2.0-flash-001",
],
"route": "fallback", # Enable fallback routing
},
)
# Check if failover was used (in usage logs)
# "is_failover": true indicates the primary provider was bypassed# Check provider health (admin only)
GET https://bazaarlink.ai/api/admin/provider-health
Authorization: Bearer sk-bl-ADMIN_KEY
# Response
{
"providers": [
{
"id": "provider-1",
"name": "Anthropic",
"status": "healthy",
"error_rate": 0.002,
"avg_latency_ms": 145,
"circuit_open": false
}
]
}錯誤代碼
BazaarLink 使用標準 HTTP 狀態碼。錯誤回應遵循 OpenAI 格式:
{
"error": {
"message": "Invalid or disabled API key.",
"type": "invalid_request_error",
"code": 401
}
}錯誤處理
from openai import OpenAI, APIError, RateLimitError
client = OpenAI(
base_url="https://bazaarlink.ai/api/v1",
api_key="sk-bl-YOUR_API_KEY",
)
try:
response = client.chat.completions.create(
model="openai/gpt-4.1",
messages=[{"role": "user", "content": "Hello!"}],
)
except RateLimitError:
print("Rate limited — waiting before retry...")
except APIError as e:
print(f"API error {e.status_code}: {e.message}")串流錯誤格式
在任何 token 串流之前發生的錯誤,會以標準 HTTP 錯誤回應(JSON body)回傳。
串流過程中發生的錯誤,會以 SSE 事件形式傳送,finish_reason 為 "error"。請解析 delta 中的 error 欄位。
// Error chunk sent mid-stream (finish_reason: "error"). Shape of the SSE
// payload emitted when an error occurs after streaming has already begun;
// errors before the first token use a plain HTTP error response instead.
type MidStreamError = {
choices: [
{
index: 0;
finish_reason: "error";
delta: { content: "" };
native_finish_reason: null;
error: {
code: number;
message: string;
metadata?: {
provider_name?: string; // Upstream provider that failed
raw?: unknown; // Raw upstream error payload, shape not specified
};
};
}
];
};除錯
設定 debug.echo_upstream_body: true 可檢視實際傳送給上游 provider 的請求 body。轉換後的請求會作為第一個 SSE chunk 回傳。僅供開發與除錯使用,請勿在正式環境使用。
// Request with debug enabled (streaming only)
{
"model": "openai/gpt-4.1",
"messages": [{ "role": "user", "content": "Hello" }],
"stream": true,
"debug": { "echo_upstream_body": true }
}