Evaluation API Reference
The Evaluation API enables you to create, manage, and execute evaluation suites to test your AI peers' performance.
Important: Evaluation suites are workspace-level resources that are independent of specific peers. When running an evaluation, you associate a suite with a peer to test.
This API supports comprehensive testing workflows including question management, evaluator configuration, and result analysis.
Base URL
https://api.cognipeer.com/api/v1/evaluation
Authentication
All API requests require authentication using an API key:
Authorization: Bearer YOUR_API_KEY
Evaluation Suites
Evaluation suites are workspace-level resources that can be used to test any peer.
Create Evaluation Suite
Create a new evaluation suite. The suite is not tied to a specific peer.
POST /api/v1/evaluation/suite
Request Body:
{
"name": "Customer Support Quality Test",
"description": "Tests support peer responses for accuracy and helpfulness",
"evaluatorConfig": {
"exactMatch": {
"enabled": true,
"config": {
"caseSensitive": false,
"ignoreWhitespace": true
}
},
"llmJudge": {
"enabled": true,
"config": {
"modelId": "chatgpt-4o-mini",
"criteria": ["accuracy", "helpfulness", "clarity"],
"strictness": "moderate"
}
},
"semanticSimilarity": {
"enabled": true,
"config": {
"threshold": 0.8,
"modelId": "text-embedding-3-small"
}
}
},
"tags": ["support", "quality", "v1"]
}
Response:
{
"success": true,
"data": {
"id": "suite_xyz789",
"name": "Customer Support Quality Test",
"description": "Tests support peer responses for accuracy and helpfulness",
"evaluatorConfig": { /* ... */ },
"tags": ["support", "quality", "v1"],
"totalQuestions": 0,
"createdAt": "2025-10-20T10:30:00Z",
"updatedAt": "2025-10-20T10:30:00Z"
}
}
Note: Evaluation suites are workspace-level and not tied to a specific peer. You select the peer when running the evaluation.
List Evaluation Suites
Get all evaluation suites in your workspace.
GET /api/v1/evaluation?peerId=peer_abc123&page=1&limit=20
Query Parameters:
| Parameter | Type | Description |
|---|---|---|
| peerId | string | Filter by peer ID |
| page | number | Page number (default: 1) |
| limit | number | Results per page (default: 20, max: 100) |
| search | string | Search in suite name/description |
Response:
{
"success": true,
"data": {
"suites": [
{
"id": "suite_xyz789",
"name": "Customer Support Quality Test",
"peerId": "peer_abc123",
"questionCount": 25,
"lastRunAt": "2025-10-20T09:00:00Z",
"averageScore": 0.85
}
],
"pagination": {
"page": 1,
"limit": 20,
"total": 5,
"pages": 1
}
}
}
Get Evaluation Suite
Retrieve details of a specific evaluation suite.
GET /api/v1/evaluation/:suiteId
Response:
{
"success": true,
"data": {
"id": "suite_xyz789",
"name": "Customer Support Quality Test",
"description": "Tests support peer responses for accuracy and helpfulness",
"peerId": "peer_abc123",
"evaluators": { /* ... */ },
"questionCount": 25,
"questions": [ /* ... */ ],
"runs": [ /* recent runs */ ],
"createdAt": "2025-10-20T10:30:00Z",
"updatedAt": "2025-10-20T10:30:00Z"
}
}
Update Evaluation Suite
Update an existing evaluation suite.
PUT /api/v1/evaluation/:suiteId
Request Body:
{
"name": "Updated Suite Name",
"description": "Updated description",
"evaluators": { /* updated evaluator config */ }
}
Delete Evaluation Suite
Delete an evaluation suite and all associated data.
DELETE /api/v1/evaluation/:suiteId
Response:
{
"success": true,
"message": "Evaluation suite deleted successfully"
}
Clone Evaluation Suite
Create a copy of an existing evaluation suite.
POST /api/v1/evaluation/:suiteId/clone
Request Body:
{
"name": "Cloned Suite Name",
"peerId": "peer_abc123" // Optional: assign to different peer
}
Questions
Add Question
Add a single question to an evaluation suite.
POST /api/v1/evaluation/:suiteId/questions
Request Body:
{
"question": "What are your business hours?",
"expectedAnswer": "We are open Monday-Friday, 9 AM to 6 PM EST.",
"context": "", // Optional
"tags": ["support", "hours"], // Optional
"metadata": {} // Optional
}
Response:
{
"success": true,
"data": {
"id": "question_123",
"question": "What are your business hours?",
"expectedAnswer": "We are open Monday-Friday, 9 AM to 6 PM EST.",
"context": "",
"tags": ["support", "hours"],
"metadata": {},
"createdAt": "2025-10-20T10:35:00Z"
}
}
Import Questions
Bulk import questions from CSV or JSON.
POST /api/v1/evaluation/:suiteId/questions/import
Request Body (CSV):
{
"format": "csv",
"data": "base64_encoded_csv_content"
}
CSV Format:
question,expectedAnswer,context,tags
"What are your hours?","Mon-Fri 9-6 EST","","support,hours"
Request Body (JSON):
{
"format": "json",
"data": {
"questions": [
{
"question": "What are your hours?",
"expectedAnswer": "Mon-Fri 9-6 EST",
"context": "",
"tags": ["support", "hours"]
}
]
}
}
Response:
{
"success": true,
"data": {
"imported": 25,
"failed": 0,
"errors": []
}
}
List Questions
Get all questions in an evaluation suite.
GET /api/v1/evaluation/:suiteId/questions?page=1&limit=50
Response:
{
"success": true,
"data": {
"questions": [
{
"id": "question_123",
"question": "What are your hours?",
"expectedAnswer": "Mon-Fri 9-6 EST",
"tags": ["support", "hours"]
}
],
"pagination": {
"page": 1,
"limit": 50,
"total": 25,
"pages": 1
}
}
}
Update Question
Update an existing question.
PUT /api/v1/evaluation/:suiteId/questions/:questionId
Delete Question
Remove a question from the suite.
DELETE /api/v1/evaluation/:suiteId/questions/:questionId
Evaluation Runs
Evaluation runs associate a suite with a specific peer for testing.
Execute Evaluation
Run an evaluation suite against a selected peer.
POST /api/v1/evaluation/run
Request Body:
{
"evaluationSuiteId": "suite_xyz789", // Required: suite to run
"peerId": "peer_abc123", // Required: peer to test
"peerVersion": 3, // Optional: specific version
"description": "Weekly quality check", // Optional
"sampleSize": null // Optional: limit number of questions
}
Important: The peerId is required when running an evaluation. This allows you to test the same suite against different peers or versions.
Response:
{
"success": true,
"data": {
"runId": "run_456",
"evaluationSuiteId": "suite_xyz789",
"peerId": "peer_abc123",
"peerVersion": 3,
"status": "running",
"startedAt": "2025-10-20T11:00:00Z",
"progress": {
"total": 25,
"completed": 0,
"percentage": 0
}
}
}
Get Run Status
Check the status of a running evaluation.
GET /api/v1/evaluation/run/:runId
Response:
{
"success": true,
"data": {
"id": "run_456",
"suiteId": "suite_xyz789",
"status": "completed",
"startedAt": "2025-10-20T11:00:00Z",
"completedAt": "2025-10-20T11:05:32Z",
"progress": 100,
"totalQuestions": 25,
"processedQuestions": 25,
"results": {
"averageScore": 0.85,
"passRate": 0.88,
"evaluatorScores": {
"exactMatch": 0.72,
"llmJudge": 0.90,
"semanticSimilarity": 0.92
}
}
}
}
Get Run Results
Get detailed results for an evaluation run.
GET /api/v1/evaluation/:suiteId/runs/:runId/results?page=1&limit=50
Response:
{
"success": true,
"data": {
"results": [
{
"questionId": "question_123",
"question": "What are your hours?",
"expectedAnswer": "Mon-Fri 9-6 EST",
"actualAnswer": "Our business hours are Monday through Friday, 9 AM to 6 PM Eastern Time.",
"scores": {
"exactMatch": 0.0,
"llmJudge": 0.95,
"semanticSimilarity": 0.98
},
"passed": true,
"evaluatorDetails": {
"llmJudge": {
"reasoning": "The answer is accurate and complete, providing the same information in a clear, professional manner."
}
}
}
],
"pagination": { /* ... */ }
}
}
List Runs
Get all evaluation runs for a suite.
GET /api/v1/evaluation/:suiteId/runs?page=1&limit=20
AI Analysis
Request Analysis
Get AI-powered improvement suggestions based on evaluation results.
POST /api/v1/evaluation/:runId/suggest-improvements
Request Body (Optional):
{
"focus": "accuracy", // or "speed", "cost", "tone"
"constraints": {
"maxTemperature": 0.5,
"preferredTools": ["datasource"],
"maintainTone": true
}
}
Response:
{
"success": true,
"data": {
"analysis": {
"summary": "Your peer is struggling with customer support scenarios...",
"overallImprovement": "+15-20%",
"suggestions": [
{
"id": "sugg_1",
"category": "prompt",
"priority": "high",
"title": "Enhance System Prompt for Product Support",
"description": "Add product support guidelines to improve accuracy...",
"changes": {
"type": "prompt",
"action": "append",
"content": "When discussing products: 1. Always reference..."
},
"expectedImpact": "+12%"
}
]
}
}
}
Apply Improvements
Apply AI suggestions to a peer.
POST /api/v1/peer/:peerId/apply-improvements
Request Body:
{
"suggestions": ["sugg_1", "sugg_2"],
"preview": false
}
Response:
{
"success": true,
"data": {
"applied": 2,
"changes": {
"prompt": "Updated system prompt content...",
"tools": ["datasource-tool-id"],
"settings": {
"temperature": 0.3
}
},
"backup": { /* previous configuration */ }
}
}
WebSocket Events
Subscribe to real-time evaluation progress updates.
Connect
const socket = io('https://api.cognipeer.com', {
auth: { token: 'YOUR_API_KEY' }
});
Events
evaluation.progress
Fired during evaluation execution:
socket.on('evaluation.progress', (data) => {
console.log(data);
// {
// runId: 'run_456',
// progress: 45,
// processedQuestions: 11,
// totalQuestions: 25,
// currentScore: 0.82
// }
});
evaluation.completed
Fired when evaluation finishes:
socket.on('evaluation.completed', (data) => {
console.log(data);
// {
// runId: 'run_456',
// status: 'completed',
// results: { /* summary */ }
// }
});
evaluation.failed
Fired if evaluation fails:
socket.on('evaluation.failed', (data) => {
console.log(data);
// {
// runId: 'run_456',
// error: 'Error message'
// }
});
Error Responses
Common Errors
400 Bad Request
{
"success": false,
"error": "Validation failed",
"details": {
"field": "evaluators.llmJudge.config.modelId",
"message": "Invalid model ID"
}
}
404 Not Found
{
"success": false,
"error": "Evaluation suite not found"
}
429 Too Many Requests
{
"success": false,
"error": "Rate limit exceeded",
"retryAfter": 60
}
Rate Limits
| Endpoint | Limit |
|---|---|
| Create/Update Suite | 60/hour |
| Import Questions | 20/hour |
| Execute Evaluation | 30/hour |
| Get Results | 300/hour |
Best Practices
- Batch Operations: Use import for adding multiple questions
- WebSocket for Progress: Subscribe to real-time updates for long-running evaluations
- Pagination: Use pagination for large result sets
- Error Handling: Always check the `success` field and handle errors gracefully
- Rate Limiting: Implement exponential backoff for rate limit errors
Related Documentation
- Evaluation Guide - Comprehensive evaluation system guide
- AI Analysis Guide - AI-powered optimization
- Authentication - API authentication
- Peer API - Peer management API
Code Examples
Complete Evaluation Workflow
const axios = require('axios');

const API_KEY = 'your_api_key';
const BASE_URL = 'https://api.cognipeer.com/api/v1';
// Peer under test. Suites are workspace-level, so the peer is supplied when
// the run is started, not when the suite is created.
const PEER_ID = 'peer_123';
// Delay between status checks while the run is in progress.
const POLL_INTERVAL_MS = 5000;
// Upper bound on status checks (about 10 minutes at the interval above) so a
// stuck run cannot keep this script polling forever.
const MAX_POLL_ATTEMPTS = 120;

/**
 * End-to-end example: create a suite, import questions, run the suite against
 * a peer, wait for the run to finish, then fetch results and AI suggestions.
 *
 * Endpoints and payloads follow the reference sections of this document:
 * suite creation uses POST /evaluation/suite with `evaluatorConfig`, and run
 * execution uses POST /evaluation/run with `evaluationSuiteId` and `peerId`.
 *
 * @returns {Promise<void>} Resolves once results and suggestions are logged.
 * @throws {Error} If any request fails, or if the run is not `completed`
 *   when polling stops.
 */
async function runEvaluationWorkflow() {
  // Every request carries the same bearer token.
  const requestConfig = { headers: { Authorization: `Bearer ${API_KEY}` } };

  // 1. Create evaluation suite (not tied to any peer)
  const suite = await axios.post(`${BASE_URL}/evaluation/suite`, {
    name: 'Support Quality Test',
    evaluatorConfig: {
      llmJudge: {
        enabled: true,
        config: { modelId: 'chatgpt-4o-mini' },
      },
    },
  }, requestConfig);
  const suiteId = suite.data.data.id;

  // 2. Import questions
  await axios.post(`${BASE_URL}/evaluation/${suiteId}/questions/import`, {
    format: 'json',
    data: {
      questions: [
        {
          question: 'What are your hours?',
          expectedAnswer: 'Mon-Fri 9-6 EST',
        },
      ],
    },
  }, requestConfig);

  // 3. Run evaluation: both the suite and the peer are required here
  const run = await axios.post(`${BASE_URL}/evaluation/run`, {
    evaluationSuiteId: suiteId,
    peerId: PEER_ID,
  }, requestConfig);
  const runId = run.data.data.runId;

  // 4. Poll for completion, giving up after MAX_POLL_ATTEMPTS checks
  let status = 'running';
  for (let attempt = 0; status === 'running' && attempt < MAX_POLL_ATTEMPTS; attempt += 1) {
    await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS));
    const statusRes = await axios.get(`${BASE_URL}/evaluation/run/${runId}`, requestConfig);
    status = statusRes.data.data.status;
  }
  // Anything other than "completed" (still running, or a failure status)
  // means there are no final results to fetch.
  if (status !== 'completed') {
    throw new Error(`Evaluation run ${runId} did not complete (last status: ${status})`);
  }

  // 5. Get results
  const results = await axios.get(
    `${BASE_URL}/evaluation/${suiteId}/runs/${runId}/results`,
    requestConfig,
  );
  console.log('Evaluation completed:', results.data);

  // 6. Request AI analysis
  const analysis = await axios.post(
    `${BASE_URL}/evaluation/${runId}/suggest-improvements`,
    {},
    requestConfig,
  );
  console.log('AI Suggestions:', analysis.data);
}

// Surface failures instead of leaving an unhandled promise rejection.
runEvaluationWorkflow().catch((err) => {
  // axios attaches the HTTP response to errors raised for non-2xx statuses.
  const detail = err.response ? err.response.data : err.message;
  console.error('Evaluation workflow failed:', detail);
  process.exitCode = 1;
});
