// my-evals.js
import { createApp } from 'claudeye';
const app = createApp();

// Global condition: skip sessions that contain no entries at all.
app.condition((context) => {
  const { entries } = context;
  return entries.length > 0;
});
// ── Turn count ───────────────────────────────────────────────────
// More than 50 turns usually means the agent is looping or lost.
app.eval('under-50-turns', ({ stats }) => {
  // Passing requires at most 50 turns. The score decreases linearly with the
  // turn count and is clamped so it never drops below 0.
  const { turnCount } = stats;
  const linearScore = 1 - turnCount / 100;

  return {
    pass: turnCount <= 50,
    score: Math.max(0, linearScore),
    message: `${turnCount} turn(s)`,
  };
});
// ── Tool success rate ────────────────────────────────────────────
// Flag sessions where more than 10% of tool calls returned errors.
app.eval('tool-success-rate', ({ entries }) => {
  // Count individual tool_result blocks, not the user entries that carry them.
  // A single entry may hold several results, and counting per entry would
  // treat it as one call and mark it failed if any block in it had an error.
  const toolResults = entries
    .filter((entry) => entry.type === 'user' && Array.isArray(entry.message?.content))
    .flatMap((entry) => entry.message.content)
    .filter((block) => block?.type === 'tool_result');

  const errors = toolResults.filter((block) => block.is_error === true);

  // A session without any tool results has nothing that could fail, so it
  // gets a perfect rate instead of dividing by zero.
  const rate = toolResults.length > 0
    ? 1 - errors.length / toolResults.length
    : 1;

  return {
    pass: rate >= 0.9,
    score: rate,
    message: `${errors.length}/${toolResults.length} tool errors`,
  };
});
// ── Session completion ───────────────────────────────────────────
// A complete session ends with an assistant text response.
app.eval('has-completion', ({ entries }) => {
  // Walk backwards to the most recent assistant entry; earlier ones are ignored.
  let lastAssistant;
  for (let i = entries.length - 1; i >= 0; i -= 1) {
    if (entries[i].type === 'assistant') {
      lastAssistant = entries[i];
      break;
    }
  }

  // `some?.` keeps this from throwing when the content has no `some` method.
  const blocks = lastAssistant?.message?.content;
  const hasText = blocks?.some?.((block) => block.type === 'text');

  if (hasText) {
    return { pass: true, score: 1.0, message: 'Ended with text response' };
  }
  return { pass: false, score: 0, message: 'No final text response' };
});
// ── Session-level tool count ─────────────────────────────────────
app.eval('session-tool-count', ({ entries }) => {
  // Tally tool_use blocks found on assistant entries whose `_source` is 'session'.
  const toolUseCount = entries.reduce((count, entry) => {
    if (entry._source !== 'session' || entry.type !== 'assistant') {
      return count;
    }
    const blocks = entry.message?.content || [];
    return count + blocks.filter((block) => block.type === 'tool_use').length;
  }, 0);

  // Passes up to 100 calls; the score falls linearly and is clamped at 0.
  const linearScore = 1 - toolUseCount / 200;

  return {
    pass: toolUseCount <= 100,
    score: Math.max(0, linearScore),
    message: `${toolUseCount} session-level tool calls`,
  };
});
// ── Enrichments ──────────────────────────────────────────────────
app.enrich('session-overview', ({ stats }) => {
  const { turnCount, toolCallCount, subagentCount, duration, models } = stats;

  // An empty model list joins to '', which falls back to the 'none' label.
  const modelList = models.join(', ');

  return {
    Turns: turnCount,
    'Tool Calls': toolCallCount,
    Subagents: subagentCount,
    Duration: duration,
    Models: modelList || 'none',
  };
});
// Called last, after every condition, eval, and enrichment above has been registered.
// NOTE(review): presumably this starts the Claudeye app/server — confirm against the claudeye docs.
app.listen();
// Run with: claudeye --evals ./my-evals.js

