import { createApp } from 'claudeye';
import Anthropic from '@anthropic-ai/sdk';
// claudeye app instance: the evals in this file are registered on it via
// app.eval(...) and it is started with app.listen().
const app = createApp();
// Anthropic SDK client used by judge(). Constructed with no explicit options.
// NOTE(review): presumably picks up credentials (e.g. ANTHROPIC_API_KEY) from
// the environment — confirm against the SDK docs / deployment config.
const client = new Anthropic();
// Model id sent with every judge request.
const JUDGE_MODEL = 'claude-haiku-4-5-20251001';
/**
 * Ask the judge model to grade something and parse its verdict.
 *
 * The reply is expected to contain `SCORE: <0-10>` and `REASON: <one sentence>`
 * (the system prompts passed by the callers in this file request that format).
 *
 * @param {string} systemPrompt - Grading instructions, sent as the system prompt.
 * @param {string} userPrompt - Material to grade, sent as the single user message.
 * @returns {Promise<{score: number, reason: string}>} `score` is the parsed
 *   0-10 value divided by 10 and clamped to [0, 1] (0.5 when no score could be
 *   parsed). `reason` is the first line following `REASON:`, trimmed, or the
 *   first 120 characters of the raw reply when no reason label is present.
 */
async function judge(systemPrompt, userPrompt) {
  const response = await client.messages.create({
    model: JUDGE_MODEL,
    max_tokens: 256,
    system: systemPrompt,
    messages: [{ role: 'user', content: userPrompt }],
  });
  const text = response.content.find((block) => block.type === 'text')?.text ?? '';

  // Models often wrap labels or values in markdown emphasis ("**SCORE:** 8",
  // "SCORE: **8**"); accept optional asterisks around the colon so those
  // replies are not silently scored with the fallback below.
  const scoreMatch = text.match(/SCORE\**\s*:\s*\**\s*(\d+(?:\.\d+)?)/i);
  const reasonMatch = text.match(/REASON\**\s*:\**\s*(.+)/is);

  // No parseable score: fall back to the midpoint of the 0-10 scale.
  const rawScore = scoreMatch ? Number.parseFloat(scoreMatch[1]) : 5;

  // Cut to the first line *before* trimming so trailing spaces or a stray
  // "\r" from CRLF replies do not leak into the reported reason.
  const reason = reasonMatch
    ? reasonMatch[1].split(/\r?\n/)[0].trim()
    : text.slice(0, 120);

  return {
    score: Math.min(1, Math.max(0, rawScore / 10)),
    reason,
  };
}
/**
 * Extract the text of a message `content` value.
 *
 * @param {unknown} content - Either a plain string or an array of content blocks.
 * @returns {string} The string itself, the `text` of the first `text` block in
 *   an array, or '' when neither is available.
 */
function extractMessageText(content) {
  if (typeof content === 'string') return content;
  if (!Array.isArray(content)) return '';
  // `block?.` guards against null/undefined holes in the content array.
  return content.find((block) => block?.type === 'text')?.text ?? '';
}

/**
 * Render session entries as a plain-text `USER:` / `ASSISTANT:` transcript.
 *
 * Only entries whose `type` is 'user' or 'assistant' and that carry non-empty
 * text are included; everything else (including messages that only contain
 * non-text blocks) is skipped.
 *
 * @param {Array<object>} entries - Session entries (`{ type, message: { content } }`).
 * @param {object} [options]
 * @param {number} [options.maxEntryChars=400] - Per-message character cap.
 * @param {number} [options.maxChars=3000] - Cap on the whole transcript; the
 *   beginning of the transcript is what is kept.
 * @returns {string} Newline-joined transcript, '' when nothing qualifies.
 */
function buildTranscript(entries, { maxEntryChars = 400, maxChars = 3000 } = {}) {
  const speakerLabels = new Map([
    ['user', 'USER'],
    ['assistant', 'ASSISTANT'],
  ]);

  const lines = [];
  for (const entry of entries ?? []) {
    const label = speakerLabels.get(entry?.type);
    if (label === undefined) continue;

    const text = extractMessageText(entry.message?.content);
    if (text) lines.push(`${label}: ${text.slice(0, maxEntryChars)}`);
  }
  return lines.join('\n').slice(0, maxChars);
}
// ── Task completion ───────────────────────────────────────────────
// Eval "task-completion": has the judge grade how completely the assistant
// fulfilled the user's request, based on the rendered session transcript.
// Passes when the normalized score is at least 0.6; a session that renders to
// an empty transcript fails outright without calling the judge.
app.eval('task-completion', async ({ entries }) => {
  const passThreshold = 0.6;
  const rubric =
    'You are an eval judge. Rate how completely the assistant fulfilled the user\'s request.\nSCORE: <0-10>\nREASON: <one sentence>';

  const conversation = buildTranscript(entries);
  if (conversation) {
    const verdict = await judge(rubric, `Transcript:\n${conversation}`);
    return {
      pass: verdict.score >= passThreshold,
      score: verdict.score,
      message: verdict.reason,
    };
  }

  // Nothing to grade.
  return { pass: false, score: 0, message: 'Empty session' };
});
// ── Faithfulness ─────────────────────────────────────────────────
// Eval "faithfulness": has the judge grade whether the assistant's claims are
// backed by tool results. Sessions without any tool_result block are reported
// as not applicable (pass, score 1). Passes when the normalized score is at
// least 0.7.
app.eval('faithfulness', async ({ entries }) => {
  const passThreshold = 0.7;
  // Bounds that keep the evidence section of the judge prompt small.
  const maxResultChars = 400;
  const maxEvidenceChars = 3000;

  // A tool_result's `content` is either a plain string or an array of content
  // blocks; only the textual parts are rendered.
  const renderToolResult = ({ content }) => {
    if (typeof content === 'string') return content;
    if (!Array.isArray(content)) return '';
    return content
      .filter((part) => part?.type === 'text' && typeof part.text === 'string')
      .map((part) => part.text)
      .join('\n');
  };

  // Tool results arrive as blocks inside user-role entries.
  const toolResults = entries
    .filter((entry) => entry.type === 'user' && Array.isArray(entry.message?.content))
    .flatMap((entry) => entry.message.content.filter((block) => block?.type === 'tool_result'));

  if (toolResults.length === 0) {
    return { pass: true, score: 1, message: 'No tool calls - faithfulness N/A' };
  }

  // The text transcript only carries text blocks, so the tool outputs the
  // judge is asked to check claims against must be supplied explicitly;
  // otherwise it would be grading grounding without seeing the evidence.
  const evidence = toolResults
    .map(renderToolResult)
    .filter((text) => text.length > 0)
    .map((text) => `TOOL_RESULT: ${text.slice(0, maxResultChars)}`)
    .join('\n')
    .slice(0, maxEvidenceChars);

  const transcript = buildTranscript(entries);
  const { score, reason } = await judge(
    'You are an eval judge checking for hallucination. Rate how faithfully the assistant\'s claims are supported by tool results. 10 = fully grounded, 0 = made up facts.\nSCORE: <0-10>\nREASON: <one sentence>',
    `Transcript:\n${transcript}\n\nTool results:\n${evidence || '(no textual tool output)'}`
  );
  return { pass: score >= passThreshold, score, message: reason };
});
// ── Answer quality ────────────────────────────────────────────────
// Eval "answer-quality": has the judge grade the last assistant message
// against the first user message. Fails without calling the judge when the
// last assistant entry has no (non-blank) text. Passes when the normalized
// score is at least 0.6.
app.eval('answer-quality', async ({ entries }) => {
  // Text of a message `content`: the string itself, or the text of the first
  // `text` block of an array, or '' otherwise.
  const textOf = (content) => {
    if (typeof content === 'string') return content;
    if (Array.isArray(content)) {
      const firstTextBlock = content.find((block) => block.type === 'text');
      return firstTextBlock?.text ?? '';
    }
    return '';
  };

  // Walk backwards to the most recent assistant entry.
  let closingEntry;
  for (let i = entries.length - 1; i >= 0; i -= 1) {
    if (entries[i].type === 'assistant') {
      closingEntry = entries[i];
      break;
    }
  }

  const finalResponse = textOf(closingEntry?.message?.content);
  if (finalResponse.trim() === '') {
    return { pass: false, score: 0, message: 'No final response' };
  }

  const openingEntry = entries.find((entry) => entry.type === 'user');
  const userRequest = textOf(openingEntry?.message?.content);

  const rubric =
    'Rate the quality of the assistant\'s final response. Consider relevance, correctness, clarity, and conciseness.\nSCORE: <0-10>\nREASON: <one sentence>';
  // Both sides are truncated to keep the judge prompt bounded.
  const question = userRequest.slice(0, 500);
  const answer = finalResponse.slice(0, 1000);

  const verdict = await judge(rubric, `User request: ${question}\n\nFinal response: ${answer}`);
  return { pass: verdict.score >= 0.6, score: verdict.score, message: verdict.reason };
});
// Start the app once the evals have been registered.
// NOTE(review): presumably this launches the claudeye server/dashboard —
// confirm against the claudeye documentation.
app.listen();