evaluator.py

import asyncio

from metrics.answer_correctness import evaluate as eval_answer_correctness
from metrics.hallucination import evaluate as eval_hallucination
from metrics.entity_accuracy import evaluate as eval_entity_accuracy
from metrics.date_accuracy import evaluate as eval_date_accuracy
from metrics.refusal_correctness import evaluate as eval_refusal_correctness


async def _evaluate_async(document_id, document_text, samples, run_agent):
    """Evaluate each sample using separate LLM metric evaluators."""
    results = []
    print("\nRunning Evaluation\n")

    for sample in samples:
        question = sample.get("question", "")
        expected = sample.get("expected_answer", "")

        # Query the agent by document id when available, otherwise pass the raw text.
        if document_id:
            result = run_agent(document_id, question)
        else:
            result = run_agent(document_text, question)

        # The agent may return either a dict with an "answer" key or a bare string.
        if isinstance(result, dict):
            predicted = result.get("answer", "")
        else:
            predicted = result

        # Run all metric evaluators concurrently for this sample.
        (
            answer_result,
            hallucination_result,
            entity_result,
            date_result,
            refusal_result,
        ) = await asyncio.gather(
            eval_answer_correctness(predicted, expected, document_text),
            eval_hallucination(predicted, document_text),
            eval_entity_accuracy(predicted, expected, document_text),
            eval_date_accuracy(predicted, expected, document_text),
            eval_refusal_correctness(predicted, expected, document_text),
        )

        metrics = {
            "answer_correctness": answer_result,
            "hallucination": hallucination_result,
            "entity_accuracy": entity_result,
            "date_accuracy": date_result,
            "refusal_correctness": refusal_result,
        }

        # A sample's score is the mean over all metrics that returned a dict with a "score".
        metric_scores = [m.get("score", 0.0) for m in metrics.values() if isinstance(m, dict)]
        sample_score = sum(metric_scores) / len(metric_scores) if metric_scores else 0.0

        print(f"Q: {question}")
        print(f"Expected: {expected}")
        print(f"Predicted: {predicted}")
        print(f"Sample score: {sample_score}\n")

        results.append({
            "question": question,
            "expected_answer": expected,
            "predicted_answer": predicted,
            "sample_score": sample_score,
            "metrics": metrics,
        })

    # The overall score is the mean of the per-sample scores.
    overall_score = 0.0
    if results:
        overall_score = sum(r.get("sample_score", 0.0) for r in results) / len(results)

    return {"overall_score": overall_score, "results": results}


def evaluate(document_id, document_text, samples, run_agent):
    """Synchronous entrypoint for running evaluation from CLI/API."""
    return asyncio.run(_evaluate_async(document_id, document_text, samples, run_agent))
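

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how a caller might invoke evaluate(), assuming samples
# shaped like {"question": ..., "expected_answer": ...} and an agent callable
# taking (document_id_or_text, question); the names `my_agent`, "doc-123", and
# the sample contents below are hypothetical.
#
#     samples = [
#         {"question": "Who signed the contract?", "expected_answer": "Acme Corp"},
#     ]
#     report = evaluate("doc-123", "", samples, my_agent)
#     print(report["overall_score"])
#     for r in report["results"]:
#         print(r["question"], r["sample_score"])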