/
evaluation_metrics.py
30 lines (25 loc) · 1.17 KB
/
evaluation_metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Custom faithfulness score
def calculate_faithfulness(answer, source_documents):
    """Return the fraction of answer sentences found verbatim in the sources.

    Both the answer and every source document are split on '.' into
    sentences. Each sentence is stripped of surrounding whitespace and empty
    fragments are discarded. An answer sentence counts as faithful when it
    occurs as a substring of at least one source sentence.

    Args:
        answer: The generated answer text.
        source_documents: Iterable of source document strings.

    Returns:
        A float in [0.0, 1.0]. Returns 0.0 when the answer contains no
        non-empty sentences (nothing to support) or when there are no
        source sentences to match against.
    """
    source_sentences = [
        sentence
        for doc in source_documents
        for sentence in (part.strip() for part in doc.split('.'))
        if sentence
    ]

    # Strip each fragment so a leading space after a period does not block a
    # match, and drop empty fragments: '' is a substring of every string and
    # would otherwise be counted as a supported sentence.
    answer_sentences = [
        sentence
        for sentence in (part.strip() for part in answer.split('.'))
        if sentence
    ]

    if not answer_sentences:
        return 0.0

    supported = sum(
        any(ans_sent in src_sent for src_sent in source_sentences)
        for ans_sent in answer_sentences
    )
    return supported / len(answer_sentences)
# Alternative approach using semantic similarity (disabled): abandoned because the transformer model failed to load properly.
# from transformers import AutoModelForSequenceClassification, AutoTokenizer
# import torch
#
# # Load a pre-trained model and tokenizer for semantic similarity
# model_name = "sentence-transformers/all-MiniLM-L6-v2"
# model = AutoModelForSequenceClassification.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
#
#
# def calculate_faithfulness(answer, source_documents):
# source_text = ' '.join(source_documents)
# inputs = tokenizer(answer, source_text, return_tensors="pt", padding=True, truncation=True)
# with torch.no_grad():
# outputs = model(**inputs)
# similarity = torch.sigmoid(outputs.logits).item()
#
# return similarity