Commit 845bfb7

Raga's Evaluation For Multi Modes (#806)
* Updated models for ragas eval
* context utilization metrics removed
* updated supported llms for ragas
* removed context utilization
* Implemented Parallel API
* multi api calls error resolved
* MultiMode Metrics
* Fix: Metric Evaluation For Single Mode
* multi modes ragas evaluation
* api payload changes
* metric api output format changed
* multi mode ragas changes
* removed pre process dataset
* api response changes
* Multimode metrics api integration
* nan error for no answer resolved
* QA integration changes

---------

Co-authored-by: kaustubh-darekar <[email protected]>
1 parent 3d587f0 commit 845bfb7

File tree: 11 files changed (+378, -187 lines)


backend/score.py

Lines changed: 26 additions & 16 deletions
@@ -786,24 +786,34 @@ async def retry_processing(uri=Form(), userName=Form(), password=Form(), databas
         gc.collect()
 
 @app.post('/metric')
-async def calculate_metric(question=Form(), context=Form(), answer=Form(), model=Form()):
+async def calculate_metric(question: str = Form(),
+                           context: str = Form(),
+                           answer: str = Form(),
+                           model: str = Form(),
+                           mode: str = Form()):
     try:
-        payload_json_obj = {'api_name':'metric', 'context':context, 'answer':answer, 'model':model, 'logging_time': formatted_time(datetime.now(timezone.utc))}
-        logger.log_struct(payload_json_obj, "INFO")
-        result = await asyncio.to_thread(get_ragas_metrics, question, context, answer, model)
-        if result is None or "error" in result:
-            return create_api_response(
-                'Failed',
-                message='Failed to calculate evaluation metrics.',
-                error=result.get("error", "Ragas evaluation returned null")
-            )
-        return create_api_response('Success', data=result)
+        context_list = [str(item).strip() for item in json.loads(context)] if context else []
+        answer_list = [str(item).strip() for item in json.loads(answer)] if answer else []
+        mode_list = [str(item).strip() for item in json.loads(mode)] if mode else []
+
+        result = await asyncio.to_thread(
+            get_ragas_metrics, question, context_list, answer_list, model
+        )
+        if result is None or "error" in result:
+            return create_api_response(
+                'Failed',
+                message='Failed to calculate evaluation metrics.',
+                error=result.get("error", "Ragas evaluation returned null")
+            )
+        data = {mode: {metric: result[metric][i] for metric in result} for i, mode in enumerate(mode_list)}
+        return create_api_response('Success', data=data)
     except Exception as e:
-        job_status = "Failed"
-        message = "Error while calculating evaluation metrics"
-        error_message = str(e)
-        logging.exception(f'{error_message}')
-        return create_api_response(job_status, message=message, error=error_message)
+        logging.exception(f"Error while calculating evaluation metrics: {e}")
+        return create_api_response(
+            'Failed',
+            message="Error while calculating evaluation metrics",
+            error=str(e)
+        )
     finally:
         gc.collect()

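Reading the updated handler: `context`, `answer`, and `mode` now arrive as JSON-encoded lists of equal length, so a single request can score the same question across several chat modes at once, with the response regrouped per mode. Below is a minimal client-side sketch; the base URL, the sample mode names, and the use of `requests` are assumptions for illustration, while the field names and the per-mode regrouping come from the diff above.

import json
import requests  # assumption: any form-capable HTTP client works

# One question scored for two chat modes; context[i], answer[i], and mode[i] belong together.
payload = {
    "question": "Who founded Neo4j?",
    "context": json.dumps(["Neo4j was founded by Emil Eifrem.",
                           "Emil Eifrem founded Neo4j in 2007."]),
    "answer": json.dumps(["Emil Eifrem founded Neo4j.",
                          "Neo4j was founded by Emil Eifrem in 2007."]),
    "model": "openai_gpt_4o",
    "mode": json.dumps(["vector", "graph+vector"]),  # assumed mode names
}

# Form-encoded POST, matching the Form() parameters of /metric.
resp = requests.post("http://localhost:8000/metric", data=payload)
print(resp.json())
# Illustrative success payload (envelope keys depend on create_api_response):
# {"status": "Success",
#  "data": {"vector":       {"faithfulness": 1.0, "answer_relevancy": 0.98},
#           "graph+vector": {"faithfulness": 1.0, "answer_relevancy": 0.97}}}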
backend/src/QA_integration.py

Lines changed: 1 addition & 1 deletion
@@ -435,7 +435,7 @@ def process_chat_response(messages, history, question, model, graph, document_na
         total_tokens = 0
         formatted_docs = ""
 
-        question = transformed_question if transformed_question else question
+        # question = transformed_question if transformed_question else question
         # metrics = get_ragas_metrics(question,formatted_docs,content)
         # print(metrics)

backend/src/ragas_eval.py

Lines changed: 17 additions & 61 deletions
@@ -1,96 +1,52 @@
 import os
 import logging
 import time
-from typing import Dict, Tuple, Optional
-import boto3
+from src.llm import get_llm
 from datasets import Dataset
 from dotenv import load_dotenv
-from langchain_anthropic import ChatAnthropic
-from langchain_aws import ChatBedrock
-from langchain_community.chat_models import ChatOllama
-from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
-from langchain_fireworks import ChatFireworks
-from langchain_google_vertexai import (
-    ChatVertexAI,
-    HarmBlockThreshold,
-    HarmCategory,
-)
-from langchain_groq import ChatGroq
-from langchain_openai import AzureChatOpenAI, ChatOpenAI
 from ragas import evaluate
-from ragas.metrics import answer_relevancy, context_utilization, faithfulness
+from ragas.metrics import answer_relevancy, faithfulness
 from src.shared.common_fn import load_embedding_model
-
 load_dotenv()
 
-RAGAS_MODEL_VERSIONS = {
-    "openai_gpt_3.5": "gpt-3.5-turbo-16k",
-    "openai_gpt_4": "gpt-4-turbo-2024-04-09",
-    "openai_gpt_4o_mini": "gpt-4o-mini-2024-07-18",
-    "openai_gpt_4o": "gpt-4o-mini-2024-07-18",
-    "groq_llama3_70b": "groq_llama3_70b",
-}
 EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
 EMBEDDING_FUNCTION, _ = load_embedding_model(EMBEDDING_MODEL)
 
-
-def get_ragas_llm(model: str) -> Tuple[object, str]:
-    """Retrieves the specified language model. Improved error handling and structure."""
-    env_key = f"LLM_MODEL_CONFIG_{model}"
-    env_value = os.environ.get(env_key)
-    logging.info(f"Loading model configuration: {env_key}")
-    try:
-        if "openai" in model:
-            model_name = RAGAS_MODEL_VERSIONS[model]
-            llm = ChatOpenAI(
-                api_key=os.environ.get("OPENAI_API_KEY"), model=model_name, temperature=0
-            )
-        elif "groq" in model:
-            model_name, base_url, api_key = env_value.split(",")
-            llm = ChatGroq(api_key=api_key, model_name=model_name, temperature=0)
-        else:
-            raise ValueError(f"Unsupported model for evaluation: {model}")
-
-        logging.info(f"Model loaded - Model Version: {model}")
-        return llm, model_name
-    except (ValueError, KeyError) as e:
-        logging.error(f"Error loading LLM: {e}")
-        raise
-
-
-def get_ragas_metrics(
-    question: str, context: str, answer: str, model: str
-) -> Optional[Dict[str, float]]:
+def get_ragas_metrics(question: str, context: list, answer: list, model: str):
     """Calculates RAGAS metrics."""
     try:
         start_time = time.time()
         dataset = Dataset.from_dict(
-            {"question": [question], "answer": [answer], "contexts": [[context]]}
+            {"question": [question] * len(answer), "answer": answer, "contexts": [[ctx] for ctx in context]}
         )
-        logging.info("Dataset created successfully.")
-
-        llm, model_name = get_ragas_llm(model=model)
+        logging.info("Evaluation dataset created successfully.")
+        if ("diffbot" in model) or ("ollama" in model):
+            raise ValueError(f"Unsupported model for evaluation: {model}")
+        else:
+            llm, model_name = get_llm(model=model)
+
         logging.info(f"Evaluating with model: {model_name}")
-
+
         score = evaluate(
             dataset=dataset,
-            metrics=[faithfulness, answer_relevancy, context_utilization],
+            metrics=[faithfulness, answer_relevancy],
             llm=llm,
             embeddings=EMBEDDING_FUNCTION,
         )
-
+
         score_dict = (
-            score.to_pandas()[["faithfulness", "answer_relevancy", "context_utilization"]]
+            score.to_pandas()[["faithfulness", "answer_relevancy"]]
+            .fillna(0)
             .round(4)
-            .to_dict(orient="records")[0]
+            .to_dict(orient="list")
         )
         end_time = time.time()
         logging.info(f"Evaluation completed in: {end_time - start_time:.2f} seconds")
         return score_dict
     except ValueError as e:
         if "Unsupported model for evaluation" in str(e):
             logging.error(f"Unsupported model error: {e}")
-            return {"error": str(e)}  # Return the specific error message as a dictionary
+            return {"error": str(e)}
         logging.exception(f"ValueError during metrics evaluation: {e}")
         return {"error": str(e)}
     except Exception as e:

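With `orient="list"`, `get_ragas_metrics` now returns one index-aligned list of scores per metric instead of a single record, and the `/metric` endpoint regroups those lists per mode. A small sketch of that regrouping with made-up scores and assumed mode names (only the comprehension itself mirrors the change in backend/score.py):

# Illustrative output of get_ragas_metrics(...) after .fillna(0).round(4).to_dict(orient="list"):
result = {
    "faithfulness":     [1.0, 0.75],
    "answer_relevancy": [0.98, 0.91],
}
mode_list = ["vector", "graph+vector"]  # assumed mode names, index-aligned with the lists above

# Same comprehension as in backend/score.py: one metrics dict per mode.
data = {mode: {metric: result[metric][i] for metric in result}
        for i, mode in enumerate(mode_list)}

print(data)
# {'vector': {'faithfulness': 1.0, 'answer_relevancy': 0.98},
#  'graph+vector': {'faithfulness': 0.75, 'answer_relevancy': 0.91}}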