Use Case Evaluations
Multiturn Evaluation
Recipes
- Evaluations
- Applications
- Datasets
- Inference
Use Case Evaluations
Multiturn Evaluation
Create and evaluate a multiturn application
Set up the SGP Client for communication with the API:
from scale_gp import SGPClient

# All requests go to the public SGP API endpoint.
API_BASE_URL = "https://api.egp.scale.com"
client = SGPClient(base_url=API_BASE_URL)
Build and initialize a multiturn dataset with predefined messages:
# Opening user messages; each one seeds a separate multiturn test case.
initial_prompts = [
    "What were the key factors that led to the French Revolution of 1789?",
    "How did Napoleon Bonaparte's rise to power impact French society and politics in the early 19th century?",
    "Analyze the economic and social consequences of French colonialism in North Africa during the 19th and early 20th centuries.",
]
message_data = [
    {"init_messages": [{"role": "user", "content": prompt}]}
    for prompt in initial_prompts
]
test_cases = [
    MultiturnTestCaseSchema(messages=entry["init_messages"])
    for entry in message_data
]
# Register the test cases as a named evaluation dataset under this account.
dataset = DatasetBuilder(client).initialize(
    account_id=os.environ["SGP_ACCOUNT_ID"],
    name=f"Multiturn Dataset {timestamp()}",
    test_cases=test_cases,
)
print(dataset)
Define the multiturn application, which returns a simulated conversation as its output, then initialize it:
# Canned assistant replies, keyed implicitly by their opening user message.
# my_multiturn_app looks up the entry whose first message matches the prompt
# and replays the whole conversation as the application's output.
conversation_data = [
{
"conversation": [
{"role": "user", "content": "What were the key factors that led to the French Revolution of 1789?"},
{"role": "assistant", "content": "The French Revolution of 1789 was the result of a complex interplay of social, economic, and political factors. Some key elements include...\n"}
],
},
# More conversation data entries...
]
def my_multiturn_app(prompt, test_case):
    """Simulate a multiturn application by replaying a canned conversation.

    Looks up the entry in ``conversation_data`` whose first message equals the
    prompt's first message, then emits one trace span per assistant turn along
    with randomized demo metrics.

    Args:
        prompt: Mapping with a "messages" list; presumably the test case's
            init messages (role/content dicts) — confirm against the dataset.
        test_case: The dataset test case (unused here).

    Returns:
        ExternalApplicationOutputFlexible with the full conversation, per-turn
        trace spans, and demo metric scores.

    Raises:
        ValueError: If no canned conversation matches the prompt.
    """
    output = None
    for entry in conversation_data:
        if entry["conversation"][0] == prompt['messages'][0]:
            output = entry["conversation"]
            break
    if output is None:
        # Fail loudly instead of the cryptic TypeError that len(None) would
        # raise below when the prompt has no matching canned conversation.
        raise ValueError(
            f"No canned conversation matches prompt message: {prompt['messages'][0]!r}"
        )
    start = datetime.now().replace(microsecond=5000)
    traces = []
    global_turn_counter = 1
    # Assistant messages sit at odd indices (user/assistant alternation);
    # each one becomes a trace span whose input is the history before it.
    for i in range(1, len(output), 2):
        prev_convo = output[:i]
        assistant_message = output[i]["content"]
        trace = {
            "node_id": f"Model Call #{global_turn_counter}",
            "start_timestamp": str(start.isoformat()),
            "operation_input": {"conversation_history": prev_convo},
            "operation_output": {"response": assistant_message},
            # Random duration purely for demo realism.
            "duration_ms": random.randint(200, 600),
        }
        traces.append(trace)
        global_turn_counter += 1
    return ExternalApplicationOutputFlexible(
        generation_output={"generated_conversation": output},
        trace_spans=traces,
        # Demo-only metrics; a real application would compute these.
        metrics={"grammar": round(random.random(), 3), "memory": round(random.random(), 3), "content": round(random.random(), 3)}
    )
# Wrap the local function as an external application, bind it to the
# application variant, then run it over every test case in version 1 of
# the dataset to record outputs for evaluation.
app = ExternalApplication(client)
app.initialize(application_variant_id=variant.id, application=my_multiturn_app)
app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')
Prepare questions for evaluating the multiturn conversation:
# Shared pieces: every question is created under the same account, and both
# categorical questions use identical yes/no choices.
account_id = os.environ["SGP_ACCOUNT_ID"]
yes_no_choices = [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}]

question_requests = [
    {
        "type": "categorical",
        "title": "Question 1",
        "prompt": "Does the conversation make sense",
        "choices": yes_no_choices,
        "account_id": account_id,
    },
    {
        "type": "categorical",
        "title": "Question 2",
        "prompt": "Is the user query answered correctly?",
        "choices": yes_no_choices,
        "account_id": account_id,
    },
    {
        "type": "free_text",
        "title": "Question 3",
        "prompt": "If any messages are incorrect, list their turn number",
        "account_id": account_id,
    },
]

# Create each question via the API, then group them into a question set.
question_ids = [client.questions.create(**request).id for request in question_requests]
q_set = client.question_sets.create(
    name="multiturn question set",
    question_ids=question_ids,
    account_id=account_id,
)
print(q_set)
Configure and initiate the evaluation process:
# Build a human-annotation evaluation config bound to the question set.
config_params = dict(
    account_id=os.environ["SGP_ACCOUNT_ID"],
    question_set_id=q_set.id,
    evaluation_type="human",
)
config = client.evaluation_configs.create(**config_params)
print(config)

# Tell annotators where the conversation lives inside each test-case output.
annotation_config_dict = {
    "messages_loc": data_locator.test_case_output.output["generated_conversation"]
}
# Kick off the evaluation, tying together the application variant, the
# dataset of generated outputs, the annotation layout, and the question set.
evaluation_params = {
    "account_id": os.environ["SGP_ACCOUNT_ID"],
    "application_variant_id": variant.id,
    "application_spec_id": spec.id,
    "description": "Demo Multiturn Evaluation",
    "name": "Multiturn Evaluation",
    "evaluation_config_id": config.id,
    "annotation_config": annotation_config_dict,
    "evaluation_dataset_id": dataset.id,
    "type": "builder",
}
evaluation = client.evaluations.create(**evaluation_params)
print(evaluation)