Use Case Evaluations
Translation Evaluation
Create and evaluate a translation application
Initialize the SGP client and set up the translation test data.
from uuid import uuid4

from scale_gp import SGPClient

client = SGPClient(base_url="https://api.egp.scale.com")
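# If credentials are not picked up from your environment, the client can
# typically be constructed with an explicit key (parameter name assumed;
# check your SDK version):
# client = SGPClient(base_url="https://api.egp.scale.com", api_key="your_api_key_here")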
# Test data for translation
test_data = [
    {
        "origin_text": "Artificial intelligence (AI) is the simulation...",
        "language": "Spanish",
        "expected_translation": "La inteligencia artificial (IA) es la si..."
    },
    # Additional test data...
]
Define translation test cases and create the dataset.
from scale_gp.lib.dataset_builder import DatasetBuilder
from scale_gp.lib.types import TranslationTestCaseSchema  # import path may vary by SDK version

# Build one schema-validated test case per entry of the raw test data
test_cases = []
for data in test_data:
    tc = TranslationTestCaseSchema(
        origin_text=data["origin_text"],
        language=data["language"],
        expected_translation=data["expected_translation"]
    )
    test_cases.append(tc)
# Create a versioned evaluation dataset from the test cases
dataset = DatasetBuilder(client).initialize(
    account_id="account_id_placeholder",
    name=f"Translation Dataset {uuid4()}",
    test_cases=test_cases
)
print(dataset)
Implement a custom translation application for the evaluation.
from datetime import datetime

from scale_gp.lib.external_applications import (
    ExternalApplication,
    ExternalApplicationOutputFlexible,
)

def my_translation_app(prompt, test_case):
    start = datetime.now().replace(microsecond=5000)
    return ExternalApplicationOutputFlexible(
        generation_output={
            "generated_translation": "Sample Translation HERE"
        },
        trace_spans=[
            {
                "node_id": "formatting",
                "start_timestamp": start.isoformat(),
                "operation_input": {
                    "document": "EXAMPLE INPUT TEXT"
                },
                "operation_output": {
                    "formatted_document": "EXAMPLE INPUT TEXT FORMATTED"
                },
                "duration_ms": 1000,
            }
        ],
        metrics={"grammar": 0.5}
    )
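# Optional local sanity check before wiring the stub into the harness
# (placeholder arguments; the stub ignores them, and attribute access on the
# returned object is assumed to mirror its constructor fields):
sample = my_translation_app("Translate to Spanish: Hello, world.", test_case=None)
print(sample.generation_output["generated_translation"])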
# Register the application and generate outputs against version 1 of the dataset
app = ExternalApplication(client)
app.initialize(application_variant_id="variant_id_placeholder", application=my_translation_app)
app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')
Create evaluation questions and set up the evaluation configuration.
question_requests = [
    {
        "type": "categorical",
        "title": "Test Question 1",
        "prompt": "Does the translation have punctuation issues?",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}],
        "account_id": "account_id_placeholder",
    },
    # Additional questions...
]
question_ids = []
for question in question_requests:
    q = client.questions.create(**question)
    question_ids.append(q.id)
    print(q)
q_set = client.question_sets.create(
    name="translation question set",
    question_ids=question_ids,
    account_id="account_id_placeholder"
)
print(q_set)
config = client.evaluation_configs.create(
    account_id="account_id_placeholder",
    question_set_id=q_set.id,
    evaluation_type='human'
)
print(config)
Set up the annotation configuration and start the evaluation.
from scale_gp.types import TranslationAnnotationConfigParam
from scale_gp.lib.types import data_locator

# Map the annotation UI's fields to locations in the test case data and application output
annotation_config_dict = TranslationAnnotationConfigParam(
    original_text_loc=data_locator.test_case_data.input["origin_text"],
    translation_loc=data_locator.test_case_output.output["generated_translation"],
    expected_translation_loc=data_locator.test_case_data.expected_output["expected_translation"],
)
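# The locators above are path references into stored records, roughly shaped as:
#   test_case_data.input           -> {"origin_text": "...", "language": "..."}
#   test_case_output.output        -> {"generated_translation": "..."}
#   test_case_data.expected_output -> {"expected_translation": "..."}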
evaluation = client.evaluations.create(
    account_id="account_id_placeholder",
    application_variant_id="variant_id_placeholder",
    application_spec_id="spec_id_placeholder",
    description="Demo Evaluation",
    name="Translation Evaluation",
    evaluation_config_id=config.id,
    annotation_config=annotation_config_dict,
    evaluation_dataset_id=dataset.id,
    type="builder"
)
print(evaluation)
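After creation, the evaluation is available on the platform for human annotators. As a minimal sketch, assuming the SDK exposes a retrieve method on the evaluations resource alongside the create method used above, you can re-fetch the evaluation to inspect its state:
fetched = client.evaluations.retrieve(evaluation.id)  # method name assumed; check your SDK reference
print(fetched)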