Use Case Evaluations
Translation Evaluation
Recipes
- Evaluations
- Applications
- Datasets
- Inference
Use Case Evaluations
Translation Evaluation
Create and evaluate a translation application
import os
from uuid import uuid4
from datetime import datetime
from typing import List
import httpx
from scale_gp import SGPClient
from scale_gp.lib.types.translation import TranslationTestCaseSchema
from scale_gp.lib.dataset_builder import DatasetBuilder
from scale_gp.lib.external_applications import ExternalApplication, ExternalApplicationOutputFlexible
from scale_gp.types import TranslationAnnotationConfigParam
from scale_gp.lib.types import data_locator
# Initialize the client
client = SGPClient(base_url="https://api.egp.scale.com")
# Test data for translation: each entry pairs an English source text with a
# target language and a reference translation used as the expected output.
test_data = [
    {
        "origin_text": "Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems.",
        "language": "Spanish",
        "expected_translation": "La inteligencia artificial (IA) es la simulación de procesos de inteligencia humana por máquinas, especialmente sistemas informáticos."
    },
    {
        "origin_text": "Machine learning is a subset of AI that focuses on the development of computer programs that can access data and use it to learn for themselves.",
        "language": "French",
        "expected_translation": "L'apprentissage automatique est un sous-ensemble de l'IA qui se concentre sur le développement de programmes informatiques capables d'accéder aux données et de les utiliser pour apprendre par eux-mêmes."
    },
    {
        "origin_text": "Natural Language Processing (NLP) is a branch of AI that helps computers understand, interpret, and manipulate human language.",
        "language": "German",
        "expected_translation": "Die Verarbeitung natürlicher Sprache (NLP) ist ein Zweig der KI, der Computern hilft, menschliche Sprache zu verstehen, zu interpretieren und zu manipulieren."
    },
    {
        "origin_text": "Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning.",
        "language": "Italian",
        "expected_translation": "Il deep learning fa parte di una più ampia famiglia di metodi di apprendimento automatico basati su reti neurali artificiali con apprendimento della rappresentazione."
    },
    {
        "origin_text": "Robotics is a field of engineering that involves the design, construction, and operation of robots, often incorporating AI for decision-making and task execution.",
        "language": "Portuguese",
        "expected_translation": "A robótica é um campo da engenharia que envolve o design, construção e operação de robôs, frequentemente incorporando IA para tomada de decisões e execução de tarefas."
    }
]
# Create test cases: one TranslationTestCaseSchema per test-data entry
# (comprehension instead of a manual append loop).
test_cases = [
    TranslationTestCaseSchema(
        origin_text=data["origin_text"],
        language=data["language"],
        expected_translation=data["expected_translation"],
    )
    for data in test_data
]
# Dataset creation — the uuid4 suffix keeps dataset names unique across runs.
dataset = DatasetBuilder(client).initialize(
    account_id="account_id_placeholder",
    name=f"translation Dataset {uuid4()}",
    test_cases=test_cases
)
print(dataset)
# Define external application
def my_translation_app(prompt, test_case):
    """Stub translation application used to generate outputs for the evaluation.

    Args:
        prompt: test-case input data; expected to carry an "origin_text" key.
        test_case: the full test case record (unused here).

    Returns:
        ExternalApplicationOutputFlexible carrying a canned translation, one
        trace span, and a sample "grammar" metric.
    """
    print(prompt['origin_text'][:50])
    # NOTE(review): naive local time with a fixed microsecond component —
    # presumably demo data; confirm whether UTC is expected upstream.
    start = datetime.now().replace(microsecond=5000)
    return ExternalApplicationOutputFlexible(
        generation_output={
            "generated_translation": "Sample Translation HERE"
        },
        trace_spans=[
            {
                "node_id": "formatting",
                # isoformat() already returns a str; the redundant str() wrapper
                # was removed.
                "start_timestamp": start.isoformat(),
                "operation_input": {
                    "document": "EXAMPLE INPUT TEXT"
                },
                "operation_output": {
                    "formatted_document": "EXAMPLE INPUT TEXT FORMATTED"
                },
                "duration_ms": 1000,
            }
        ],
        metrics={"grammar": 0.5}
    )
# Initialize application
# Wrap the callable in an ExternalApplication bound to a specific application
# variant, then run it over version '1' of the dataset to record outputs.
app = ExternalApplication(client)
app.initialize(application_variant_id="variant_id_placeholder", application=my_translation_app)
app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')
# Evaluation setup: two categorical questions plus one free-text question that
# human annotators answer for every generated translation.
question_requests = [
    {
        "type": "categorical",
        "title": "Test Question 1",
        "prompt": "Does the translation have punctuation issues",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}],
        "account_id": "account_id_placeholder",
    },
    {
        "type": "categorical",
        "title": "Test Question 2",
        "prompt": "Does the translation have grammatical issues",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}],
        "account_id": "account_id_placeholder",
    },
    {
        "type": "free_text",
        "title": "Test Question 3",
        "prompt": "List all translation issues",
        "account_id": "account_id_placeholder",
    }
]
# Register each question and collect its id for the question set.
question_ids = []
for request in question_requests:
    created = client.questions.create(**request)
    question_ids.append(created.id)
    print(created)
# Group the questions into a reusable set, then attach that set to a
# human-evaluation config.
q_set = client.question_sets.create(
    account_id="account_id_placeholder",
    name="translation question set",
    question_ids=question_ids,
)
print(q_set)
config = client.evaluation_configs.create(
    question_set_id=q_set.id,
    account_id="account_id_placeholder",
    evaluation_type='human',
)
print(config)
# Annotation config: data locators tell the annotation UI where to find the
# source text (test-case input), the generated translation (application
# output), and the reference translation (expected output).
annotation_config_dict = TranslationAnnotationConfigParam(
    original_text_loc=data_locator.test_case_data.input["origin_text"],
    translation_loc=data_locator.test_case_output.output["generated_translation"],
    expected_translation_loc=data_locator.test_case_data.expected_output["expected_translation"],
)
# Create the human evaluation, tying together the application variant, the
# question-set config, the dataset, and the annotation layout.
evaluation = client.evaluations.create(
    account_id="account_id_placeholder",
    application_variant_id="variant_id_placeholder",
    application_spec_id="spec_id_placeholder",
    description="Demo Evaluation",
    name="Translation Evaluation",
    evaluation_config_id=config.id,
    annotation_config=annotation_config_dict,
    evaluation_dataset_id=dataset.id,
    type="builder"
)
print(evaluation)
EvaluationDataset(
id='32f3862e-75e1-4b69-ab08-638ae6ae3829',
account_id='f4b2a52e-29ff-4225-961e-378e23e67524',
created_at=datetime.datetime(2024, 10, 18, 0, 29, 30, 684934),
created_by_user_id='6f655fda-0492-494b-bc1d-8d02bcb42c89',
name='translation Dataset 2024-10-17 20:29:30 3926b308-d14b-41c8-a53f-7511fb906d13',
schema_type='FLEXIBLE',
updated_at=datetime.datetime(2024, 10, 18, 0, 29, 30, 684934),
archived_at=None,
evaluation_dataset_metadata=None,
knowledge_base_id=None,
out_of_date=None,
schema_sub_type=None,
vendor=None
)
Initialize the SGP Client and setup translation test data.
from scale_gp import SGPClient
from uuid import uuid4
# Client used by all subsequent SDK calls in this walkthrough.
client = SGPClient(base_url="https://api.egp.scale.com")
# Test data for translation
# Each entry: English source text, target language, reference translation.
test_data = [
    {
        "origin_text": "Artificial intelligence (AI) is the simulation...",
        "language": "Spanish",
        "expected_translation": "La inteligencia artificial (IA) es la si..."
    },
    # Additional test data...
]
Define translation test cases and create the dataset.
from scale_gp.lib.dataset_builder import DatasetBuilder
# Fix: this snippet references TranslationTestCaseSchema but did not import it,
# so it would raise NameError if run standalone.
from scale_gp.lib.types.translation import TranslationTestCaseSchema
# Build one schema object per test-data entry.
test_cases = []
for data in test_data:
    tc = TranslationTestCaseSchema(
        origin_text=data["origin_text"],
        language=data["language"],
        expected_translation=data["expected_translation"]
    )
    test_cases.append(tc)
# Dataset creation — the uuid4 suffix keeps dataset names unique across runs.
dataset = DatasetBuilder(client).initialize(
    account_id="account_id_placeholder",
    name=f"translation Dataset {uuid4()}",
    test_cases=test_cases
)
print(dataset)
Implement a custom translation application for the evaluation.
from scale_gp.lib.external_applications import ExternalApplication, ExternalApplicationOutputFlexible
# Fix: this snippet calls datetime.now() but did not import datetime.
from datetime import datetime

def my_translation_app(prompt, test_case):
    """Stub translation application: returns a canned translation, one trace
    span, and a sample metric for every test case."""
    start = datetime.now().replace(microsecond=5000)
    return ExternalApplicationOutputFlexible(
        generation_output={
            "generated_translation": "Sample Translation HERE"
        },
        trace_spans=[
            {
                "node_id": "formatting",
                # isoformat() already returns a str; no str() wrapper needed.
                "start_timestamp": start.isoformat(),
                "operation_input": {
                    "document": "EXAMPLE INPUT TEXT"
                },
                "operation_output": {
                    "formatted_document": "EXAMPLE INPUT TEXT FORMATTED"
                },
                "duration_ms": 1000,
            }
        ],
        metrics={"grammar": 0.5}
    )

# Initialize application
app = ExternalApplication(client)
app.initialize(application_variant_id="variant_id_placeholder", application=my_translation_app)
app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')
Create evaluation questions and setup evaluation configuration.
# Questions that human annotators answer for each generated translation.
question_requests = [
    {
        "type": "categorical",
        "title": "Test Question 1",
        "prompt": "Does the translation have punctuation issues",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}],
        "account_id": "account_id_placeholder",
    },
    # Additional questions...
]
# Register each question and collect its id for the question set.
question_ids = []
for request in question_requests:
    created = client.questions.create(**request)
    question_ids.append(created.id)
    print(created)
# Group the questions into a reusable set, then attach that set to a
# human-evaluation config.
q_set = client.question_sets.create(
    account_id="account_id_placeholder",
    name="translation question set",
    question_ids=question_ids,
)
print(q_set)
config = client.evaluation_configs.create(
    question_set_id=q_set.id,
    account_id="account_id_placeholder",
    evaluation_type='human',
)
print(config)
Set up annotation configuration and start the evaluation.
from scale_gp.types import TranslationAnnotationConfigParam
from scale_gp.lib.types import data_locator
# Annotation config: data locators tell the annotation UI where to find the
# source text, the generated translation, and the reference translation.
annotation_config_dict = TranslationAnnotationConfigParam(
    original_text_loc=data_locator.test_case_data.input["origin_text"],
    translation_loc=data_locator.test_case_output.output["generated_translation"],
    expected_translation_loc=data_locator.test_case_data.expected_output["expected_translation"],
)
# Create the human evaluation, tying together the application variant, the
# question-set config, the dataset, and the annotation layout.
evaluation = client.evaluations.create(
    account_id="account_id_placeholder",
    application_variant_id="variant_id_placeholder",
    application_spec_id="spec_id_placeholder",
    description="Demo Evaluation",
    name="Translation Evaluation",
    evaluation_config_id=config.id,
    annotation_config=annotation_config_dict,
    evaluation_dataset_id=dataset.id,
    type="builder"
)
print(evaluation)
import os
from uuid import uuid4
from datetime import datetime
from typing import List
import httpx
from scale_gp import SGPClient
from scale_gp.lib.types.translation import TranslationTestCaseSchema
from scale_gp.lib.dataset_builder import DatasetBuilder
from scale_gp.lib.external_applications import ExternalApplication, ExternalApplicationOutputFlexible
from scale_gp.types import TranslationAnnotationConfigParam
from scale_gp.lib.types import data_locator
# Initialize the client
client = SGPClient(base_url="https://api.egp.scale.com")
# Test data for translation: English source text, target language, and a
# reference translation used as the expected output.
test_data = [
    {
        "origin_text": "Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems.",
        "language": "Spanish",
        "expected_translation": "La inteligencia artificial (IA) es la simulación de procesos de inteligencia humana por máquinas, especialmente sistemas informáticos."
    },
    {
        "origin_text": "Machine learning is a subset of AI that focuses on the development of computer programs that can access data and use it to learn for themselves.",
        "language": "French",
        "expected_translation": "L'apprentissage automatique est un sous-ensemble de l'IA qui se concentre sur le développement de programmes informatiques capables d'accéder aux données et de les utiliser pour apprendre par eux-mêmes."
    },
    {
        "origin_text": "Natural Language Processing (NLP) is a branch of AI that helps computers understand, interpret, and manipulate human language.",
        "language": "German",
        "expected_translation": "Die Verarbeitung natürlicher Sprache (NLP) ist ein Zweig der KI, der Computern hilft, menschliche Sprache zu verstehen, zu interpretieren und zu manipulieren."
    },
    {
        "origin_text": "Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning.",
        "language": "Italian",
        "expected_translation": "Il deep learning fa parte di una più ampia famiglia di metodi di apprendimento automatico basati su reti neurali artificiali con apprendimento della rappresentazione."
    },
    {
        "origin_text": "Robotics is a field of engineering that involves the design, construction, and operation of robots, often incorporating AI for decision-making and task execution.",
        "language": "Portuguese",
        "expected_translation": "A robótica é um campo da engenharia que envolve o design, construção e operação de robôs, frequentemente incorporando IA para tomada de decisões e execução de tarefas."
    }
]
# Create test cases: one TranslationTestCaseSchema per entry (comprehension
# instead of a manual append loop).
test_cases = [
    TranslationTestCaseSchema(
        origin_text=data["origin_text"],
        language=data["language"],
        expected_translation=data["expected_translation"],
    )
    for data in test_data
]
# Dataset creation — the uuid4 suffix keeps dataset names unique across runs.
dataset = DatasetBuilder(client).initialize(
    account_id="account_id_placeholder",
    name=f"translation Dataset {uuid4()}",
    test_cases=test_cases
)
print(dataset)
# Define external application
def my_translation_app(prompt, test_case):
    """Stub application that returns a canned translation plus trace data."""
    print(prompt['origin_text'][:50])
    # NOTE(review): naive local time with a fixed microsecond — demo data only.
    start = datetime.now().replace(microsecond=5000)
    return ExternalApplicationOutputFlexible(
        generation_output={
            "generated_translation": "Sample Translation HERE"
        },
        trace_spans=[
            {
                "node_id": "formatting",
                # isoformat() already returns a str; redundant str() removed.
                "start_timestamp": start.isoformat(),
                "operation_input": {
                    "document": "EXAMPLE INPUT TEXT"
                },
                "operation_output": {
                    "formatted_document": "EXAMPLE INPUT TEXT FORMATTED"
                },
                "duration_ms": 1000,
            }
        ],
        metrics={"grammar": 0.5}
    )
# Initialize application and generate outputs over dataset version '1'.
app = ExternalApplication(client)
app.initialize(application_variant_id="variant_id_placeholder", application=my_translation_app)
app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')
# Evaluation setup: two categorical questions plus one free-text question.
question_requests = [
    {
        "type": "categorical",
        "title": "Test Question 1",
        "prompt": "Does the translation have punctuation issues",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}],
        "account_id": "account_id_placeholder",
    },
    {
        "type": "categorical",
        "title": "Test Question 2",
        "prompt": "Does the translation have grammatical issues",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}],
        "account_id": "account_id_placeholder",
    },
    {
        "type": "free_text",
        "title": "Test Question 3",
        "prompt": "List all translation issues",
        "account_id": "account_id_placeholder",
    }
]
# Register each question, collecting ids for the question set.
question_ids = []
for question in question_requests:
    q = client.questions.create(**question)
    question_ids.append(q.id)
    print(q)
q_set = client.question_sets.create(
    name="translation question set",
    question_ids=question_ids,
    account_id="account_id_placeholder"
)
print(q_set)
config = client.evaluation_configs.create(
    account_id="account_id_placeholder",
    question_set_id=q_set.id,
    evaluation_type='human'
)
print(config)
# Annotation config: locators point annotators at the source text, the
# generated translation, and the reference translation.
annotation_config_dict = TranslationAnnotationConfigParam(
    original_text_loc=data_locator.test_case_data.input["origin_text"],
    translation_loc=data_locator.test_case_output.output["generated_translation"],
    expected_translation_loc=data_locator.test_case_data.expected_output["expected_translation"],
)
evaluation = client.evaluations.create(
    account_id="account_id_placeholder",
    application_variant_id="variant_id_placeholder",
    application_spec_id="spec_id_placeholder",
    description="Demo Evaluation",
    name="Translation Evaluation",
    evaluation_config_id=config.id,
    annotation_config=annotation_config_dict,
    evaluation_dataset_id=dataset.id,
    type="builder"
)
print(evaluation)
EvaluationDataset(
id='32f3862e-75e1-4b69-ab08-638ae6ae3829',
account_id='f4b2a52e-29ff-4225-961e-378e23e67524',
created_at=datetime.datetime(2024, 10, 18, 0, 29, 30, 684934),
created_by_user_id='6f655fda-0492-494b-bc1d-8d02bcb42c89',
name='translation Dataset 2024-10-17 20:29:30 3926b308-d14b-41c8-a53f-7511fb906d13',
schema_type='FLEXIBLE',
updated_at=datetime.datetime(2024, 10, 18, 0, 29, 30, 684934),
archived_at=None,
evaluation_dataset_metadata=None,
knowledge_base_id=None,
out_of_date=None,
schema_sub_type=None,
vendor=None
)
import os
from uuid import uuid4
from datetime import datetime
from typing import List
import httpx
from scale_gp import SGPClient
from scale_gp.lib.types.translation import TranslationTestCaseSchema
from scale_gp.lib.dataset_builder import DatasetBuilder
from scale_gp.lib.external_applications import ExternalApplication, ExternalApplicationOutputFlexible
from scale_gp.types import TranslationAnnotationConfigParam
from scale_gp.lib.types import data_locator
# Initialize the client
client = SGPClient(base_url="https://api.egp.scale.com")
# Test data for translation: English source text, target language, and a
# reference translation used as the expected output.
test_data = [
    {
        "origin_text": "Artificial intelligence (AI) is the simulation of human intelligence processes by machines, especially computer systems.",
        "language": "Spanish",
        "expected_translation": "La inteligencia artificial (IA) es la simulación de procesos de inteligencia humana por máquinas, especialmente sistemas informáticos."
    },
    {
        "origin_text": "Machine learning is a subset of AI that focuses on the development of computer programs that can access data and use it to learn for themselves.",
        "language": "French",
        "expected_translation": "L'apprentissage automatique est un sous-ensemble de l'IA qui se concentre sur le développement de programmes informatiques capables d'accéder aux données et de les utiliser pour apprendre par eux-mêmes."
    },
    {
        "origin_text": "Natural Language Processing (NLP) is a branch of AI that helps computers understand, interpret, and manipulate human language.",
        "language": "German",
        "expected_translation": "Die Verarbeitung natürlicher Sprache (NLP) ist ein Zweig der KI, der Computern hilft, menschliche Sprache zu verstehen, zu interpretieren und zu manipulieren."
    },
    {
        "origin_text": "Deep learning is part of a broader family of machine learning methods based on artificial neural networks with representation learning.",
        "language": "Italian",
        "expected_translation": "Il deep learning fa parte di una più ampia famiglia di metodi di apprendimento automatico basati su reti neurali artificiali con apprendimento della rappresentazione."
    },
    {
        "origin_text": "Robotics is a field of engineering that involves the design, construction, and operation of robots, often incorporating AI for decision-making and task execution.",
        "language": "Portuguese",
        "expected_translation": "A robótica é um campo da engenharia que envolve o design, construção e operação de robôs, frequentemente incorporando IA para tomada de decisões e execução de tarefas."
    }
]
# Create test cases: one TranslationTestCaseSchema per entry (comprehension
# instead of a manual append loop).
test_cases = [
    TranslationTestCaseSchema(
        origin_text=data["origin_text"],
        language=data["language"],
        expected_translation=data["expected_translation"],
    )
    for data in test_data
]
# Dataset creation — the uuid4 suffix keeps dataset names unique across runs.
dataset = DatasetBuilder(client).initialize(
    account_id="account_id_placeholder",
    name=f"translation Dataset {uuid4()}",
    test_cases=test_cases
)
print(dataset)
# Define external application
def my_translation_app(prompt, test_case):
    """Stub application that returns a canned translation plus trace data."""
    print(prompt['origin_text'][:50])
    # NOTE(review): naive local time with a fixed microsecond — demo data only.
    start = datetime.now().replace(microsecond=5000)
    return ExternalApplicationOutputFlexible(
        generation_output={
            "generated_translation": "Sample Translation HERE"
        },
        trace_spans=[
            {
                "node_id": "formatting",
                # isoformat() already returns a str; redundant str() removed.
                "start_timestamp": start.isoformat(),
                "operation_input": {
                    "document": "EXAMPLE INPUT TEXT"
                },
                "operation_output": {
                    "formatted_document": "EXAMPLE INPUT TEXT FORMATTED"
                },
                "duration_ms": 1000,
            }
        ],
        metrics={"grammar": 0.5}
    )
# Initialize application and generate outputs over dataset version '1'.
app = ExternalApplication(client)
app.initialize(application_variant_id="variant_id_placeholder", application=my_translation_app)
app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')
# Evaluation setup: two categorical questions plus one free-text question.
question_requests = [
    {
        "type": "categorical",
        "title": "Test Question 1",
        "prompt": "Does the translation have punctuation issues",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}],
        "account_id": "account_id_placeholder",
    },
    {
        "type": "categorical",
        "title": "Test Question 2",
        "prompt": "Does the translation have grammatical issues",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}],
        "account_id": "account_id_placeholder",
    },
    {
        "type": "free_text",
        "title": "Test Question 3",
        "prompt": "List all translation issues",
        "account_id": "account_id_placeholder",
    }
]
# Register each question, collecting ids for the question set.
question_ids = []
for question in question_requests:
    q = client.questions.create(**question)
    question_ids.append(q.id)
    print(q)
q_set = client.question_sets.create(
    name="translation question set",
    question_ids=question_ids,
    account_id="account_id_placeholder"
)
print(q_set)
config = client.evaluation_configs.create(
    account_id="account_id_placeholder",
    question_set_id=q_set.id,
    evaluation_type='human'
)
print(config)
# Annotation config: locators point annotators at the source text, the
# generated translation, and the reference translation.
annotation_config_dict = TranslationAnnotationConfigParam(
    original_text_loc=data_locator.test_case_data.input["origin_text"],
    translation_loc=data_locator.test_case_output.output["generated_translation"],
    expected_translation_loc=data_locator.test_case_data.expected_output["expected_translation"],
)
evaluation = client.evaluations.create(
    account_id="account_id_placeholder",
    application_variant_id="variant_id_placeholder",
    application_spec_id="spec_id_placeholder",
    description="Demo Evaluation",
    name="Translation Evaluation",
    evaluation_config_id=config.id,
    annotation_config=annotation_config_dict,
    evaluation_dataset_id=dataset.id,
    type="builder"
)
print(evaluation)
EvaluationDataset(
id='32f3862e-75e1-4b69-ab08-638ae6ae3829',
account_id='f4b2a52e-29ff-4225-961e-378e23e67524',
created_at=datetime.datetime(2024, 10, 18, 0, 29, 30, 684934),
created_by_user_id='6f655fda-0492-494b-bc1d-8d02bcb42c89',
name='translation Dataset 2024-10-17 20:29:30 3926b308-d14b-41c8-a53f-7511fb906d13',
schema_type='FLEXIBLE',
updated_at=datetime.datetime(2024, 10, 18, 0, 29, 30, 684934),
archived_at=None,
evaluation_dataset_metadata=None,
knowledge_base_id=None,
out_of_date=None,
schema_sub_type=None,
vendor=None
)