Use Case Evaluations
Summarization Evaluation
Create and evaluate a summarization application
Follow the instructions in the Quickstart Guide to set up the SGP client.
from scale_gp import SGPClient
client = SGPClient(base_url="https://api.egp.scale.com")
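If you prefer to configure credentials explicitly rather than through the environment, you can pass the API key when constructing the client. A minimal sketch, assuming the constructor accepts an api_key argument and that the key is stored in an SGP_API_KEY environment variable (both are assumptions; check your SDK version):
import os

from scale_gp import SGPClient

client = SGPClient(
    api_key=os.environ["SGP_API_KEY"],  # assumed variable name
    base_url="https://api.egp.scale.com",
)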
Define and upload summarization test cases into the dataset
document_data = [
    {
        "document": "The Industrial Revolution, which took place from the 18th to 19th centuries, was a period ... technological advancements of this period laid the groundwork for future innovations and economic growth.",
        "expected_summary": "The Industrial Revolution was a transformative period from the 18th to 19th centuries, marked ..."
    },
    {
        "document": "Quantum computing is an area of computing focused on developing computer technology ... significant investment and research continue in this potentially revolutionary technology.",
        "expected_summary": "Quantum computing is an emerging field that uses quantum mechanics principles to process ..."
    },
    {
        "document": "Climate change refers to long-term shifts in global weather patterns and average temperatures ... addressing climate change will require sustained effort and collaboration at all levels of society.",
        "expected_summary": "Climate change is a global phenomenon primarily driven by human activities, especially the ..."
    }
]
# Import path for the test case schema may vary by SDK version.
from scale_gp.lib.types import SummarizationTestCaseSchema

# Repeat the three documents ten times to build a 30-item dataset.
test_cases = []
for data in document_data * 10:
    tc = SummarizationTestCaseSchema(
        document=data["document"],
        expected_summary=data["expected_summary"]
    )
    test_cases.append(tc)
    print(tc)
Initialize the dataset using the DatasetBuilder
from datetime import datetime
from uuid import uuid4

# Import path for DatasetBuilder may vary by SDK version.
from scale_gp.lib.dataset_builder import DatasetBuilder

def timestamp():
    # Unique suffix so repeated runs create distinctly named datasets.
    return f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {uuid4()}"

dataset = DatasetBuilder(client).initialize(
    account_id="your_account_id",
    name=f"Summarization Dataset {timestamp()}",
    test_cases=test_cases
)
print(dataset)
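To confirm the upload, you can list the test cases back from the dataset. A sketch assuming the SDK exposes a nested test_cases resource on evaluation datasets (verify the exact method against your SDK version):
# Assumed resource path; confirm `evaluation_datasets.test_cases.list`
# exists in your SDK version before relying on it.
for test_case in client.evaluation_datasets.test_cases.list(
    evaluation_dataset_id=dataset.id
):
    print(test_case.id)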
Implement and run your summarization application
# Import path may vary by SDK version.
from scale_gp.lib.external_applications import (
    ExternalApplication,
    ExternalApplicationOutputFlexible,
)

def my_summarization_app(prompt, test_case):
    # `prompt` carries the test case input; a real application would
    # summarize prompt["document"] instead of returning a placeholder.
    print(prompt["document"][:50])
    start = datetime.now()
    return ExternalApplicationOutputFlexible(
        generation_output={
            "generated_summary": "GENERATED OUTPUT SUMMARY"
        },
        # Trace spans record intermediate steps for later inspection.
        trace_spans=[
            {
                "node_id": "formatting",
                "start_timestamp": start.isoformat(),
                "operation_input": {
                    "document": "EXAMPLE INPUT DOCUMENT"
                },
                "operation_output": {
                    "formatted_document": "EXAMPLE OUTPUT DOCUMENT FORMATTED"
                },
                "duration_ms": 1000,
            }
        ],
        # Application-reported metrics, keyed by name.
        metrics={"grammar": 0.5}
    )
app = ExternalApplication(client)
app.initialize(application_variant_id="your_variant_id", application=my_summarization_app)
app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')
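The placeholder above returns a canned summary; in a real application, generated_summary would come from a model call. A minimal sketch of the same wrapper around an actual summarizer, where call_llm is a hypothetical helper standing in for whatever inference client you use, and assuming trace_spans and metrics are optional:
def call_llm(text):
    # Hypothetical helper: replace with a call to your actual model.
    raise NotImplementedError

def my_summarization_app(prompt, test_case):
    summary = call_llm(f"Summarize the following document:\n\n{prompt['document']}")
    return ExternalApplicationOutputFlexible(
        generation_output={"generated_summary": summary},
    )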
Create questions to be asked during evaluation
question_requests = [
    {
        "type": "categorical",
        "title": "Test Question 1",
        "prompt": "Test Prompt",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}]
    },
    {
        "type": "categorical",
        "title": "Test Question 2",
        "prompt": "Was the summary concise?",
        "choices": [{"label": "No", "value": 0}, {"label": "Yes", "value": 1}]
    },
    {
        "type": "free_text",
        "title": "Test Question 3",
        "prompt": "List relevant information the summary cut out"
    }
]
question_ids = []
for question in question_requests:
    q = client.questions.create(
        account_id="your_account_id",
        **question
    )
    question_ids.append(q.id)
    print(q)
Organize questions into a set and create an evaluation configuration
q_set = client.question_sets.create(
    account_id="your_account_id",
    name="summarization question set",
    question_ids=question_ids
)
print(q_set)

config = client.evaluation_configs.create(
    account_id="your_account_id",
    question_set_id=q_set.id,
    evaluation_type='human'
)
print(config)
Set up the annotation configuration and create the evaluation
from scale_gp.lib.types import data_locator
from scale_gp.types import SummarizationAnnotationConfigParam

# Data locators tell annotators where to find each field: the input
# document, the application's generated summary, and the expected summary.
annotation_config_dict = SummarizationAnnotationConfigParam(
    document_loc=data_locator.test_case_data.input["document"],
    summary_loc=data_locator.test_case_output.output["generated_summary"],
    expected_summary_loc=data_locator.test_case_data.expected_output["expected_summary"]
)

evaluation = client.evaluations.create(
    account_id="your_account_id",
    application_variant_id="your_variant_id",
    application_spec_id="your_spec_id",
    description="Demo Evaluation",
    name="Summarization Evaluation",
    evaluation_config_id=config.id,
    annotation_config=annotation_config_dict,
    evaluation_dataset_id=dataset.id,
    type="builder"
)
print(evaluation)
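Once annotators have worked through the queued tasks, you can fetch the evaluation to check on progress. A minimal sketch, assuming the resource exposes a standard retrieve method and a status field (verify both against your SDK version):
evaluation = client.evaluations.retrieve(evaluation.id)
print(evaluation.status)  # assumed field name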