Generate an Application Report Card
Generate a report card for an application variant
https://pypi.org/project/scale-gp/
# Prerequisite: pip install -U scale-gp
import os
import time
from typing import List
from scale_gp import SGPClient
from scale_gp.types.application_edge_param import ApplicationEdgeParam
from scale_gp.types.application_node_param import ApplicationNodeParam
from scale_gp.types.model_template_create_params import (
VendorConfiguration,
VendorConfigurationBundleConfig,
VendorConfigurationEndpointConfig,
)
from scale_gp.types.application_configuration_param import ApplicationConfigurationParam
from scale_gp.types.evaluation_datasets.test_case_batch_params import Item
from scale_gp.types.evaluation_datasets.test_case_create_params import TestCaseData
Fetch your API Key from: https://gp.scale.com/admin/api-key
Fetch your Account ID from: https://gp.scale.com/admin/accounts
All resources you interact with using this client will belong to this account.
Note: If you are using your own VPC-deployed version of Scale GP, you will have a different endpoint_url. For users of our multi-tenant platform, use https://gp.scale.com
# Credentials are read from the environment so the script never references
# undefined names. SGP_API_KEY / SGP_ACCOUNT_ID are the variable names this
# script expects; export them before running.
api_key = os.environ.get("SGP_API_KEY")
account_id = os.environ.get("SGP_ACCOUNT_ID")
if not api_key or not account_id:
    # Fail early with an actionable message rather than a NameError or an
    # authentication error further down the script.
    raise RuntimeError(
        "Set the SGP_API_KEY and SGP_ACCOUNT_ID environment variables before running this script."
    )

# Every resource created below is requested with this account_id.
client = SGPClient(api_key=api_key)
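First, follow the instructions in the "Create Completion Application" recipe to create a sample application variant. The snippet below assumes that `model_instance` and `ApplicationBuilder` from that recipe are already in scope.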
# Deploy the existing model instance. `model_instance` is not created in this
# snippet and must already be in scope.
model_deployment = client.models.deployments.create(
    account_id=account_id,
    name="Gemini-Pro Deployment",
    model_instance_id=model_instance.id,
)

# Create a completion application that points at the deployment, then keep the
# id of the resulting application variant for the evaluation and report steps.
builder = ApplicationBuilder(client)
builder.create_completion_application(
    account_id,
    "test-app-builder",
    model_deployment.id,
)
variant_id = builder.application.variant_id
print("Created application variant with id: ", variant_id)
Follow the manual evaluation dataset recipe to create a sample evaluation dataset
# Create the evaluation dataset that the test cases below are uploaded into.
manual_evaluation_dataset = client.evaluation_datasets.create(
    name="manual_evaluation_dataset",
    type="manual",
    schema_type="GENERATION",
    account_id=account_id,
)
print("Created manual dataset: ", manual_evaluation_dataset)
# Raw question/answer material for the sample dataset, one row per test case:
# (prompt, expected answer, supporting background text).
_QA_ROWS = (
    (
        "What is the capital of France?",
        "The capital of France is Paris.",
        "Paris, located in the northern part of France along the Seine River, is not only the country’s capital but also its largest city. Known for its art, fashion, gastronomy, and culture, Paris has a significant influence both in France and globally.",
    ),
    (
        "What color is an apple?",
        "An apple is typically red when ripe, though green and yellow varieties also exist depending on the species and maturity.",
        "Apples come in various colors including red, green, and yellow. Red apples are one of the most common, with varieties like Red Delicious being very popular. Green apples, like Granny Smith, are tart and used often in baking. Yellow apples such as Golden Delicious are sweet and softer.",
    ),
    (
        "Who is the first president of the USA?",
        "The first president of the USA is George Washington.",
        "George Washington served as the first president of the United States from 1789 to 1797. He is a pivotal figure in American history, recognized for his leadership during the Revolutionary War and setting many precedents for the national government.",
    ),
)

# Test cases in the dict shape used for upload: the prompt, the expected
# answer, and a string-typed block of extra reference information.
DATASET: List[TestCaseData] = [
    {
        "input": question,
        "expected_output": answer,
        "expected_extra_info": {
            "info": background,
            "schema_type": "STRING",
        },
    }
    for question, answer, background in _QA_ROWS
]
# Wrap each raw test case together with the owning account id so the whole
# dataset can be sent in one batch request.
items: List[Item] = [
    Item({"account_id": account_id, "test_case_data": test_case_data})
    for test_case_data in DATASET
]

# Upload every test case with a single batch call.
uploaded_test_cases = client.evaluation_datasets.test_cases.batch(
    evaluation_dataset_id=manual_evaluation_dataset.id,
    items=items,
)

# Keep the created records in a plain list. This is deliberately not annotated
# with `TestCase`: that name is never imported in this file, and module-level
# variable annotations are evaluated at runtime, so the annotation would raise
# NameError.
test_cases = list(uploaded_test_cases)

# Publish the dataset once its test cases have been uploaded.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=manual_evaluation_dataset.id,
)
Create an evaluation
# Create an evaluation of the application variant against the published dataset.
# NOTE(review): `spec_id` and `evaluation_config` are not defined in this
# snippet; they must already be in scope -- confirm where they come from.
evaluation = client.evaluations.create(
    name="Report card evaluation",
    description="description",
    type="builder",
    account_id=account_id,
    application_spec_id=spec_id,
    application_variant_id=variant_id,
    evaluation_dataset_id=manual_evaluation_dataset.id,
    evaluation_config_id=evaluation_config.id,
)
A report card will provide a summary of the performance of the application variant against an evaluation dataset. Three main scores will be produced: Accuracy, Quality, and Trust & Safety. The performance in each category will contribute to an overall Scale Confidence Score for the variant.
You can view generated report cards in the UI at: https://egp.dashboard.scale.com/applications/{application_spec_id}/{application_variant_id}/report-card/overview
# Request a report card for the variant, scored against the manual dataset.
create_response = client.application_variant_reports.create(
    account_id=account_id,
    application_variant_id=variant_id,
    evaluation_dataset_ids=[manual_evaluation_dataset.id],
)

# Fetch the report that was just requested, including its async jobs view.
# If retrieved immediately, the report will still be PENDING.
application_variant_report = client.application_variant_reports.retrieve(
    view=["AsyncJobs"],
    application_variant_report_id=create_response.id,
)

print("Generated application variant report:")
print(application_variant_report)