Create an evaluation dataset and autogenerate test cases based on a list of harms. Harms are negative or undesired topics that the model should not generate content about, or should handle appropriately. Advanced configs for emotions, moods, methods, and tones can be provided to steer how test cases are generated.
# Create a safety evaluation dataset, autogenerate draft test cases from a
# harms list, approve every draft, and publish the dataset for evaluations.
import os
import time

from scale_gp import SGPClient

# NOTE(review): api_key and account_id are assumed to be defined by the
# surrounding docs/environment — confirm before running standalone.
client = SGPClient(api_key=api_key)

# 1. Create the dataset. type="safety" requires a harms_list; the optional
#    advanced_config further steers test-case generation.
safety_evaluation_dataset = client.evaluation_datasets.create(
    account_id=account_id,
    name="safety_evaluation_dataset_test",
    schema_type="GENERATION",
    type="safety",
    harms_list=["toxicity", "profanity"],
    advanced_config={"emotions": ["angry", "sad", "grumpy"]},
)
print(safety_evaluation_dataset)

# 2. Start the generation job that produces the draft test cases.
generation_job = client.evaluation_datasets.generation_jobs.create(
    evaluation_dataset_id=safety_evaluation_dataset.id,
    num_test_cases=3,
    group_by_artifact_id=False,
    harms_list=["toxicity", "profanity"],
    advanced_config={
        "emotions": ["angry", "sad", "grumpy"],
        "moods": ["Indicative", "Imperative", "Interrogative"],
        "methods": [
            "Asking obliquely with obscure slang or niche cultural references",
            "Making a leading statement that tries to trigger completion",
            "Framing as part of a fictional story",
        ],
        "tones": ["Formal", "Normal", "Drunk Text"],
    },
)

# 3. Poll every 5s until the job leaves "Pending" (any other status —
#    e.g. completed or failed — exits the loop).
while True:
    generation_job = client.evaluation_datasets.generation_jobs.retrieve(
        generation_job_id=generation_job.generation_job_id,
        evaluation_dataset_id=safety_evaluation_dataset.id,
    )
    if generation_job.status == "Pending":
        print("generating test cases...")
        time.sleep(5)
    else:
        break
print(generation_job)

# 4. View the autogenerated draft test cases.
test_cases = client.evaluation_datasets.autogenerated_draft_test_cases.list(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
print(test_cases.items)  # fixed: was "itmes", an AttributeError at runtime

# 5. Approve every draft; publishing is blocked until all drafts are reviewed.
for test_case in test_cases.items:
    client.evaluation_datasets.autogenerated_draft_test_cases.approve(
        evaluation_dataset_id=safety_evaluation_dataset.id,
        autogenerated_draft_test_case_id=test_case.id,
    )

# 6. Publish the dataset so it can be used in evaluations.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
EvaluationDataset(
id='6cd3aa30-57e7-4cb4-9315-d90ed487442d',
account_id='66049ada2fc77c99ef015be7',
created_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
created_by_user_id='42a5c8af-f698-43d0-923e-ba70102a2887',
draft=None,
name='safety_evaluation_dataset_test',
schema_type='GENERATION',
updated_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
archived_at=None,
evaluation_dataset_metadata={
'harms_list': [
'toxicity',
'profanity'
]
},
knowledge_base_id=None,
out_of_date=None,
vendor=None
)
1. Instantiate Client
Follow the instructions in the Quickstart Guide to set up the SGP Client
Fetch a Knowledge Base ID from: https://egp.dashboard.scale.com/knowledge-bases
# Instantiate the SGP client.
# NOTE(review): api_key is assumed to be defined beforehand (e.g. loaded
# from the environment) — see the Quickstart Guide.
from scale_gp import SGPClient

client = SGPClient(api_key=api_key)
2. Create safety dataset
For safety evaluation datasets, a generation job workflow is created to generate test cases. You must additionally define a harms list, which includes topics that you want to verify your application or model handles properly. Advanced configs for emotions, moods, methods, and tones can also be provided.
# Create the safety evaluation dataset. type="safety" requires a harms_list;
# the optional advanced_config steers test-case generation (here: emotions).
safety_evaluation_dataset = client.evaluation_datasets.create(
    account_id=account_id,
    name="safety_evaluation_dataset_test",
    schema_type="GENERATION",
    type="safety",
    harms_list=["toxicity", "profanity"],
    advanced_config={"emotions": ["angry", "sad", "grumpy"]},
)
3. Start generation job
Start the generation job. This job will generate test cases based on the provided harms list and advanced configs.
# Start the generation job that autogenerates num_test_cases draft test
# cases, steered by the harms list and advanced_config dimensions.
generation_job = client.evaluation_datasets.generation_jobs.create(
    evaluation_dataset_id=safety_evaluation_dataset.id,
    num_test_cases=3,
    group_by_artifact_id=False,
    harms_list=["toxicity", "profanity"],
    advanced_config={
        "emotions": ["angry", "sad", "grumpy"],
        "moods": ["Indicative", "Imperative", "Interrogative"],
        "methods": [
            "Asking obliquely with obscure slang or niche cultural references",
            "Making a leading statement that tries to trigger completion",
            "Framing as part of a fictional story",
        ],
        "tones": ["Formal", "Normal", "Drunk Text"],
    },
)
# Poll every 5s until the job leaves "Pending" (any other status —
# e.g. completed or failed — exits the loop).
while True:
    generation_job = client.evaluation_datasets.generation_jobs.retrieve(
        generation_job_id=generation_job.generation_job_id,
        evaluation_dataset_id=safety_evaluation_dataset.id
    )
    if generation_job.status == "Pending":
        print("generating test cases...")
        time.sleep(5)
    else:
        break
# view autogenerated test cases
test_cases = client.evaluation_datasets.autogenerated_draft_test_cases.list(
    evaluation_dataset_id=safety_evaluation_dataset.id
)
4. Approve auto-generated test cases
Before publishing the dataset, review the auto-generated test cases and approve/decline each test case. Publishing is blocked until all test cases are reviewed.
# Approve every autogenerated draft; publishing is blocked until all
# drafts have been reviewed (approved or declined).
for test_case in test_cases.items:
    client.evaluation_datasets.autogenerated_draft_test_cases.approve(
        evaluation_dataset_id=safety_evaluation_dataset.id,
        autogenerated_draft_test_case_id=test_case.id,
    )
5. Publish the dataset
Publishing the dataset allows it to be available for use in evaluations
# Publish the reviewed dataset so it becomes available for evaluations.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
# Create a safety evaluation dataset, autogenerate draft test cases from a
# harms list, approve every draft, and publish the dataset for evaluations.
import os
import time

from scale_gp import SGPClient

# NOTE(review): api_key and account_id are assumed to be defined by the
# surrounding docs/environment — confirm before running standalone.
client = SGPClient(api_key=api_key)

# 1. Create the dataset. type="safety" requires a harms_list; the optional
#    advanced_config further steers test-case generation.
safety_evaluation_dataset = client.evaluation_datasets.create(
    account_id=account_id,
    name="safety_evaluation_dataset_test",
    schema_type="GENERATION",
    type="safety",
    harms_list=["toxicity", "profanity"],
    advanced_config={"emotions": ["angry", "sad", "grumpy"]},
)
print(safety_evaluation_dataset)

# 2. Start the generation job that produces the draft test cases.
generation_job = client.evaluation_datasets.generation_jobs.create(
    evaluation_dataset_id=safety_evaluation_dataset.id,
    num_test_cases=3,
    group_by_artifact_id=False,
    harms_list=["toxicity", "profanity"],
    advanced_config={
        "emotions": ["angry", "sad", "grumpy"],
        "moods": ["Indicative", "Imperative", "Interrogative"],
        "methods": [
            "Asking obliquely with obscure slang or niche cultural references",
            "Making a leading statement that tries to trigger completion",
            "Framing as part of a fictional story",
        ],
        "tones": ["Formal", "Normal", "Drunk Text"],
    },
)

# 3. Poll every 5s until the job leaves "Pending" (any other status —
#    e.g. completed or failed — exits the loop).
while True:
    generation_job = client.evaluation_datasets.generation_jobs.retrieve(
        generation_job_id=generation_job.generation_job_id,
        evaluation_dataset_id=safety_evaluation_dataset.id,
    )
    if generation_job.status == "Pending":
        print("generating test cases...")
        time.sleep(5)
    else:
        break
print(generation_job)

# 4. View the autogenerated draft test cases.
test_cases = client.evaluation_datasets.autogenerated_draft_test_cases.list(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
print(test_cases.items)  # fixed: was "itmes", an AttributeError at runtime

# 5. Approve every draft; publishing is blocked until all drafts are reviewed.
for test_case in test_cases.items:
    client.evaluation_datasets.autogenerated_draft_test_cases.approve(
        evaluation_dataset_id=safety_evaluation_dataset.id,
        autogenerated_draft_test_case_id=test_case.id,
    )

# 6. Publish the dataset so it can be used in evaluations.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
EvaluationDataset(
id='6cd3aa30-57e7-4cb4-9315-d90ed487442d',
account_id='66049ada2fc77c99ef015be7',
created_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
created_by_user_id='42a5c8af-f698-43d0-923e-ba70102a2887',
draft=None,
name='safety_evaluation_dataset_test',
schema_type='GENERATION',
updated_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
archived_at=None,
evaluation_dataset_metadata={
'harms_list': [
'toxicity',
'profanity'
]
},
knowledge_base_id=None,
out_of_date=None,
vendor=None
)
# Create a safety evaluation dataset, autogenerate draft test cases from a
# harms list, approve every draft, and publish the dataset for evaluations.
import os
import time

from scale_gp import SGPClient

# NOTE(review): api_key and account_id are assumed to be defined by the
# surrounding docs/environment — confirm before running standalone.
client = SGPClient(api_key=api_key)

# 1. Create the dataset. type="safety" requires a harms_list; the optional
#    advanced_config further steers test-case generation.
safety_evaluation_dataset = client.evaluation_datasets.create(
    account_id=account_id,
    name="safety_evaluation_dataset_test",
    schema_type="GENERATION",
    type="safety",
    harms_list=["toxicity", "profanity"],
    advanced_config={"emotions": ["angry", "sad", "grumpy"]},
)
print(safety_evaluation_dataset)

# 2. Start the generation job that produces the draft test cases.
generation_job = client.evaluation_datasets.generation_jobs.create(
    evaluation_dataset_id=safety_evaluation_dataset.id,
    num_test_cases=3,
    group_by_artifact_id=False,
    harms_list=["toxicity", "profanity"],
    advanced_config={
        "emotions": ["angry", "sad", "grumpy"],
        "moods": ["Indicative", "Imperative", "Interrogative"],
        "methods": [
            "Asking obliquely with obscure slang or niche cultural references",
            "Making a leading statement that tries to trigger completion",
            "Framing as part of a fictional story",
        ],
        "tones": ["Formal", "Normal", "Drunk Text"],
    },
)

# 3. Poll every 5s until the job leaves "Pending" (any other status —
#    e.g. completed or failed — exits the loop).
while True:
    generation_job = client.evaluation_datasets.generation_jobs.retrieve(
        generation_job_id=generation_job.generation_job_id,
        evaluation_dataset_id=safety_evaluation_dataset.id,
    )
    if generation_job.status == "Pending":
        print("generating test cases...")
        time.sleep(5)
    else:
        break
print(generation_job)

# 4. View the autogenerated draft test cases.
test_cases = client.evaluation_datasets.autogenerated_draft_test_cases.list(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
print(test_cases.items)  # fixed: was "itmes", an AttributeError at runtime

# 5. Approve every draft; publishing is blocked until all drafts are reviewed.
for test_case in test_cases.items:
    client.evaluation_datasets.autogenerated_draft_test_cases.approve(
        evaluation_dataset_id=safety_evaluation_dataset.id,
        autogenerated_draft_test_case_id=test_case.id,
    )

# 6. Publish the dataset so it can be used in evaluations.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
EvaluationDataset(
id='6cd3aa30-57e7-4cb4-9315-d90ed487442d',
account_id='66049ada2fc77c99ef015be7',
created_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
created_by_user_id='42a5c8af-f698-43d0-923e-ba70102a2887',
draft=None,
name='safety_evaluation_dataset_test',
schema_type='GENERATION',
updated_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
archived_at=None,
evaluation_dataset_metadata={
'harms_list': [
'toxicity',
'profanity'
]
},
knowledge_base_id=None,
out_of_date=None,
vendor=None
)
Create an evaluation dataset and autogenerate test cases based on a list of harms. Harms are negative or undesired topics that the model should not generate content about, or should handle appropriately. Advanced configs for emotions, moods, methods, and tones can be provided to steer how test cases are generated.
# Create a safety evaluation dataset, autogenerate draft test cases from a
# harms list, approve every draft, and publish the dataset for evaluations.
import os
import time

from scale_gp import SGPClient

# NOTE(review): api_key and account_id are assumed to be defined by the
# surrounding docs/environment — confirm before running standalone.
client = SGPClient(api_key=api_key)

# 1. Create the dataset. type="safety" requires a harms_list; the optional
#    advanced_config further steers test-case generation.
safety_evaluation_dataset = client.evaluation_datasets.create(
    account_id=account_id,
    name="safety_evaluation_dataset_test",
    schema_type="GENERATION",
    type="safety",
    harms_list=["toxicity", "profanity"],
    advanced_config={"emotions": ["angry", "sad", "grumpy"]},
)
print(safety_evaluation_dataset)

# 2. Start the generation job that produces the draft test cases.
generation_job = client.evaluation_datasets.generation_jobs.create(
    evaluation_dataset_id=safety_evaluation_dataset.id,
    num_test_cases=3,
    group_by_artifact_id=False,
    harms_list=["toxicity", "profanity"],
    advanced_config={
        "emotions": ["angry", "sad", "grumpy"],
        "moods": ["Indicative", "Imperative", "Interrogative"],
        "methods": [
            "Asking obliquely with obscure slang or niche cultural references",
            "Making a leading statement that tries to trigger completion",
            "Framing as part of a fictional story",
        ],
        "tones": ["Formal", "Normal", "Drunk Text"],
    },
)

# 3. Poll every 5s until the job leaves "Pending" (any other status —
#    e.g. completed or failed — exits the loop).
while True:
    generation_job = client.evaluation_datasets.generation_jobs.retrieve(
        generation_job_id=generation_job.generation_job_id,
        evaluation_dataset_id=safety_evaluation_dataset.id,
    )
    if generation_job.status == "Pending":
        print("generating test cases...")
        time.sleep(5)
    else:
        break
print(generation_job)

# 4. View the autogenerated draft test cases.
test_cases = client.evaluation_datasets.autogenerated_draft_test_cases.list(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
print(test_cases.items)  # fixed: was "itmes", an AttributeError at runtime

# 5. Approve every draft; publishing is blocked until all drafts are reviewed.
for test_case in test_cases.items:
    client.evaluation_datasets.autogenerated_draft_test_cases.approve(
        evaluation_dataset_id=safety_evaluation_dataset.id,
        autogenerated_draft_test_case_id=test_case.id,
    )

# 6. Publish the dataset so it can be used in evaluations.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
EvaluationDataset(
id='6cd3aa30-57e7-4cb4-9315-d90ed487442d',
account_id='66049ada2fc77c99ef015be7',
created_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
created_by_user_id='42a5c8af-f698-43d0-923e-ba70102a2887',
draft=None,
name='safety_evaluation_dataset_test',
schema_type='GENERATION',
updated_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
archived_at=None,
evaluation_dataset_metadata={
'harms_list': [
'toxicity',
'profanity'
]
},
knowledge_base_id=None,
out_of_date=None,
vendor=None
)
1. Instantiate Client
Follow the instructions in the Quickstart Guide to set up the SGP Client
Fetch a Knowledge Base ID from: https://egp.dashboard.scale.com/knowledge-bases
# Instantiate the SGP client.
# NOTE(review): api_key is assumed to be defined beforehand (e.g. loaded
# from the environment) — see the Quickstart Guide.
from scale_gp import SGPClient

client = SGPClient(api_key=api_key)
2. Create safety dataset
For safety evaluation datasets, a generation job workflow is created to generate test cases. You must additionally define a harms list, which includes topics that you want to verify your application or model handles properly. Advanced configs for emotions, moods, methods, and tones can also be provided.
# Create the safety evaluation dataset. type="safety" requires a harms_list;
# the optional advanced_config steers test-case generation (here: emotions).
safety_evaluation_dataset = client.evaluation_datasets.create(
    account_id=account_id,
    name="safety_evaluation_dataset_test",
    schema_type="GENERATION",
    type="safety",
    harms_list=["toxicity", "profanity"],
    advanced_config={"emotions": ["angry", "sad", "grumpy"]},
)
3. Start generation job
Start the generation job. This job will generate test cases based on the provided harms list and advanced configs.
# Start the generation job that autogenerates num_test_cases draft test
# cases, steered by the harms list and advanced_config dimensions.
generation_job = client.evaluation_datasets.generation_jobs.create(
    evaluation_dataset_id=safety_evaluation_dataset.id,
    num_test_cases=3,
    group_by_artifact_id=False,
    harms_list=["toxicity", "profanity"],
    advanced_config={
        "emotions": ["angry", "sad", "grumpy"],
        "moods": ["Indicative", "Imperative", "Interrogative"],
        "methods": [
            "Asking obliquely with obscure slang or niche cultural references",
            "Making a leading statement that tries to trigger completion",
            "Framing as part of a fictional story",
        ],
        "tones": ["Formal", "Normal", "Drunk Text"],
    },
)
# Poll every 5s until the job leaves "Pending" (any other status —
# e.g. completed or failed — exits the loop).
while True:
    generation_job = client.evaluation_datasets.generation_jobs.retrieve(
        generation_job_id=generation_job.generation_job_id,
        evaluation_dataset_id=safety_evaluation_dataset.id
    )
    if generation_job.status == "Pending":
        print("generating test cases...")
        time.sleep(5)
    else:
        break
# view autogenerated test cases
test_cases = client.evaluation_datasets.autogenerated_draft_test_cases.list(
    evaluation_dataset_id=safety_evaluation_dataset.id
)
4. Approve auto-generated test cases
Before publishing the dataset, review the auto-generated test cases and approve/decline each test case. Publishing is blocked until all test cases are reviewed.
# Approve every autogenerated draft; publishing is blocked until all
# drafts have been reviewed (approved or declined).
for test_case in test_cases.items:
    client.evaluation_datasets.autogenerated_draft_test_cases.approve(
        evaluation_dataset_id=safety_evaluation_dataset.id,
        autogenerated_draft_test_case_id=test_case.id,
    )
5. Publish the dataset
Publishing the dataset allows it to be available for use in evaluations
# Publish the reviewed dataset so it becomes available for evaluations.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
# Create a safety evaluation dataset, autogenerate draft test cases from a
# harms list, approve every draft, and publish the dataset for evaluations.
import os
import time

from scale_gp import SGPClient

# NOTE(review): api_key and account_id are assumed to be defined by the
# surrounding docs/environment — confirm before running standalone.
client = SGPClient(api_key=api_key)

# 1. Create the dataset. type="safety" requires a harms_list; the optional
#    advanced_config further steers test-case generation.
safety_evaluation_dataset = client.evaluation_datasets.create(
    account_id=account_id,
    name="safety_evaluation_dataset_test",
    schema_type="GENERATION",
    type="safety",
    harms_list=["toxicity", "profanity"],
    advanced_config={"emotions": ["angry", "sad", "grumpy"]},
)
print(safety_evaluation_dataset)

# 2. Start the generation job that produces the draft test cases.
generation_job = client.evaluation_datasets.generation_jobs.create(
    evaluation_dataset_id=safety_evaluation_dataset.id,
    num_test_cases=3,
    group_by_artifact_id=False,
    harms_list=["toxicity", "profanity"],
    advanced_config={
        "emotions": ["angry", "sad", "grumpy"],
        "moods": ["Indicative", "Imperative", "Interrogative"],
        "methods": [
            "Asking obliquely with obscure slang or niche cultural references",
            "Making a leading statement that tries to trigger completion",
            "Framing as part of a fictional story",
        ],
        "tones": ["Formal", "Normal", "Drunk Text"],
    },
)

# 3. Poll every 5s until the job leaves "Pending" (any other status —
#    e.g. completed or failed — exits the loop).
while True:
    generation_job = client.evaluation_datasets.generation_jobs.retrieve(
        generation_job_id=generation_job.generation_job_id,
        evaluation_dataset_id=safety_evaluation_dataset.id,
    )
    if generation_job.status == "Pending":
        print("generating test cases...")
        time.sleep(5)
    else:
        break
print(generation_job)

# 4. View the autogenerated draft test cases.
test_cases = client.evaluation_datasets.autogenerated_draft_test_cases.list(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
print(test_cases.items)  # fixed: was "itmes", an AttributeError at runtime

# 5. Approve every draft; publishing is blocked until all drafts are reviewed.
for test_case in test_cases.items:
    client.evaluation_datasets.autogenerated_draft_test_cases.approve(
        evaluation_dataset_id=safety_evaluation_dataset.id,
        autogenerated_draft_test_case_id=test_case.id,
    )

# 6. Publish the dataset so it can be used in evaluations.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
EvaluationDataset(
id='6cd3aa30-57e7-4cb4-9315-d90ed487442d',
account_id='66049ada2fc77c99ef015be7',
created_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
created_by_user_id='42a5c8af-f698-43d0-923e-ba70102a2887',
draft=None,
name='safety_evaluation_dataset_test',
schema_type='GENERATION',
updated_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
archived_at=None,
evaluation_dataset_metadata={
'harms_list': [
'toxicity',
'profanity'
]
},
knowledge_base_id=None,
out_of_date=None,
vendor=None
)
# Create a safety evaluation dataset, autogenerate draft test cases from a
# harms list, approve every draft, and publish the dataset for evaluations.
import os
import time

from scale_gp import SGPClient

# NOTE(review): api_key and account_id are assumed to be defined by the
# surrounding docs/environment — confirm before running standalone.
client = SGPClient(api_key=api_key)

# 1. Create the dataset. type="safety" requires a harms_list; the optional
#    advanced_config further steers test-case generation.
safety_evaluation_dataset = client.evaluation_datasets.create(
    account_id=account_id,
    name="safety_evaluation_dataset_test",
    schema_type="GENERATION",
    type="safety",
    harms_list=["toxicity", "profanity"],
    advanced_config={"emotions": ["angry", "sad", "grumpy"]},
)
print(safety_evaluation_dataset)

# 2. Start the generation job that produces the draft test cases.
generation_job = client.evaluation_datasets.generation_jobs.create(
    evaluation_dataset_id=safety_evaluation_dataset.id,
    num_test_cases=3,
    group_by_artifact_id=False,
    harms_list=["toxicity", "profanity"],
    advanced_config={
        "emotions": ["angry", "sad", "grumpy"],
        "moods": ["Indicative", "Imperative", "Interrogative"],
        "methods": [
            "Asking obliquely with obscure slang or niche cultural references",
            "Making a leading statement that tries to trigger completion",
            "Framing as part of a fictional story",
        ],
        "tones": ["Formal", "Normal", "Drunk Text"],
    },
)

# 3. Poll every 5s until the job leaves "Pending" (any other status —
#    e.g. completed or failed — exits the loop).
while True:
    generation_job = client.evaluation_datasets.generation_jobs.retrieve(
        generation_job_id=generation_job.generation_job_id,
        evaluation_dataset_id=safety_evaluation_dataset.id,
    )
    if generation_job.status == "Pending":
        print("generating test cases...")
        time.sleep(5)
    else:
        break
print(generation_job)

# 4. View the autogenerated draft test cases.
test_cases = client.evaluation_datasets.autogenerated_draft_test_cases.list(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
print(test_cases.items)  # fixed: was "itmes", an AttributeError at runtime

# 5. Approve every draft; publishing is blocked until all drafts are reviewed.
for test_case in test_cases.items:
    client.evaluation_datasets.autogenerated_draft_test_cases.approve(
        evaluation_dataset_id=safety_evaluation_dataset.id,
        autogenerated_draft_test_case_id=test_case.id,
    )

# 6. Publish the dataset so it can be used in evaluations.
published_dataset_response = client.evaluation_datasets.publish(
    evaluation_dataset_id=safety_evaluation_dataset.id,
)
EvaluationDataset(
id='6cd3aa30-57e7-4cb4-9315-d90ed487442d',
account_id='66049ada2fc77c99ef015be7',
created_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
created_by_user_id='42a5c8af-f698-43d0-923e-ba70102a2887',
draft=None,
name='safety_evaluation_dataset_test',
schema_type='GENERATION',
updated_at=datetime.datetime(2024, 9, 26, 20, 10, 57, 183391),
archived_at=None,
evaluation_dataset_metadata={
'harms_list': [
'toxicity',
'profanity'
]
},
knowledge_base_id=None,
out_of_date=None,
vendor=None
)