Agentex Output

Use the agentex_output task type to generate outputs from an external Agentex agent for each item in your evaluation.

Example Usage

The following is a complete example in which an Agentex agent generates a response for each evaluation item, and the script polls until the evaluation finishes before printing the results.
from scale_gp_beta import SGPClient
import time

# Initialize the client
client = SGPClient(
    account_id="your-account-id",
    api_key="your-api-key"
)

# Create an evaluation with Agentex output task
evaluation = client.evaluations.create(
    name="My Agent Evaluation",
    data=[
        {"input": "What is the capital of France?"},
        {"input": "Explain quantum computing in simple terms."},
    ],
    tasks=[
        {
            "task_type": "agentex_output",
            "alias": "agent_response",
            "configuration": {
                "agentex_agent_id": "your-agent-id",
                "input_column": "item.input",
                "include_traces": True
            }
        }
    ]
)

print(f"Created evaluation: {evaluation.id}")

# Poll until complete
while True:
    eval_status = client.evaluations.retrieve(evaluation_id=evaluation.id)
    print(f"Status: {eval_status.status}")
    
    if eval_status.status == "completed":
        items = client.evaluation_items.list(evaluation_id=evaluation.id)
        
        for item in items:
            print(f"\nInput: {item.data['input']}")
            print(f"Output: {item.data['agent_response']['output']}")
            if item.data['agent_response'].get('trace'):
                print(f"Trace spans: {len(item.data['agent_response']['trace'])}")
        break
    
    time.sleep(5)
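The loop above only checks for a completed status. If an evaluation can also end in a terminal failure state, the loop will poll forever; the sketch below guards against that, assuming terminal status strings such as "failed" or "canceled" (confirm the exact status names reported by your deployment):
import time

from scale_gp_beta import SGPClient

# Assumed terminal statuses besides "completed"; adjust to the status
# strings your deployment actually reports.
FAILURE_STATUSES = {"failed", "canceled"}

def wait_for_evaluation(client: SGPClient, evaluation_id: str, interval: float = 5.0) -> None:
    """Poll an evaluation until it completes, raising if it ends in a failure state."""
    while True:
        status = client.evaluations.retrieve(evaluation_id=evaluation_id).status
        if status == "completed":
            return
        if status in FAILURE_STATUSES:
            raise RuntimeError(f"Evaluation {evaluation_id} ended with status: {status}")
        time.sleep(interval)

# Example usage: wait_for_evaluation(client, evaluation.id)
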
For Azure workspace deployments, use the /api path suffix in the base URL:
from scale_gp_beta import SGPClient

client = SGPClient(
    account_id="your-account-id",
    api_key="your-api-key",
    base_url="https://your-workspace.azure.workspace.egp.scale.com/api"
)

Output Structure

The task produces the following output for each evaluation item:

Successful response:
{
  "output": "The agent's final text response",
  "trace": [...]  // Only if include_traces=True - list of execution spans
}

Error response:
{
  "output": {
    "error": "Error message",
    "error_type": "timeout | null_input | extraction_error | connection_error"
  }
}
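
When reading results back, the value stored under the task alias ("agent_response" in the examples above) follows one of these two shapes: on success, output is the agent's text; on error, output is a mapping with error and error_type. A small illustrative helper for handling both shapes (the function itself is not part of the SDK):
def summarize_agent_result(result: dict) -> str:
    """Summarize one agentex_output result, e.g. item.data["agent_response"]."""
    output = result.get("output")
    if isinstance(output, dict) and "error" in output:
        # Error shape: {"output": {"error": ..., "error_type": ...}}
        return f"agent error ({output.get('error_type')}): {output['error']}"
    # Success shape: output is the agent's final text response
    summary = f"agent output: {output}"
    if result.get("trace") is not None:
        # trace is only present when include_traces=True
        summary += f" ({len(result['trace'])} trace spans)"
    return summary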

Combining with Other Tasks

Agentex outputs are generated before other evaluation tasks run. This lets you chain the agent's output into downstream tasks such as auto evaluations or contributor evaluations.
evaluation = client.evaluations.create(
    name="Agentex with Judge Evaluation",
    data=[
        {
            "input": "What is the capital of France?",
            "expected_output": "Paris"
        },
        ...
    ],
    tasks=[
        # First, generate output from Agentex agent
        {
            "task_type": "agentex_output",
            "alias": "agent_response",
            "configuration": {
                "agentex_agent_id": "your-agent-id",
                "input_column": "item.input",
                "include_traces": False
            }
        },
        # Then, evaluate the output with an LLM judge
        {
            "task_type": "auto_evaluation.guided_decoding",
            "alias": "correctness",
            "configuration": {
                "model": "openai/gpt-4o",
                "prompt": """
                    Given the user's query: {{item.input}},
                    The agent's response was: {{item.agent_response.output}}
                    The expected response is: {{item.expected_output}}
                    Did the agent's response correctly answer the question?
                """,
                "choices": ["Yes", "No"]
            }
        }
    ]
)
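
Once this evaluation completes, each item carries both task results under their aliases. The read-back sketch below assumes the judge's choice is stored under the "correctness" alias in item.data, mirroring how "agent_response" is accessed in the earlier example; the exact shape of a guided-decoding result may differ in your deployment:
items = client.evaluation_items.list(evaluation_id=evaluation.id)

for item in items:
    print(f"Input:    {item.data['input']}")
    print(f"Response: {item.data['agent_response']['output']}")
    # Assumed: the guided-decoding task stores its chosen label under its alias
    print(f"Judged:   {item.data['correctness']}")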