> ## Documentation Index
> Fetch the complete documentation index at: https://docs.gp.scale.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Multiturn Evaluation

> Create and evaluate a multiturn application

<AccordionGroup>
  <Accordion title="1. Instantiate Clients">
    Set up the SGP Client for communication with the API:

    ```py theme={null}
    from scale_gp import SGPClient

    # Shared client used by every later step in this guide; it targets the
    # "production-multitenant" environment.
    # NOTE(review): presumably credentials are read from the environment — confirm against the SDK docs.
    client = SGPClient(environment="production-multitenant")
    ```
  </Accordion>

  <Accordion title="2. Create the Dataset">
    Build and initialize a multiturn dataset with predefined messages:

    ```py theme={null}
    def timestamp():
        """Return a unique name suffix: the current local time plus a random UUID."""
        # Imported locally so this snippet runs on its own.
        from datetime import datetime
        from uuid import uuid4

        return f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {uuid4()}"

    # Each test case starts from a single user message.
    message_data = [
        {
            "init_messages": [{"role": "user", "content": "What were the key factors that led to the French Revolution of 1789?"}],
        },
        {
            "init_messages": [{"role": "user", "content": "How did Napoleon Bonaparte's rise to power impact French society and politics in the early 19th century?"}],
        },
        {
            "init_messages": [{"role": "user", "content": "Analyze the economic and social consequences of French colonialism in North Africa during the 19th and early 20th centuries."}],
        },
    ]

    # One multiturn test case per entry, seeded with its initial messages.
    test_cases = [MultiturnTestCaseSchema(messages=data["init_messages"]) for data in message_data]

    dataset = DatasetBuilder(client).initialize(
        account_id=os.environ["SGP_ACCOUNT_ID"],
        name=f"Multiturn Dataset {timestamp()}",  # the suffix makes the name differ on every run
        test_cases=test_cases,
    )
    print(dataset)
    ```
  </Accordion>

  <Accordion title="3. Define a Multiturn Application">
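    Define the multiturn application with a simulated conversation output and initialize it. The application replies by looking up the first message of each test case in `conversation_data`, so add one conversation per test case; `variant` must refer to an application variant that already exists: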

    ```py theme={null}
    # Canned conversations replayed by the simulated application below.
    # The first message of each "conversation" is compared for equality with the
    # first message of the incoming prompt, so it must match a dataset entry exactly.
    conversation_data = [
        {
            "conversation": [
                {"role": "user", "content": "What were the key factors that led to the French Revolution of 1789?"},
                {"role": "assistant", "content": "The French Revolution of 1789 was the result of a complex interplay of social, economic, and political factors. Some key elements include...\n"}
            ],
        },
        # More conversation data entries...
    ]

    def my_multiturn_app(prompt, test_case):
        """Simulate a multiturn application by replaying a canned conversation.

        Args:
            prompt: Test case input; ``prompt["messages"][0]`` is matched against
                the first message of each entry in ``conversation_data``.
            test_case: The test case being run (unused by this simulation).

        Returns:
            ExternalApplicationOutputFlexible holding the full conversation under
            ``"generated_conversation"``, one trace span per assistant turn, and
            randomly generated placeholder metrics.

        Raises:
            ValueError: If no entry in ``conversation_data`` starts with the
                prompt's first message.
        """
        first_message = prompt["messages"][0]
        output = next(
            (c["conversation"] for c in conversation_data if c["conversation"][0] == first_message),
            None,
        )
        if output is None:
            # Fail with an actionable message instead of the opaque
            # "object of type 'NoneType' has no len()" raised further down.
            raise ValueError(
                f"No entry in conversation_data matches the first message: {first_message!r}"
            )

        start = datetime.now().replace(microsecond=5000)
        # Assistant replies are expected at the odd indices (user, assistant, ...);
        # each one becomes a trace span whose input is the history before it.
        traces = [
            {
                "node_id": f"Model Call #{turn}",
                "start_timestamp": start.isoformat(),
                "operation_input": {"conversation_history": output[:i]},
                "operation_output": {"response": output[i]["content"]},
                "duration_ms": random.randint(200, 600),
            }
            for turn, i in enumerate(range(1, len(output), 2), start=1)
        ]

        return ExternalApplicationOutputFlexible(
            generation_output={"generated_conversation": output},
            trace_spans=traces,
            # Random placeholder scores; a real application would compute these.
            metrics={"grammar": round(random.random(), 3), "memory": round(random.random(), 3), "content": round(random.random(), 3)}
        )

    # Attach the function to an application variant, then generate outputs for
    # version '1' of the dataset created earlier.
    # NOTE(review): `variant` is not defined anywhere in this guide — create or
    # fetch the application variant before running this step.
    app = ExternalApplication(client)
    app.initialize(application_variant_id=variant.id, application=my_multiturn_app)
    app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')
    ```
  </Accordion>

  <Accordion title="4. Create Questions and Question Set">
    Prepare questions for evaluating the multiturn conversation:

    ```py theme={null}
    sgp_account_id = os.environ["SGP_ACCOUNT_ID"]

    # Two yes/no questions about the conversation, plus one free-text question
    # for listing incorrect turns.
    question_requests = [
        dict(
            type="categorical",
            title="Question 1",
            prompt="Does the conversation make sense",
            choices=[dict(label="No", value=0), dict(label="Yes", value=1)],
            account_id=sgp_account_id,
        ),
        dict(
            type="categorical",
            title="Question 2",
            prompt="Is the user query answered correctly?",
            choices=[dict(label="No", value=0), dict(label="Yes", value=1)],
            account_id=sgp_account_id,
        ),
        dict(
            type="free_text",
            title="Question 3",
            prompt="If any messages are incorrect, list their turn number",
            account_id=sgp_account_id,
        ),
    ]

    # Create the questions one by one, keeping their IDs in order.
    question_ids = []
    for request in question_requests:
        created_question = client.questions.create(**request)
        question_ids.append(created_question.id)

    # Group the questions into a single set for the evaluation config.
    q_set = client.question_sets.create(
        account_id=sgp_account_id,
        name="multiturn question set",
        question_ids=question_ids,
    )
    print(q_set)
    ```
  </Accordion>

  <Accordion title="5. Set up Evaluation Configuration and Start Evaluation">
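    Configure and initiate the evaluation process. `variant` and `spec` must refer to the existing application variant and application spec being evaluated: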

    ```py theme={null}
    # Human evaluation config that uses the question set from the previous step.
    config = client.evaluation_configs.create(
        account_id=os.environ["SGP_ACCOUNT_ID"],
        question_set_id=q_set.id,
        evaluation_type='human',
    )
    print(config)

    # Locate the messages to annotate: the "generated_conversation" key that the
    # application writes into its generation output.
    annotation_config_dict = {
        "messages_loc": data_locator.test_case_output.output["generated_conversation"]
    }

    # NOTE(review): `variant` and `spec` are not defined anywhere in this guide —
    # presumably an existing application variant and its application spec.
    evaluation = client.evaluations.create(
        account_id=os.environ["SGP_ACCOUNT_ID"],
        application_variant_id=variant.id,
        application_spec_id=spec.id,
        description="Demo Multiturn Evaluation",
        name="Multiturn Evaluation",
        evaluation_config_id=config.id,
        annotation_config=annotation_config_dict,
        evaluation_dataset_id=dataset.id,
        type="builder"
    )
    print(evaluation)
    ```
  </Accordion>
</AccordionGroup>

<RequestExample>
  ```python theme={null}
  import os
  import random
  from datetime import datetime

  from scale_gp import SGPClient
  from scale_gp.lib.dataset_builder import DatasetBuilder
  from scale_gp.lib.external_applications import ExternalApplication, ExternalApplicationOutputFlexible
  from scale_gp.lib.types import data_locator
  from scale_gp.lib.types.multiturn import MultiturnTestCaseSchema

  # Shared client used by every later step in this script; it targets the
  # "production-multitenant" environment.
  # NOTE(review): presumably credentials are read from the environment — confirm against the SDK docs.
  client = SGPClient(environment="production-multitenant")

  def timestamp():
      """Return a unique name suffix: the current local time plus a random UUID."""
      from uuid import uuid4  # local import: only this helper needs it

      return f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} {uuid4()}"

  # Each test case starts from a single user message.
  message_data = [
      {
          "init_messages": [{"role": "user", "content": "What were the key factors that led to the French Revolution of 1789?"}],
      },
      {
          "init_messages": [{"role": "user", "content": "How did Napoleon Bonaparte's rise to power impact French society and politics in the early 19th century?"}],
      },
      {
          "init_messages": [{"role": "user", "content": "Analyze the economic and social consequences of French colonialism in North Africa during the 19th and early 20th centuries."}],
      },
  ]

  # The three prompts are repeated ten times, giving a 30-case dataset.
  test_cases = [MultiturnTestCaseSchema(messages=data["init_messages"]) for data in message_data * 10]

  dataset = DatasetBuilder(client).initialize(
      account_id=os.environ["SGP_ACCOUNT_ID"],
      name=f"Multiturn Dataset {timestamp()}",  # the suffix makes the name differ on every run
      test_cases=test_cases,
  )
  print(dataset)

  # Canned conversations replayed by the simulated application below.
  # The first message of each "conversation" is compared for equality with the
  # first message of the incoming prompt, so it must match a dataset entry exactly.
  conversation_data = [
      {
          "conversation": [
              {"role": "user", "content": "What were the key factors that led to the French Revolution of 1789?"},
              {"role": "assistant", "content": "The French Revolution of 1789 was the result of a complex interplay of social, economic, and political factors. Some key elements include...\n"}
          ],
      },
      # More conversation data entries...
  ]

  def my_multiturn_app(prompt, test_case):
      """Simulate a multiturn application by replaying a canned conversation.

      Args:
          prompt: Test case input; ``prompt["messages"][0]`` is matched against
              the first message of each entry in ``conversation_data``.
          test_case: The test case being run (unused by this simulation).

      Returns:
          ExternalApplicationOutputFlexible holding the full conversation under
          ``"generated_conversation"``, one trace span per assistant turn, and
          randomly generated placeholder metrics.

      Raises:
          ValueError: If no entry in ``conversation_data`` starts with the
              prompt's first message.
      """
      first_message = prompt["messages"][0]
      output = next(
          (c["conversation"] for c in conversation_data if c["conversation"][0] == first_message),
          None,
      )
      if output is None:
          # Fail with an actionable message instead of the opaque
          # "object of type 'NoneType' has no len()" raised further down.
          raise ValueError(
              f"No entry in conversation_data matches the first message: {first_message!r}"
          )

      start = datetime.now().replace(microsecond=5000)
      # Assistant replies are expected at the odd indices (user, assistant, ...);
      # each one becomes a trace span whose input is the history before it.
      traces = [
          {
              "node_id": f"Model Call #{turn}",
              "start_timestamp": start.isoformat(),
              "operation_input": {"conversation_history": output[:i]},
              "operation_output": {"response": output[i]["content"]},
              "duration_ms": random.randint(200, 600),
          }
          for turn, i in enumerate(range(1, len(output), 2), start=1)
      ]

      return ExternalApplicationOutputFlexible(
          generation_output={"generated_conversation": output},
          trace_spans=traces,
          # Random placeholder scores; a real application would compute these.
          metrics={"grammar": round(random.random(), 3), "memory": round(random.random(), 3), "content": round(random.random(), 3)}
      )

  # Attach the function to an application variant, then generate outputs for
  # version '1' of the dataset created above.
  # NOTE(review): `variant` is not defined anywhere in this script — create or
  # fetch the application variant before running this step.
  app = ExternalApplication(client)
  app.initialize(application_variant_id=variant.id, application=my_multiturn_app)
  app.generate_outputs(evaluation_dataset_id=dataset.id, evaluation_dataset_version='1')

  sgp_account_id = os.environ["SGP_ACCOUNT_ID"]

  # Two yes/no questions about the conversation, plus one free-text question
  # for listing incorrect turns.
  question_requests = [
      dict(
          type="categorical",
          title="Question 1",
          prompt="Does the conversation make sense",
          choices=[dict(label="No", value=0), dict(label="Yes", value=1)],
          account_id=sgp_account_id,
      ),
      dict(
          type="categorical",
          title="Question 2",
          prompt="Is the user query answered correctly?",
          choices=[dict(label="No", value=0), dict(label="Yes", value=1)],
          account_id=sgp_account_id,
      ),
      dict(
          type="free_text",
          title="Question 3",
          prompt="If any messages are incorrect, list their turn number",
          account_id=sgp_account_id,
      ),
  ]

  # Create the questions one by one, keeping their IDs in order.
  question_ids = []
  for request in question_requests:
      created_question = client.questions.create(**request)
      question_ids.append(created_question.id)

  # Group the questions into a single set for the evaluation config.
  q_set = client.question_sets.create(
      account_id=sgp_account_id,
      name="multiturn question set",
      question_ids=question_ids,
  )
  print(q_set)

  # Human evaluation config that uses the question set created above.
  config = client.evaluation_configs.create(
      account_id=os.environ["SGP_ACCOUNT_ID"],
      question_set_id=q_set.id,
      evaluation_type='human',
  )
  print(config)

  # Locate the messages to annotate: the "generated_conversation" key that the
  # application writes into its generation output.
  annotation_config_dict = {
      "messages_loc": data_locator.test_case_output.output["generated_conversation"]
  }

  # NOTE(review): `variant` and `spec` are not defined anywhere in this script —
  # presumably an existing application variant and its application spec.
  evaluation = client.evaluations.create(
      account_id=os.environ["SGP_ACCOUNT_ID"],
      application_variant_id=variant.id,
      application_spec_id=spec.id,
      description="Demo Multiturn Evaluation",
      name="Multiturn Evaluation",
      evaluation_config_id=config.id,
      annotation_config=annotation_config_dict,
      evaluation_dataset_id=dataset.id,
      type="builder"
  )

  print(evaluation)
  ```
</RequestExample>

<ResponseExample>
  ```python Dataset Creation Response theme={null}
  EvaluationDataset(
      id='13efee60-daa3-4906-9ad9-10ebf37bf2a3',
      account_id='f8e9b881-da9e-4631-bc8d-df831c5a4e4b',
      created_at=datetime.datetime(2024, 9, 23, 23, 2, 58, 750637),
      created_by_user_id='2b2d9b5c-9e2e-496b-b1f4-bf17ce6e1cfc',
      name='Multiturn Dataset 2024-09-23 19:41:21 b9a8c3d3-1f3c-4dc1-b8a6-caf1e6a9f0cb',
      schema_type='FLEXIBLE',
      updated_at=datetime.datetime(2024, 9, 23, 23, 2, 58, 750637),
      archived_at=None,
      evaluation_dataset_metadata=None,
      knowledge_base_id=None,
      out_of_date=None,
      vendor=None
  )
  ```

  ```python Question Set Response theme={null}
  QuestionSet(
      id='21cccdd4-e5ea-4ed6-a26a-09ec40c5b57a',
      account_id='f8e9b881-da9e-4631-bc8d-df831c5a4e4b',
      created_at=datetime.datetime(2024, 9, 23, 23, 3, 2, 710102),
      created_by_user_id='2b2d9b5c-9e2e-496b-b1f4-bf17ce6e1cfc',
      name='multiturn question set',
      instructions=None
  )
  ```

  ```python Evaluation Configuration Response theme={null}
  EvaluationConfig(
      id='8722ecfc-ecac-47bd-ab0a-2ffc6180aba1',
      account_id='f8e9b881-da9e-4631-bc8d-df831c5a4e4b',
      created_at=datetime.datetime(2024, 9, 23, 23, 3, 2, 910061),
      created_by_user_id='2b2d9b5c-9e2e-496b-b1f4-bf17ce6e1cfc',
      evaluation_type='human',
      question_set_id='21cccdd4-e5ea-4ed6-a26a-09ec40c5b57a',
      studio_project_id=None
  )
  ```

  ```python Evaluation Initiation Response theme={null}
  Evaluation(
      id='547d8def-a29d-40b6-85e2-9b052fa5b7c1',
      account_id='f8e9b881-da9e-4631-bc8d-df831c5a4e4b',
      application_spec_id='e2adde70-1ff4-49c6-a823-b2669160a5f3',
      completed_test_case_result_count=0,
      created_at=datetime.datetime(2024, 9, 23, 23, 3, 3, 196172),
      created_by_user_id='2b2d9b5c-9e2e-496b-b1f4-bf17ce6e1cfc',
      description='Demo Multiturn Evaluation',
      name='Multiturn Evaluation',
      status='PENDING',
      total_test_case_result_count=0,
      annotation_config=AnnotationConfig(
          components=[[Component(data_loc=['test_case_output', 'output', 'generated_conversation'], label='Conversation')]],
          annotation_config_type='multiturn',
          direction='col'
      ),
      application_variant_id='34466bba-2aee-4c2b-8ae2-d7da1f30f304',
      archived_at=None,
      completed_at=None,
      evaluation_config=None,
      evaluation_config_id='8722ecfc-ecac-47bd-ab0a-2ffc6180aba1',
      question_id_to_annotation_config=None,
      tags=None
  )
  ```
</ResponseExample>
