> ## Documentation Index
> Fetch the complete documentation index at: https://docs.gp.scale.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Create Parse Job

> Create a parse job that will asynchronously process a document.

This endpoint initiates document parsing using the specified engine (default: Reducto).
The operation is performed asynchronously via Temporal workflows.

Args:
    request: Parse job request with source document ID and parameters

Returns:
    Job entity that can be used to track progress

Raises:
    HTTPException: If the source document is not found or other errors occur


## OpenAPI

````yaml https://dex.sgp.scale.com/openapi.json post /v1/projects/{project_id}/parse
# OpenAPI 3.1.0 document for the Document Understanding API, version 0.4.5.
openapi: 3.1.0
info:
  title: Document Understanding API
  description: API for uploading and processing documents
  version: 0.4.5
# No server URLs are declared in this document.
servers: []
# Default security requirement: one entry naming both schemes, so the ApiKey
# and AccountId credentials are required together.
security:
  - ApiKey: []
    AccountId: []
# Tag catalogue used to group operations.
tags:
  - name: Projects
    description: Operations related to project creation and management
  - name: Files
    description: Operations related to file upload and access
  - name: Parse
    description: Operations related to starting parse jobs and accessing their results
  - name: Vector Stores
    description: Operations related to vector store creation and management
  - name: Extract
    description: Operations related to starting extract jobs and accessing their results
  - name: Research
    description: Dex Research agent kickoff and results.
  - name: Jobs
    description: Operations related to monitoring jobs and their status
# Only the parse-job creation endpoint is described in this document.
paths:
  /v1/projects/{project_id}/parse:
    # POST takes a ParseJobRequest body and answers 202 with a JobEntity.
    post:
      tags:
        - Parse
      summary: Create Parse Job
      description: >-
        Create a parse job that will asynchronously process a document.


        This endpoint initiates document parsing using the specified engine
        (default: Reducto).

        The operation is performed asynchronously via Temporal workflows.


        Args:
            request: Parse job request with source document ID and parameters

        Returns:
            Job entity that can be used to track progress

        Raises:
            HTTPException: If the source document is not found or other errors occur
      operationId: create_parse_job_v1_projects__project_id__parse_post
      parameters:
        # Required path parameter identifying the project.
        - name: project_id
          in: path
          required: true
          schema:
            type: string
            title: Project Id
        # Optional query flag; defaults to true.
        # NOTE(review): the operation description says the default engine is
        # Reducto while this Iris2 flag defaults to true - confirm how the two
        # interact.
        - name: beta_enable_iris2
          in: query
          required: false
          schema:
            type: boolean
            description: Enable Iris2 engine for beta testing
            default: true
            title: Beta Enable Iris2
          description: Enable Iris2 engine for beta testing
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/ParseJobRequest'
      # NOTE(review): the description mentions a failure when the source
      # document is not found, but 422 is the only error response listed here
      # - confirm whether another status code should be documented.
      responses:
        '202':
          description: Successful Response
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/JobEntity'
        '422':
          description: Validation Error
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/HTTPValidationError'
components:
  schemas:
    # Request body for creating a parse job: the file to parse plus
    # engine-specific parameters. Both properties are required.
    ParseJobRequest:
      properties:
        source_document_id:
          type: string
          title: Source Document Id
          description: The ID of the file to be processed
        # Exactly one engine-specific parameter schema applies; the `engine`
        # property of the payload selects which one.
        parameters:
          oneOf:
            - $ref: '#/components/schemas/ReductoParseJobParams'
            - $ref: '#/components/schemas/IrisParseJobParams'
          title: Parameters
          description: Parse parameters (engine, options, etc.)
          discriminator:
            propertyName: engine
            # Mapping values are plain reference strings: OpenAPI defines
            # `mapping` as a string-to-string map, not a map of `$ref` objects.
            mapping:
              iris: '#/components/schemas/IrisParseJobParams'
              reducto: '#/components/schemas/ReductoParseJobParams'
      type: object
      required:
        - source_document_id
        - parameters
      title: ParseJobRequest
      description: Request model for creating a parse job.
    # Job resource describing an asynchronous operation. id, project_id,
    # operation, status and created_at are required; the remaining fields are
    # optional and, apart from `object`, also accept null.
    JobEntity:
      properties:
        id:
          type: string
          title: Id
          description: ID of the entity
        project_id:
          type: string
          title: Project Id
          description: ID of the project
        # Fixed type tag: always the string "job".
        object:
          type: string
          const: job
          title: Object
          default: job
        operation:
          $ref: '#/components/schemas/JobOperationType'
          description: Operation type (e.g., 'parse')
        status:
          $ref: '#/components/schemas/JobStatus'
          description: Current job status
        source_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Source Id
          description: Source document/file ID
        correlation_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Correlation Id
          description: Request correlation ID for tracing
        created_at:
          type: string
          format: date-time
          title: Created At
          description: When the job was created
        started_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Started At
          description: When the job started processing
        completed_at:
          anyOf:
            - type: string
              format: date-time
            - type: 'null'
          title: Completed At
          description: When the job completed
        # Free-form object (any keys allowed) or null.
        result:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Result
          description: Job result payload when completed
        progress:
          anyOf:
            - $ref: '#/components/schemas/BatchParseProgress'
            - type: 'null'
          description: Live progress payload (used by batch jobs)
        error:
          anyOf:
            - type: string
            - type: 'null'
          title: Error
          description: Error message if job failed
        history:
          anyOf:
            - items:
                $ref: '#/components/schemas/JobHistoryEvent'
              type: array
            - type: 'null'
          title: History
          description: Timeline of job execution events
      type: object
      required:
        - id
        - project_id
        - operation
        - status
        - created_at
      title: JobEntity
      description: Job response model representing an asynchronous operation.
    # Body of the 422 response: an optional `detail` array of ValidationError
    # items.
    HTTPValidationError:
      properties:
        detail:
          items:
            $ref: '#/components/schemas/ValidationError'
          type: array
          title: Detail
      type: object
      title: HTTPValidationError
    # Parse parameters for the Reducto engine. Only `options` is required;
    # `engine` is fixed to "reducto" and defaults to that value.
    ReductoParseJobParams:
      properties:
        engine:
          type: string
          const: reducto
          title: Engine
          description: Engine
          default: reducto
        # Either one of the four chunking option schemas, selected by its
        # `strategy` property, or null.
        chunking_options:
          anyOf:
            - oneOf:
                - $ref: '#/components/schemas/TokenSizeChunkingOptions'
                - $ref: '#/components/schemas/RecursiveChunkingOptions'
                - $ref: '#/components/schemas/PageChunkingOptions'
                - $ref: '#/components/schemas/SectionChunkingOptions'
              discriminator:
                propertyName: strategy
                # Mapping values are plain reference strings: OpenAPI defines
                # `mapping` as a string-to-string map, not `$ref` objects.
                mapping:
                  by_page: '#/components/schemas/PageChunkingOptions'
                  by_section: '#/components/schemas/SectionChunkingOptions'
                  recursive: '#/components/schemas/RecursiveChunkingOptions'
                  token_size: '#/components/schemas/TokenSizeChunkingOptions'
            - type: 'null'
          title: Chunking Options
          description: Chunking options
        # Flat object whose values are strings, integers, numbers or booleans,
        # or null.
        vector_store_metadata:
          anyOf:
            - additionalProperties:
                anyOf:
                  - type: string
                  - type: integer
                  - type: number
                  - type: boolean
              type: object
            - type: 'null'
          title: Vector Store Metadata
          description: >-
            Metadata to populate into the vector store to filter on when
            searching
        options:
          $ref: '#/components/schemas/ReductoParseEngineOptions'
          description: Options
        # Free-form object; defaults to an empty object.
        advanced_options:
          additionalProperties: true
          type: object
          title: Advanced Options
          description: Advanced options
          default: {}
        # Free-form object; defaults to an empty object.
        experimental_options:
          additionalProperties: true
          type: object
          title: Experimental Options
          description: Experimental options
          default: {}
        priority:
          type: boolean
          title: Priority
          description: Priority
          default: false
      type: object
      required:
        - options
      title: ReductoParseJobParams
      description: Parameters for creating a parse job.
    # Parse parameters for the Iris engine. Only `options` is required;
    # `engine` is fixed to "iris" and defaults to that value.
    IrisParseJobParams:
      properties:
        engine:
          type: string
          const: iris
          title: Engine
          description: Engine
          default: iris
        # Either one of the four chunking option schemas, selected by its
        # `strategy` property, or null.
        chunking_options:
          anyOf:
            - oneOf:
                - $ref: '#/components/schemas/TokenSizeChunkingOptions'
                - $ref: '#/components/schemas/RecursiveChunkingOptions'
                - $ref: '#/components/schemas/PageChunkingOptions'
                - $ref: '#/components/schemas/SectionChunkingOptions'
              discriminator:
                propertyName: strategy
                # Mapping values are plain reference strings: OpenAPI defines
                # `mapping` as a string-to-string map, not `$ref` objects.
                mapping:
                  by_page: '#/components/schemas/PageChunkingOptions'
                  by_section: '#/components/schemas/SectionChunkingOptions'
                  recursive: '#/components/schemas/RecursiveChunkingOptions'
                  token_size: '#/components/schemas/TokenSizeChunkingOptions'
            - type: 'null'
          title: Chunking Options
          description: Chunking options
        # Flat object whose values are strings, integers, numbers or booleans,
        # or null.
        vector_store_metadata:
          anyOf:
            - additionalProperties:
                anyOf:
                  - type: string
                  - type: integer
                  - type: number
                  - type: boolean
              type: object
            - type: 'null'
          title: Vector Store Metadata
          description: >-
            Metadata to populate into the vector store to filter on when
            searching
        options:
          $ref: '#/components/schemas/IrisParseEngineOptions'
          description: Options
      type: object
      required:
        - options
      title: IrisParseJobParams
    # Closed set of operation names a job can carry.
    JobOperationType:
      type: string
      enum:
        - parse
        - batch_parse
        - extract
        - research
        - vector_store
        - chunk
        - summarization
        - create_index
        - update_index
      title: JobOperationType
      description: Enum for job operation values.
    # Closed set of job status values; used by both JobEntity and
    # BatchChildJobInfo.
    JobStatus:
      type: string
      enum:
        - pending
        - running
        - succeeded
        - partially_succeeded
        - failed
        - cancelled
      title: JobStatus
      description: Enum for job status values.
    # Progress counters for a batch parse job plus a per-child status list.
    # All six properties are required.
    BatchParseProgress:
      properties:
        total:
          type: integer
          title: Total
          description: Total number of files in the batch
        succeeded:
          type: integer
          title: Succeeded
          description: Number of successfully parsed files
        failed:
          type: integer
          title: Failed
          description: Number of files that failed to parse
        cancelled:
          type: integer
          title: Cancelled
          description: Number of cancelled child jobs
        pending:
          type: integer
          title: Pending
          description: Number of files still pending (0 when job is terminal)
        child_jobs:
          items:
            $ref: '#/components/schemas/BatchChildJobInfo'
          type: array
          title: Child Jobs
          description: Per-child job status
      type: object
      required:
        - total
        - succeeded
        - failed
        - cancelled
        - pending
        - child_jobs
      title: BatchParseProgress
      description: Live progress tracking for a batch parse job.
    # One entry of a job's `history` array. `step` and `timestamp` are
    # required; duration_ms, status and details are optional and nullable.
    JobHistoryEvent:
      properties:
        step:
          type: string
          title: Step
          description: Human-readable step name
        timestamp:
          type: string
          format: date-time
          title: Timestamp
          description: When this event occurred
        duration_ms:
          anyOf:
            - type: integer
            - type: 'null'
          title: Duration Ms
          description: Duration in milliseconds (for completed steps)
        # Free-form string here, not a reference to the JobStatus enum.
        status:
          anyOf:
            - type: string
            - type: 'null'
          title: Status
          description: Event status (e.g., 'completed', 'failed')
        details:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Details
          description: Additional event details
      type: object
      required:
        - step
        - timestamp
      title: JobHistoryEvent
      description: A single event in the job execution timeline.
    # Single validation failure entry. loc, msg and type are required.
    ValidationError:
      properties:
        # Array whose items are strings or integers.
        loc:
          items:
            anyOf:
              - type: string
              - type: integer
          type: array
          title: Location
        msg:
          type: string
          title: Message
        type:
          type: string
          title: Error Type
        # No type constraint is declared, so any JSON value is accepted.
        input:
          title: Input
        ctx:
          type: object
          title: Context
      type: object
      required:
        - loc
        - msg
        - type
      title: ValidationError
    # Chunking options for strategy "token_size". No property is required;
    # every property declares a default.
    TokenSizeChunkingOptions:
      properties:
        strategy:
          type: string
          const: token_size
          title: Strategy
          description: The chunking strategy
          default: token_size
        # Must be strictly greater than 0.
        chunk_size:
          type: integer
          exclusiveMinimum: 0
          title: Chunk Size
          description: Target size of each chunk in tokens
          default: 512
        # Zero is allowed.
        chunk_overlap:
          type: integer
          minimum: 0
          title: Chunk Overlap
          description: Number of overlapping tokens between chunks
          default: 50
        encoding_name:
          type: string
          title: Encoding Name
          description: Tiktoken encoding name (e.g., cl100k_base for GPT-4)
          default: cl100k_base
      type: object
      title: TokenSizeChunkingOptions
      description: >-
        Token-based chunking: Splits text into chunks by token count using a
        tokenizer (e.g., tiktoken). Best for LLM APIs with token limits,
        embedding models, and cost optimization. Use when you need precise
        control over token usage.
    # Chunking options for strategy "recursive". No property is required;
    # every property declares a default.
    RecursiveChunkingOptions:
      properties:
        strategy:
          type: string
          const: recursive
          title: Strategy
          description: The chunking strategy
          default: recursive
        # Must be strictly greater than 0.
        chunk_size:
          type: integer
          exclusiveMinimum: 0
          title: Chunk Size
          description: Target size of each chunk in characters
          default: 1000
        # Zero is allowed.
        chunk_overlap:
          type: integer
          minimum: 0
          title: Chunk Overlap
          description: Number of overlapping characters between chunks
          default: 200
        separators:
          items:
            type: string
          type: array
          title: Separators
          description: List of separators to try in order
          # Default order: blank line, newline, space, empty string. The
          # newline separators use double-quoted escapes so their values stay
          # visible and survive blank-line trimming.
          default:
            - "\n\n"
            - "\n"
            - ' '
            - ''
        keep_separator:
          type: boolean
          title: Keep Separator
          description: Whether to keep the separator in the chunks
          default: true
      type: object
      title: RecursiveChunkingOptions
      description: >-
        Recursive text splitting: Uses a hierarchy of separators (paragraphs →
        sentences → words) to preserve natural text boundaries. Best for
        articles, documentation, and RAG systems where readability and semantic
        coherence matter.
    # Chunking options for strategy "by_page". No property is required.
    PageChunkingOptions:
      properties:
        strategy:
          type: string
          const: by_page
          title: Strategy
          description: The chunking strategy
          default: by_page
        # Must be strictly greater than 0; defaults to 1.
        pages_per_chunk:
          type: integer
          exclusiveMinimum: 0
          title: Pages Per Chunk
          description: Number of pages to include in each chunk
          default: 1
      type: object
      title: PageChunkingOptions
      description: >-
        Page-based chunking: Splits documents by page boundaries, grouping
        complete pages together. Best for legal documents, forms, and reports
        where page references are important and page structure should be
        preserved.
    # Chunking options for strategy "by_section". No property is required.
    SectionChunkingOptions:
      properties:
        strategy:
          type: string
          const: by_section
          title: Strategy
          description: The chunking strategy
          default: by_section
        # Array of strings or null; no default is declared.
        section_headers:
          anyOf:
            - items:
                type: string
              type: array
            - type: 'null'
          title: Section Headers
          description: Markdown-style headers that denote sections
        include_header_in_chunk:
          type: boolean
          title: Include Header In Chunk
          description: Whether to include the section header in the chunk
          default: true
      type: object
      title: SectionChunkingOptions
      description: >-
        Section-based chunking: Splits text by section headers (e.g., markdown
        #, ##, ###) to keep complete topics together. Best for structured
        documents like technical manuals, wikis, and academic papers where
        semantic coherence within topics is crucial.
    # Options object for the Reducto engine. Only `chunking` is declared, but
    # additionalProperties is true, so other keys are accepted as well.
    ReductoParseEngineOptions:
      properties:
        chunking:
          anyOf:
            - $ref: '#/components/schemas/ReductoChunkingOptions'
            - type: 'null'
          description: Chunking options
      additionalProperties: true
      type: object
      title: ReductoParseEngineOptions
      description: Options for the Reducto parse engine.
    # Options object for the Iris engine. No property is required and every
    # property also accepts null.
    # NOTE(review): the descriptions state that e2e_ocr is mutually exclusive
    # with layout/text_ocr/table_ocr and that e2e_response_parser is required
    # when e2e_ocr is set; neither rule is expressed as a schema constraint
    # here, so any enforcement would have to happen outside this schema.
    IrisParseEngineOptions:
      properties:
        # One of the two listed layout model names, or null.
        layout:
          anyOf:
            - type: string
              enum:
                - rt_detr_bce
                - whole_page
            - type: 'null'
          title: Layout
          description: Layout detection model to use
        text_ocr:
          anyOf:
            - type: string
            - type: 'null'
          title: Text Ocr
          description: Text OCR model to use
        table_ocr:
          anyOf:
            - type: string
            - type: 'null'
          title: Table Ocr
          description: Table OCR model to use
        # Free-form object (any keys allowed) or null.
        model_parameters:
          anyOf:
            - additionalProperties: true
              type: object
            - type: 'null'
          title: Model Parameters
          description: >-
            Extra parameters passed to LLM inference calls (e.g. max_tokens,
            temperature). Merged into the request payload; user-supplied values
            override defaults.
        text_prompt:
          anyOf:
            - type: string
            - type: 'null'
          title: Text Prompt
          description: Custom prompt for text extraction models (only used by VLMs)
        table_prompt:
          anyOf:
            - type: string
            - type: 'null'
          title: Table Prompt
          description: Custom prompt for table extraction models (only used by VLMs)
        left_to_right:
          anyOf:
            - type: boolean
            - type: 'null'
          title: Left To Right
          description: >-
            Sort regions left-to-right instead of right-to-left when doing
            Markdown assembly (default: False)
        confidence_threshold:
          anyOf:
            - type: number
            - type: 'null'
          title: Confidence Threshold
          description: Minimum confidence threshold for layout detection boxes
        containment_threshold:
          anyOf:
            - type: number
            - type: 'null'
          title: Containment Threshold
          description: >-
            Containment threshold for filtering. If smaller box is X% contained
            in larger box, drop it.
        # One of base64, description or skip, or null.
        img_method:
          anyOf:
            - type: string
              enum:
                - base64
                - description
                - skip
            - type: 'null'
          title: Img Method
          description: >-
            Image embedding method: description (LLM-generated), base64
            (self-contained), skip (ignore images)
        text_system_prompt:
          anyOf:
            - type: string
            - type: 'null'
          title: Text System Prompt
          description: Custom system prompt for text extraction models (only used by VLMs)
        table_confidence_threshold:
          anyOf:
            - type: number
            - type: 'null'
          title: Table Confidence Threshold
          description: >-
            Minimum confidence threshold for layout detection boxes that are
            labelled as tables
        image_confidence_threshold:
          anyOf:
            - type: number
            - type: 'null'
          title: Image Confidence Threshold
          description: >-
            Minimum confidence threshold for layout detection boxes that are
            labelled as images
        strict_containment_filter:
          anyOf:
            - type: boolean
            - type: 'null'
          title: Strict Containment Filter
          description: >-
            If True, then filters out all boxes that are contained in a larger
            box, if False, it only filters out boxes of the same type (e.g it
            still extracts text boxes from inside images.)
        img_description_prompt:
          anyOf:
            - type: string
            - type: 'null'
          title: Img Description Prompt
          description: >-
            Custom prompt for image description (only used if img_method is
            'description')
        image_description_model:
          anyOf:
            - type: string
            - type: 'null'
          title: Image Description Model
          description: >-
            LLM model to use for image descriptions (only used if img_method is
            'description'). Examples: 'gpt-4o', 'gemini', 'gpt-4o-mini'
        e2e_ocr:
          anyOf:
            - type: string
            - type: 'null'
          title: E2E Ocr
          description: >-
            End-to-end OCR model that performs layout and content extraction
            simultaneously via the SGP API. When specified, bypasses layout
            detection and sends the full page image to this model. Mutually
            exclusive with layout/text_ocr/table_ocr. Example: 'openai/gpt-4o'
        e2e_response_parser:
          anyOf:
            - type: string
            - type: 'null'
          title: E2E Response Parser
          description: >-
            Response parser for the e2e OCR model. Required when e2e_ocr is set
            so the raw model output can be parsed into per-region bounding
            boxes. Example: 'deepseek_ocr2'
        e2e_prompt:
          anyOf:
            - type: string
            - type: 'null'
          title: E2E Prompt
          description: >-
            Custom prompt for end-to-end OCR model (only used when e2e_ocr is
            set)
      type: object
      title: IrisParseEngineOptions
    # Item type of BatchParseProgress.child_jobs. source_document_id, job_id
    # and status are required; parse_result_id and error are optional and
    # nullable.
    BatchChildJobInfo:
      properties:
        source_document_id:
          type: string
          title: Source Document Id
          description: Source document ID
        job_id:
          type: string
          title: Job Id
          description: Child job ID
        status:
          $ref: '#/components/schemas/JobStatus'
          description: Status of the child job
        parse_result_id:
          anyOf:
            - type: string
            - type: 'null'
          title: Parse Result Id
          description: Parse result ID if succeeded
        error:
          anyOf:
            - type: string
            - type: 'null'
          title: Error
          description: Error message if failed
      type: object
      required:
        - source_document_id
        - job_id
        - status
      title: BatchChildJobInfo
      description: Status of a single child job within a batch.
    # Chunking settings nested under ReductoParseEngineOptions.chunking.
    # No property is required.
    ReductoChunkingOptions:
      properties:
        # One of the ReductoChunkingMethod values; defaults to "variable".
        chunk_mode:
          $ref: '#/components/schemas/ReductoChunkingMethod'
          description: Chunking method
          default: variable
        # Integer or null; no default is declared.
        chunk_size:
          anyOf:
            - type: integer
            - type: 'null'
          title: Chunk Size
          description: Chunk size
      type: object
      title: ReductoChunkingOptions
    # Closed set of chunking method names accepted by
    # ReductoChunkingOptions.chunk_mode.
    ReductoChunkingMethod:
      type: string
      enum:
        - disabled
        - block
        - page
        - page_sections
        - section
        - variable
      title: ReductoChunkingMethod
      description: Chunking method used for parsing.
  # Both schemes are API keys carried in request headers: x-api-key and
  # x-selected-account-id.
  securitySchemes:
    ApiKey:
      type: apiKey
      in: header
      name: x-api-key
      description: API key for authentication
    AccountId:
      type: apiKey
      in: header
      name: x-selected-account-id
      description: Selected Account ID

````