> ## Documentation Index
> Fetch the complete documentation index at: https://docs.scrapio.dev/llms.txt
> Use this file to discover all available pages before exploring further.

# Crawl

> Crawl a site starting from one or more seed URLs, following links up to a configurable depth.



## OpenAPI

````yaml post /v1/crawl
# OpenAPI specification version this document is written against.
openapi: "3.1.0"
# API-level metadata: title, version, description, support contact, license.
info:
  title: Scrapio
  version: "1.0.0"
  description: >-
    Turn any URL into structured data. Fetch raw HTML, crawl sites,
    extract structured content, interact with dynamic pages, and search
    the web through one API.
  contact:
    name: Scrapio Support
    url: https://scrapio.dev
  license:
    name: Proprietary
# Base URLs the paths in this document are served from.
servers:
  - description: Production
    url: https://api.scrapio.dev
  - description: Local development
    url: http://localhost:3000
# Document-wide security requirement: operations use the `bearerAuth` scheme
# (declared under components.securitySchemes). The empty list means no scopes.
security:
  - bearerAuth: []
# Tag names available for grouping operations. The only operation visible in
# this document (POST /v1/crawl) is tagged `Crawl`.
tags:
  - name: System
  - name: Fetch
  - name: Google
  - name: Crawl
  - name: Map
  - name: Interact
  - name: Search
  - name: Jobs
  - name: YouTube
  - name: Amazon
  - name: Walmart
paths:
  # Inline crawl endpoint: pages are fetched during the request and returned in
  # the response body.
  # NOTE(review): the description says "same-origin" links, while CrawlRequest
  # exposes a `same_domain_only` flag; confirm which scoping rule applies.
  /v1/crawl:
    post:
      tags:
        - Crawl
      summary: Crawl a site from seed URLs
      description: >-
        Breadth-first crawls one or more seed URLs, following same-origin links
        up to `max_pages` and `max_depth`. Each page is fetched inline and
        returned with its own status; the overall response is `partial` if any
        page failed. For large crawls, submit via `POST /v1/jobs` with `kind:
        "crawl"` instead.
      operationId: crawlSite
      requestBody:
        required: true
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/CrawlRequest'
            examples:
              basic:
                summary: Crawl up to 5 pages as Markdown
                value:
                  seeds:
                    - https://example.com
                  max_pages: 5
                  max_depth: 2
                  output:
                    - markdown
      responses:
        # Success returns CrawlResponse; every documented error status shares
        # the ErrorResponse envelope.
        '200':
          description: Crawl completed; per-page results and a summary are returned.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CrawlResponse'
        '400':
          description: Invalid crawl parameters.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '401':
          description: Missing or invalid API key.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
        '429':
          description: Tenant daily credit cap exceeded.
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/ErrorResponse'
      # The Python and Node.js samples check the HTTP status before reading
      # `result`: error responses use ErrorResponse, which has no `result`
      # member, so unguarded access would fail with KeyError / TypeError.
      x-codeSamples:
        - lang: cURL
          label: cURL
          source: |-
            curl -X POST https://api.scrapio.dev/v1/crawl \
              -H "Authorization: Bearer $SCRAPIO_API_KEY" \
              -H "Content-Type: application/json" \
              -d '{"seeds":["https://example.com"],"max_pages":5,"output":["markdown"]}'
        - lang: Python
          label: Python
          source: |-
            import os, requests
            resp = requests.post(
                "https://api.scrapio.dev/v1/crawl",
                headers={"Authorization": f"Bearer {os.environ['SCRAPIO_API_KEY']}"},
                json={"seeds": ["https://example.com"], "max_pages": 5, "output": ["markdown"]},
            )
            resp.raise_for_status()  # error responses carry "error", not "result"
            for page in resp.json()["result"]["pages"]:
                print(page["url"], page["status"])
        - lang: JavaScript
          label: Node.js
          source: |-
            const resp = await fetch("https://api.scrapio.dev/v1/crawl", {
              method: "POST",
              headers: { Authorization: `Bearer ${process.env.SCRAPIO_API_KEY}`, "Content-Type": "application/json" },
              body: JSON.stringify({ seeds: ["https://example.com"], max_pages: 5, output: ["markdown"] }),
            });
            if (!resp.ok) throw new Error(`Crawl request failed with HTTP ${resp.status}`);
            const { result } = await resp.json();
            result.pages.forEach(p => console.log(p.url, p.status));
components:
  schemas:
    # Request body for POST /v1/crawl. Only `seeds` is required; unknown
    # top-level keys are rejected (additionalProperties: false).
    CrawlRequest:
      type: object
      additionalProperties: false
      required: [seeds]
      properties:
        # Starting URLs; at least one must be supplied.
        seeds:
          type: array
          minItems: 1
          items:
            type: string
            format: uri
        # Page limit for the crawl: an integer from 1 to 50.
        max_pages:
          type: integer
          exclusiveMinimum: 0
          maximum: 50
        # Link-depth limit for the crawl: an integer from 0 to 5.
        max_depth:
          type: integer
          minimum: 0
          maximum: 5
        # NOTE(review): presumably restricts followed links to the seeds'
        # domain; no default is declared here - confirm against the server.
        same_domain_only:
          type: boolean
        # Requested output formats; when present, at least one entry.
        output:
          type: array
          minItems: 1
          items:
            type: string
            enum: [html, markdown, json]
        # Optional extraction settings. Four alternative shapes, told apart by
        # the single allowed value of `mode`; each rejects unknown keys.
        extract:
          anyOf:
            # mode "page": takes no further options.
            - type: object
              additionalProperties: false
              required: [mode]
              properties:
                mode:
                  type: string
                  enum: [page]
            # mode "selectors": `fields` maps a field name to a selector spec
            # ({selector, type, optional attribute}).
            - type: object
              additionalProperties: false
              required: [mode, fields]
              properties:
                mode:
                  type: string
                  enum: [selectors]
                fields:
                  type: object
                  propertyNames:
                    type: string
                  additionalProperties:
                    type: object
                    additionalProperties: false
                    required: [selector, type]
                    properties:
                      selector:
                        type: string
                        minLength: 1
                      type:
                        type: string
                        enum: [text, html, attr]
                      # NOTE(review): presumably only meaningful when
                      # type is "attr"; the schema does not enforce that.
                      attribute:
                        type: string
                        minLength: 1
            # mode "schema": `schema` maps string keys to string values.
            - type: object
              additionalProperties: false
              required: [mode, schema]
              properties:
                mode:
                  type: string
                  enum: [schema]
                schema:
                  type: object
                  propertyNames:
                    type: string
                  additionalProperties:
                    type: string
            # mode "instruction": a non-empty instruction string.
            - type: object
              additionalProperties: false
              required: [mode, instruction]
              properties:
                mode:
                  type: string
                  enum: [instruction]
                instruction:
                  type: string
                  minLength: 1
        # Integer from 1 to 300000 (presumably milliseconds, per the name).
        timeout_ms:
          type: integer
          exclusiveMinimum: 0
          maximum: 300000
    # Body of the 200 response from POST /v1/crawl. All eight top-level
    # members are required. Page depth and the summary counters are typed as
    # integers, matching the integer `max_depth` / `max_pages` request limits.
    CrawlResponse:
      type: object
      properties:
        id:
          type: string
        request_id:
          type: string
        # Always "crawl" for this endpoint.
        kind:
          type: string
          enum:
            - crawl
        # Always "inline" for this endpoint.
        mode:
          type: string
          enum:
            - inline
        # Overall outcome; per the operation description, `partial` means at
        # least one page failed.
        status:
          type: string
          enum:
            - completed
            - partial
        # Per-step records; `error` is optional on each step.
        steps:
          type: array
          items:
            type: object
            properties:
              step_id:
                type: string
              type:
                type: string
              status:
                type: string
                enum:
                  - completed
                  - failed
              # NOTE(review): presumably timestamps; no `format` is declared.
              started_at:
                type: string
              completed_at:
                type: string
              error:
                type: object
                properties:
                  code:
                    type: string
                  message:
                    type: string
                required:
                  - code
                  - message
            required:
              - step_id
              - type
              - status
              - started_at
              - completed_at
        result:
          type: object
          properties:
            seeds:
              type: array
              items:
                type: string
            # One entry per page; `discovered_from`, `outputs` and `error` are
            # optional.
            pages:
              type: array
              items:
                type: object
                properties:
                  url:
                    type: string
                  # Whole-number link depth (was `number`, which also admits
                  # fractional values).
                  depth:
                    type: integer
                  discovered_from:
                    type: string
                  status:
                    type: string
                    enum:
                      - completed
                      - partial
                      - failed
                  # Free-form object: any string key, any value.
                  outputs:
                    type: object
                    propertyNames:
                      type: string
                    additionalProperties: {}
                  error:
                    type: object
                    properties:
                      code:
                        type: string
                      message:
                        type: string
                    required:
                      - code
                      - message
                required:
                  - url
                  - depth
                  - status
            # Page counters; whole numbers, so typed as integers.
            summary:
              type: object
              properties:
                pages_discovered:
                  type: integer
                pages_fetched:
                  type: integer
                pages_succeeded:
                  type: integer
                pages_failed:
                  type: integer
                pages_skipped:
                  type: integer
              required:
                - pages_discovered
                - pages_fetched
                - pages_succeeded
                - pages_failed
                - pages_skipped
          required:
            - seeds
            - pages
            - summary
        # `step_errors` is always present; `url` is optional on each entry.
        diagnostics:
          type: object
          properties:
            step_errors:
              type: array
              items:
                type: object
                properties:
                  step_id:
                    type: string
                  code:
                    type: string
                  message:
                    type: string
                  url:
                    type: string
                required:
                  - step_id
                  - code
                  - message
          required:
            - step_errors
      required:
        - id
        - request_id
        - kind
        - mode
        - status
        - steps
        - result
        - diagnostics
    # Error envelope used by the documented 400, 401 and 429 responses.
    # Unknown top-level keys are not allowed.
    ErrorResponse:
      type: object
      additionalProperties: false
      required: [request_id, error]
      properties:
        request_id:
          type: string
        # Error details; both `code` and `message` are required strings.
        error:
          type: object
          required: [code, message]
          properties:
            code:
              type: string
            message:
              type: string
        # Optional; when present, `outcome` (always "failed") and `retryable`
        # are mandatory, `blocked` and `timed_out` are optional flags.
        diagnostics:
          type: object
          additionalProperties: false
          required: [outcome, retryable]
          properties:
            outcome:
              type: string
              enum: [failed]
            retryable:
              type: boolean
            blocked:
              type: boolean
            timed_out:
              type: boolean
  # Authentication schemes referenced by the top-level `security` requirement.
  securitySchemes:
    # HTTP bearer authentication; the code samples send the API key as
    # `Authorization: Bearer <key>`.
    bearerAuth:
      type: http
      scheme: bearer
      bearerFormat: API Key

````