> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# SQL Aggregation Jobs

export const schema = {
  "type": "object",
  "title": "SQL Aggregation",
  "description": "Use this job when you want to aggregate your data in some way.",
  "required": ["id", "inputCollection", "sql", "dataFormat", "type"],
  "properties": {
    "id": {
      "type": "string",
      "title": "Spark Job ID",
      "description": "The ID for this Spark job. Used in the API to reference this job. Allowed characters: a-z, A-Z, dash (-) and underscore (_). Maximum length: 63 characters.",
      "maxLength": 63,
      "pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
    },
    "sparkConfig": {
      "type": "array",
      "title": "Spark Settings",
      "description": "Spark configuration settings.",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "inputCollection": {
      "type": "string",
      "title": "Source Collection",
      "description": "Collection containing signals to be aggregated."
    },
    "outputCollection": {
      "type": "string",
      "title": "Output Collection",
      "description": "The collection to write the aggregates to on output. This property is required if the selected output / rollup pipeline requires it (the default pipeline does). A special value of '-' disables the output.",
      "hints": ["advanced"],
      "minLength": 1
    },
    "rows": {
      "type": "integer",
      "title": "Batch Size",
      "description": "Number of rows to read from the source collection per request.",
      "default": 10000,
      "hints": ["advanced"]
    },
    "sql": {
      "type": "string",
      "title": "SQL",
      "description": "Use SQL to perform the aggregation. You do not need to include a time range filter in the WHERE clause as it gets applied automatically before executing the SQL statement.",
      "hints": ["lengthy", "code/sql"],
      "minLength": 1
    },
    "rollupSql": {
      "type": "string",
      "title": "Rollup SQL",
      "description": "Use SQL to perform a rollup of previously aggregated docs. If left blank, the aggregation framework will supply a default SQL query to rollup aggregated metrics.",
      "hints": ["lengthy", "code/sql", "advanced"],
      "minLength": 1
    },
    "readOptions": {
      "type": "array",
      "title": "Read Options",
      "description": "Additional configuration settings to fine-tune how input records are read for this aggregation.",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "sourceCatchup": {
      "type": "boolean",
      "title": "Aggregate New and Merge with Existing",
      "description": "If checked, only aggregate new signals created since the last time the job was successfully run. If there is a record of such previous run then this overrides the starting time of time range set in 'timeRange' property. If unchecked, then all matching signals are aggregated and any previously aggregated docs are deleted to avoid double counting.",
      "default": true,
      "hints": ["advanced"]
    },
    "sourceRemove": {
      "type": "boolean",
      "title": "Remove signals from source",
      "description": "If checked, remove signals from source collection once aggregation job has finished running.",
      "default": false,
      "hints": ["advanced"]
    },
    "aggregationTime": {
      "type": "string",
      "title": "Aggregation Time",
      "description": "Timestamp to use for the aggregation results. Defaults to NOW.",
      "hints": ["advanced"],
      "format": "date-time"
    },
    "referenceTime": {
      "type": "string",
      "title": "Reference Time",
      "description": "Timestamp to use for computing decays and to determine the value of NOW.",
      "hints": ["advanced"],
      "format": "date-time"
    },
    "skipCheckEnabled": {
      "type": "boolean",
      "title": "Job Skip Check Enabled?",
      "description": "If the catch-up flag is enabled and this field is checked, the job framework will execute a fast Solr query to determine if this run can be skipped.",
      "default": true,
      "hints": ["advanced"]
    },
    "skipJobIfSignalsEmpty": {
      "type": "boolean",
      "title": "Skip Job run",
      "description": "Skip Job run if signals collection is empty",
      "hints": ["advanced"]
    },
    "parameters": {
      "type": "array",
      "title": "Parameters",
      "description": "Other aggregation parameters (e.g. timestamp field etc..).",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "signalTypes": {
      "type": "array",
      "title": "Signal Types",
      "description": "The signal types. If not set then any signal type is selected",
      "items": {
        "type": "string"
      }
    },
    "selectQuery": {
      "type": "string",
      "title": "Query",
      "description": "The query to select the desired input documents.",
      "default": "*:*",
      "hints": ["advanced"],
      "minLength": 1
    },
    "timeRange": {
      "type": "string",
      "title": "Time Range",
      "description": "The time range to select signals on.",
      "hints": ["advanced"],
      "minLength": 1
    },
    "useNaturalKey": {
      "type": "boolean",
      "title": "Use Natural Key?",
      "description": "Use a natural key provided in the raw signals data for aggregation, rather than relying on Solr UUIDs. Migrated aggregations jobs from Fusion 4 will need this set to false.",
      "default": true,
      "hints": ["advanced"]
    },
    "optimizeSegments": {
      "type": "integer",
      "title": "Optimize Segments",
      "description": "If set to a value above 0, the aggregator job will optimize the resulting Solr collection into this many segments",
      "default": 0,
      "hints": ["advanced"],
      "minimum": 0,
      "exclusiveMinimum": false
    },
    "dataFormat": {
      "type": "string",
      "title": "Data format",
      "description": "Spark-compatible format that contains training data (like 'solr', 'parquet', 'orc' etc)",
      "default": "solr",
      "minLength": 1
    },
    "sparkSQL": {
      "type": "string",
      "title": "Spark SQL filter query",
      "description": "Use this field to create a Spark SQL query for filtering your input data. The input data will be registered as spark_input",
      "default": "SELECT * from spark_input",
      "hints": ["code/sql", "advanced"]
    },
    "sparkPartitions": {
      "type": "integer",
      "title": "Set minimum Spark partitions for input",
      "description": "Spark will re-partition the input to have this number of partitions. Increase for greater parallelism",
      "default": 200,
      "hints": ["advanced"]
    },
    "type": {
      "type": "string",
      "title": "Spark Job Type",
      "enum": ["aggregation"],
      "default": "aggregation",
      "hints": ["readonly"]
    }
  },
  "additionalProperties": true,
  "category": "Other",
  "categoryPriority": 1
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/5/fusion/reference/config-ref/jobs/aggregations/sql-aggregation

[mintlify link]: https://doc.lucidworks.com/docs/5/fusion/reference/config-ref/jobs/aggregations/sql-aggregation

[old doc.lw link]: https://doc.lucidworks.com/fusion/5.9/8804

A Spark SQL aggregation job where user-defined parameters are injected into a built-in SQL template at runtime.

<LwTemplate />

## Configuration properties

<SchemaParamFields schema={schema} />
