> ## Documentation Index
> Fetch the complete documentation index at: https://doc.lucidworks.com/llms.txt
> Use this file to discover all available pages before exploring further.

# Data Augmentation Job

export const schema = {
  // JSON-schema-style definition of the Data Augmentation job configuration.
  // It is passed to <SchemaParamFields schema={schema} /> further down this page,
  // so every "title" and "description" string here is rendered as documentation.
  "type": "object",
  "title": "Data Augmentation",
  "description": "Use this job to perform Text Augmentation",
  "required": ["id", "trainingCollection", "trainingFormat", "outputCollection", "outputFormat", "includeOriginalData", "type"],
  "properties": {
    "id": {
      "type": "string",
      "title": "Job ID",
      "description": "The ID for this job. Used in the API to reference this job. Allowed characters: a-z, A-Z, 0-9, dash (-) and underscore (_)",
      "maxLength": 63,
      "pattern": "[a-zA-Z][_\\-a-zA-Z0-9]*[a-zA-Z0-9]?"
    },
    "sparkConfig": {
      "type": "array",
      "title": "Additional parameters",
      "description": "Provide additional key/value pairs to be injected into the training JSON map at runtime. Values will be inserted as-is, so use \" to surround string values",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "writeOptions": {
      "type": "array",
      "title": "Write Options",
      "description": "Options used when writing output to Solr or other sources",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "readOptions": {
      "type": "array",
      "title": "Read Options",
      "description": "Options used when reading input from Solr or other sources.",
      "hints": ["advanced"],
      "items": {
        "type": "object",
        "required": ["key"],
        "properties": {
          "key": {
            "type": "string",
            "title": "Parameter Name"
          },
          "value": {
            "type": "string",
            "title": "Parameter Value"
          }
        }
      }
    },
    "trainingCollection": {
      "type": "string",
      "title": "Input path",
      "description": "Solr collection or cloud storage path where training data is present.",
      "minLength": 1
    },
    "trainingFormat": {
      "type": "string",
      "title": "Input format",
      "description": "The format of the training data - solr, parquet etc.",
      "minLength": 1
    },
    "trainingDataFilterQuery": {
      "type": "string",
      "title": "Training Data Filter Query",
      "description": "Solr or SQL query to filter training data. Use solr query when solr collection is specified in Training Path. Use SQL query when cloud storage location is specified. The table name for SQL is `spark_input`",
      "hints": ["code/sql", "advanced"]
    },
    "randomSeed": {
      "type": "integer",
      "title": "Random Seed",
      "description": "Pseudorandom determinism fixed by keeping this seed constant",
      "default": 12345,
      "hints": ["advanced"]
    },
    "trainingSampleFraction": {
      "type": "number",
      "title": "Training Data Sampling Fraction",
      "description": "Choose a fraction of the data for training.",
      "default": 1,
      "hints": ["advanced"],
      "maximum": 1,
      "exclusiveMaximum": false
    },
    "batchSize": {
      "type": "string",
      "title": "Batch Size",
      "description": "If writing to solr, this field defines the batch size for documents to be pushed to solr.",
      "default": "15000",
      "hints": ["advanced"]
    },
    "outputCollection": {
      "type": "string",
      "title": "Output path",
      "description": "Output collection to store generated augmented data.",
      "minLength": 1
    },
    "outputFormat": {
      "type": "string",
      "title": "Output Format",
      "description": "The format of the output data - solr, parquet etc.",
      "minLength": 1
    },
    "partitionFields": {
      "type": "string",
      "title": "Partition fields",
      "description": "If writing to non-Solr sources, this field will accept a comma-delimited list of column names for partitioning the dataframe before writing to the external output",
      "hints": ["advanced"]
    },
    "secretName": {
      "type": "string",
      "title": "Cloud storage secret name",
      "description": "Name of the secret used to access cloud storage as defined in the K8s namespace",
      "hints": ["advanced"],
      "minLength": 1
    },
    "backTranslations": {
      "type": "array",
      "title": "Back Translation",
      "description": "Augment data via translation to a different language and then translating back to original language. Chain of languages can be used for translation. Works at sentence level for medium-long length text. GPU recommended and will be used when available.",
      "items": {
        "type": "object",
        "required": ["fieldname", "inputLanguage"],
        "properties": {
          "fieldname": {
            "type": "string",
            "title": "Field Name",
            "description": "Name of the input field to augment.",
            "minLength": 1
          },
          "inputLanguage": {
            "type": "string",
            "title": "Input data Language",
            "description": "Language of input data.",
            "enum": ["English", "French", "German", "Italian", "Spanish", "Dutch", "Polish", "Hebrew", "Ukrainian", "Chinese", "Japanese", "Korean"],
            "minLength": 1
          },
          "intermediateLanguage": {
            "type": "string",
            "title": "Intermediate Language",
            "description": "Specify languages in order to be used in back translation separated by comma. Only use languages present in input data language dropdown. Bigger chains will take more time to augment.",
            "default": "German",
            "pattern": "((?:English|German|French|Italian|Spanish|Dutch|Polish|Ukrainian|Hebrew|Chinese|Japanese|Korean)*(\\s)*(,)*(\\s)*){0,12}"
          },
          "batchSize": {
            "type": "integer",
            "title": "Batch Size",
            "description": "Number of input data samples to back-translate at a time. Important if Use GPU is checked to avoid memory overflow.",
            "default": 256,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "beamSize": {
            "type": "integer",
            "title": "Beam Size",
            "description": "Number of beams to evaluate during translation. Use higher number if translation is poor. Higher number will increase execution time and memory use.",
            "default": 1,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "minSentenceLength": {
            "type": "integer",
            "title": "Min translation length (tokens)",
            "description": "Do not back translate sentences shorter than specified length in tokens. If the value is more than max translation length, then max translation length will be used.",
            "default": 40,
            "hints": ["advanced"],
            "maximum": 510,
            "exclusiveMaximum": false,
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "maxSentenceLength": {
            "type": "integer",
            "title": "Max translation length (tokens)",
            "description": "Do not back translate sentences longer than specified length in tokens. If the value is less than min translation length, then min translation length will be used.",
            "default": 240,
            "hints": ["advanced"],
            "maximum": 510,
            "exclusiveMaximum": false,
            "minimum": 0,
            "exclusiveMinimum": false
          }
        }
      }
    },
    "keyStrokeMisspellings": {
      "type": "array",
      "title": "Keystroke Misspellings",
      "description": "Augment data via insertion, substitution, swapping and deletion of characters based on keyboard layout. Useful for short text.",
      "items": {
        "type": "object",
        "required": ["fieldname", "inputLanguage"],
        "properties": {
          "fieldname": {
            "type": "string",
            "title": "Field Name",
            "description": "Name of the input field to augment.",
            "minLength": 1
          },
          "inputLanguage": {
            "type": "string",
            "title": "Input data Language",
            "description": "Language of input data.",
            "enum": ["English", "French", "German", "Italian", "Spanish", "Dutch", "Polish", "Hebrew", "Ukrainian"],
            "minLength": 1
          },
          "minCharAugment": {
            "type": "integer",
            "title": "Minimum Chars to Augment",
            "description": "Minimum number of characters to augment in each word. If the value is more than Maximum Chars to Augment, then Maximum Chars to Augment will be used.",
            "default": 1,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "maxCharAugment": {
            "type": "integer",
            "title": "Maximum Chars to Augment",
            "description": "Maximum number of characters to augment in each word. If the value is less than Minimum Chars to Augment, then Minimum Chars to Augment will be used.",
            "default": 2,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "minWordsToAugment": {
            "type": "integer",
            "title": "Min words to Augment",
            "description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
            "default": 2,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "maxWordsToAugment": {
            "type": "integer",
            "title": "Max words to Augment",
            "description": "Maximum number of words to be augmented in input text. It should be greater than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
            "default": 10,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "wordPercentageToAugment": {
            "type": "number",
            "title": "Percentage words to Augment",
            "description": "Percentage of words in input text to augment. If specified this will be used instead of minimum/maximum number of words to augment value.",
            "default": 0.2,
            "hints": ["advanced"],
            "maximum": 1,
            "exclusiveMaximum": false
          },
          "keywordsBlobName": {
            "type": "string",
            "title": "Keystroke Mapping",
            "description": "Keystroke Mapping for required language in JSON format from blob store.",
            "hints": ["advanced"],
            "reference": "blob",
            "blobType": "file:spark"
          }
        }
      }
    },
    "synonymSubstitutions": {
      "type": "array",
      "title": "Synonym Substitution",
      "description": "Augment data via substituting words using synonyms from wordnet or user supplied dictionary. Useful for short, medium and long text. Faster and less resource intensive than back translation.",
      "items": {
        "type": "object",
        "required": ["fieldname", "inputLanguage"],
        "properties": {
          "fieldname": {
            "type": "string",
            "title": "Field Name",
            "description": "Name of the input field to augment.",
            "minLength": 1
          },
          "inputLanguage": {
            "type": "string",
            "title": "Input data Language",
            "description": "Language of input data.",
            "enum": ["English", "French", "German", "Italian", "Spanish", "Dutch", "Polish", "Hebrew", "Chinese", "Japanese"],
            "minLength": 1
          },
          "minWordsToAugment": {
            "type": "integer",
            "title": "Min words to Augment",
            "description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
            "default": 2,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "maxWordsToAugment": {
            "type": "integer",
            "title": "Max words to Augment",
            "description": "Maximum number of words to be augmented in input text. It should be greater than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
            "default": 10,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "wordPercentageToAugment": {
            "type": "number",
            "title": "Percentage of words to Augment",
            "description": "Percentage of words in input text to augment. If specified this will be used instead of minimum/maximum number of words to augment value.",
            "default": 0.2,
            "hints": ["advanced"],
            "maximum": 1,
            "exclusiveMaximum": false
          },
          "stopwordsBlobName": {
            "type": "string",
            "title": "Synonym Dictionary Name",
            "description": "Wordnet format dictionary to use from blob store",
            "hints": ["advanced"],
            "reference": "blob",
            "blobType": "file:spark"
          }
        }
      }
    },
    "splitWords": {
      "type": "array",
      "title": "Split Words",
      "description": "Augment data via splitting some words. Useful for short, medium and long text.",
      "items": {
        "type": "object",
        "required": ["fieldname", "inputLanguage"],
        "properties": {
          "fieldname": {
            "type": "string",
            "title": "Field Name",
            "description": "Name of the input field to augment.",
            "minLength": 1
          },
          "inputLanguage": {
            "type": "string",
            "title": "Input data Language",
            "description": "Language of input data.",
            "enum": ["English", "French", "German", "Italian", "Spanish", "Dutch", "Polish"],
            "minLength": 1
          },
          "minWordLength": {
            "type": "integer",
            "title": "Minimum Word Length",
            "description": "Do not augment words less than this length (in characters). If the value is more than maximum word length, then maximum word length will be used.",
            "default": 4,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "minWordsToAugment": {
            "type": "integer",
            "title": "Min words to Augment",
            "description": "Minimum number of words to be augmented in input text. It should be less than maximum words to augment otherwise max value will be used. Suggested value is 2.",
            "default": 2,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "maxWordsToAugment": {
            "type": "integer",
            "title": "Max words to Augment",
            "description": "Maximum number of words to be augmented in input text. It should be greater than minimum words to augment otherwise min value will be auto-adjusted. Suggested value is 10.",
            "default": 10,
            "hints": ["advanced"],
            "minimum": 0,
            "exclusiveMinimum": false
          },
          "wordPercentageToAugment": {
            "type": "number",
            "title": "Percentage of words to Augment",
            "description": "Percentage of words in input text to augment. If specified this will be used instead of minimum/maximum number of words to augment value.",
            "default": 0.2,
            "hints": ["advanced"],
            "maximum": 1,
            "exclusiveMaximum": false
          }
        }
      }
    },
    "includeOriginalData": {
      "type": "boolean",
      "title": "Include original data",
      "description": "When checked original data will be included in the augmented dataset",
      "default": true
    },
    "type": {
      "type": "string",
      "title": "Spark Job Type",
      "enum": ["argo-data-augmentation"],
      "default": "argo-data-augmentation",
      "hints": ["readonly"]
    }
  },
  "additionalProperties": true,
  "category": "Other",
  "categoryPriority": 1,
  // Groups used to organise the top-level properties above under two labels.
  "propertyGroups": [{
    "label": "Input/Output Parameters",
    "properties": ["trainingCollection", "trainingFormat", "trainingDataFilterQuery", "trainingSampleFraction", "randomSeed", "batchSize", "outputCollection", "outputFormat", "partitionFields", "secretName", "includeOriginalData"]
  }, {
    "label": "Augmentation Parameters",
    "properties": ["backTranslations", "keyStrokeMisspellings", "synonymSubstitutions", "splitWords"]
  }]
};

export const SchemaParamFields = ({schema}) => {
  const sanitize = str => {
    if (typeof str !== "string") return str;
    return str.replace(/^"(.*)"$/s, "$1").replace(/\\/g, "").replace(/"/g, "'");
  };
  const formatDescription = str => {
    const s = sanitize(str);
    return (/[.!?]\)*$/).test(s) ? s : `${s}.`;
  };
  const {description, properties = {}, required: requiredProps = []} = schema;
  const visibleProps = useMemo(() => Object.entries(properties).filter(([, prop]) => !prop.hints?.includes("hidden")), [properties]);
  return <div>
      {description && <p>{formatDescription(description)}</p>}

      {visibleProps.map(([name, prop]) => {
    const isRequired = requiredProps.includes(name);
    const hasDefault = prop.default !== undefined;
    const rawDefault = prop.default;
    const isComplexDefault = hasDefault && (typeof rawDefault === "object" || typeof rawDefault === "string" && (rawDefault.length > 20 || rawDefault.includes('"')));
    const fieldProps = {
      key: name,
      body: prop.title || name,
      type: prop.type,
      ...prop.title && ({
        post: [<><span className="text-stone-400 dark:text-stone-500">API property: </span>{name}</>]
      }),
      ...isRequired && ({
        required: true
      }),
      ...!isComplexDefault && hasDefault ? {
        default: sanitize(String(rawDefault))
      } : {}
    };
    const isObject = prop.type === "object" && prop.properties;
    const isArrayOfObjects = prop.type === "array" && prop.items?.type === "object" && prop.items.properties;
    return <ParamField {...fieldProps}>
            {prop.description && <p>{formatDescription(prop.description)}</p>}

            {isComplexDefault && <div className="flex">
                <p>
                  <strong>Default:</strong>
                </p>
                <pre className="!my-0">
                  <code>
                    {JSON.stringify(rawDefault, null, 2)}
                  </code>
                </pre>
              </div>}

            {isArrayOfObjects && <div className="flex">
              <p>
                <strong>Object attributes:</strong>
              </p>
              <pre className="!my-0">
                <code>
                  {'{\n'}
                  {Object.entries(prop.items.properties).map(([iname, iprop]) => <>
                      {`  ${iname}`}
                      {prop.items?.required?.includes(iname) && <span style={{
      color: 'red'
    }}> required</span>}
                      {`: {\n    display name: ${sanitize(iprop.title || '')}\n    type: ${iprop.type}\n  }\n`}
                    </>)}
                  {'}'}
                </code>
              </pre>
              </div>}

            {isObject && <Expandable title="properties">
                <SchemaParamFields schema={{
      properties: prop.properties,
      required: prop.required
    }} />
              </Expandable>}
          </ParamField>;
  })}
    </div>;
};

export const LwTemplate = ({title = "Key questions to get you started", icon = "sparkles", cta = "Powered by Agent Studio", linkHref = "https://lucidworks.com/demo/?utm_source=docs&utm_medium=referral&utm_campaign=docs_cta_ai"}) => {
  const [isLoaded, setIsLoaded] = useState(false);
  useEffect(() => {
    const timer = setTimeout(() => {
      setIsLoaded(true);
    }, 500);
    return () => clearTimeout(timer);
  }, []);
  return <div className="lw-template-container">
      <Card title={title} icon={icon}>
        {isLoaded && <span dangerouslySetInnerHTML={{
    __html: `<lw-template id="a029c1a9-28be-427e-b0e1-5d918920246a"></lw-template
            >`
  }} />}
        <Link href={linkHref} className="agent-studio-link text-left text-gray-600 gap-2 dark:text-gray-400 text-sm font-medium flex flex-row items-center hover:text-primary dark:hover:text-primary-light group-hover:text-primary group-hover:dark:text-primary-light">Powered by Lucidworks Agent Studio</Link>
      </Card>
    </div>;
};

[localhost link]: http://localhost:3000/docs/5/fusion/reference/config-ref/jobs/data-augmentation

[mintlify link]: https://doc.lucidworks.com/docs/5/fusion/reference/config-ref/jobs/data-augmentation

[old doc.lw link]: https://doc.lucidworks.com/fusion/5.9/11256

Use this job to augment training and/or testing data for use with other jobs, such as Smart Answer, Classification, and Recommender.

This job takes in data specified by the user, performs one or more of the specified augmentation tasks, and writes the output back to Solr or Cloud.

<LwTemplate />

## The Benefits of Augmentation Tasks

Augmentation tasks can improve models by adding the augmented data back into the training set, thereby increasing the quantity of training data when there isn’t enough.
They can also allow you to test the robustness of the models by training them on the source data, then testing them on augmented data. Both introduce variation that will make the models
better equipped to handle different types of text. For more details on this process, see [Data Augmentation](https://en.wikipedia.org/wiki/Data_augmentation).

The amount of extra augmented data generated will depend on the task and the parameters used. In an ideal scenario with one task applied to one field and little to no record filtering,
you can expect to double the amount of the original data.

## Types of tasks

Each task supports a variety of languages. Refer to the description of each task for details.

* **Backtranslation**\
  Translates the input data into one or more intermediate languages before translating it back to the source language. The process introduces changes in the syntax and grammar of the input text without changing the semantics. Because this task uses a deep learning model, [Facebook’s M2M-100](https://arxiv.org/pdf/2010.11125.pdf), to perform translations, a GPU is recommended for fast processing.\
  If the backtranslation is of poor quality, try increasing the beam size. However, this will consume more memory and take more time. You could also try changing the intermediate languages to use languages that are similar to each other. For example, if your source language is Korean, translating to Chinese and/or Japanese and back might give you better results than translating to Spanish.\
  Use the synonym substitution job as an alternative if you’re unable to provision the necessary hardware and/or this job is taking too long. Note that the synonym substitution job does not support the same languages.

  **Supported Languages:** Chinese, Dutch, English, French, German, Hebrew, Italian, Japanese, Korean, Polish, Spanish, Ukrainian

  <Note>
    Backtranslation with Korean text may result in errors if run on GKE with Kubernetes master version v1.16.15-gke.4901, Kernel version: 4.19.112+, and Container runtime version: docker://19.3.1 on Google’s Container Optimized OS for Docker. To resolve, upgrade to a higher version of K8s master, kernel, and container runtime.
  </Note>

* **Synonym Substitution**\
  Takes in the input text and substitutes some words with synonyms derived from the included WordNet/PPDB dictionaries or user-supplied dictionaries. The user-supplied dictionaries must be submitted in the Lucene/Solr synonym format as shown in the example below.\
  Example **synonyms.txt** file:

  ```
  #some test synonym mappings unlikely to appear in real input text
  aaa => aaaa
  bbb => bbbb1 bbbb2
  ccc => cccc1,cccc2
  a\=>a => b\=>b
  a\,a => b\,b
  fooaaa,baraaa,bazaaa

  # Some synonym groups specific to this example
  GB,gib,gigabyte,gigabytes
  MB,mib,megabyte,megabytes
  Television, Televisions, TV, TVs
  #notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
  #after us won't split it into two words.

  # Synonym mappings can be used for spelling correction too
  pixima => pixma
  ```

  **Supported Languages**: Chinese, Dutch, English, French, German, Hebrew, Italian, Japanese, Polish, Spanish

  <Note>
    Boosted synonyms are not supported. This synonym mapping file should be uploaded to the blob store, and a blob store path should then be passed to the job.
  </Note>

* **Keystroke Misspelling**\
  Simulates typos one might make based on the layout of the keyboard. For example, when typing in English on a QWERTY keyboard layout, a user might accidentally replace the “y” with a “t” while typing the word “keyboard” because “y” and “t” are next to each other on the keyboard. Currently, only QWERTY keyboard layouts are supported.\
  The user can provide their own keyboard mapping as a JSON file uploaded to the Fusion blob store. The JSON file should be in the following format: `{"a":"x", "b":"v", …}`.

  **Supported Languages**: Dutch, English, French, German, Hebrew, Italian, Polish, Spanish, Ukrainian

* **Split word**\
  Randomly splits words by introducing a space `" "` at some random point in the word.

  **Supported Languages**: Dutch, English, French, German, Italian, Polish, Spanish

## Configuration properties

<SchemaParamFields schema={schema} />
