Create a custom analyzer
- Create a custom analyzer
  - Configuration
  - Example configuration

Create a custom analyzer

When the built-in analyzers do not fulfill your needs, you can create a custom analyzer which uses the appropriate combination of:

zero or more character filters
a tokenizer
zero or more token filters.

Configuration

The custom analyzer accepts the following parameters:

`type`	Analyzer type. Accepts built-in analyzer types. For custom analyzers, use `custom` or omit this parameter.
`tokenizer`	A built-in or customised tokenizer. (Required)
`char_filter`	An optional array of built-in or customised character filters.
`filter`	An optional array of built-in or customised token filters.
`position_increment_gap`	When indexing an array of text values, Elasticsearch inserts a fake “gap” between the last term of one value and the first term of the next value to ensure that a phrase query doesn’t match two terms from different array elements. Defaults to `100`. See position_increment_gap for more.

Example configuration

Here is an example that combines the following:

Character Filter

HTML Strip Character Filter

Tokenizer

Standard Tokenizer

Token Filters

Lowercase Token Filter
ASCII Folding Token Filter

resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_custom_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "char_filter": [
                        "html_strip"
                    ],
                    "filter": [
                        "lowercase",
                        "asciifolding"
                    ]
                }
            }
        }
    },
)
print(resp)
resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_custom_analyzer",
    text="Is this <b>déjà vu</b>?",
)
print(resp1)

response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_custom_analyzer: {
            type: 'custom',
            tokenizer: 'standard',
            char_filter: [
              'html_strip'
            ],
            filter: [
              'lowercase',
              'asciifolding'
            ]
          }
        }
      }
    }
  }
)
puts response
response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_custom_analyzer',
    text: 'Is this <b>déjà vu</b>?'
  }
)
puts response

const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_custom_analyzer: {
          type: "custom",
          tokenizer: "standard",
          char_filter: ["html_strip"],
          filter: ["lowercase", "asciifolding"],
        },
      },
    },
  },
});
console.log(response);
const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_custom_analyzer",
  text: "Is this <b>déjà vu</b>?",
});
console.log(response1);

PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": {
          "type": "custom", 
          "tokenizer": "standard",
          "char_filter": [
            "html_strip"
          ],
          "filter": [
            "lowercase",
            "asciifolding"
          ]
        }
      }
    }
  }
}
POST my-index-000001/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "Is this <b>déjà vu</b>?"
}

For custom analyzers, use a `type` of `custom` or omit the `type` parameter.

The above example produces the following terms:

[ is, this, deja, vu ]

The previous example used a tokenizer, token filters, and character filters with their default configurations, but it is possible to create configured versions of each and to use them in a custom analyzer.

Here is a more complicated example that combines the following:

Character Filter

Mapping Character Filter, configured to replace :) with _happy_ and :( with _sad_

Tokenizer

Pattern Tokenizer, configured to split on punctuation characters

Token Filters

Lowercase Token Filter
Stop Token Filter, configured to use the pre-defined list of English stop words

Here is an example:

resp = client.indices.create(
    index="my-index-000001",
    settings={
        "analysis": {
            "analyzer": {
                "my_custom_analyzer": {
                    "char_filter": [
                        "emoticons"
                    ],
                    "tokenizer": "punctuation",
                    "filter": [
                        "lowercase",
                        "english_stop"
                    ]
                }
            },
            "tokenizer": {
                "punctuation": {
                    "type": "pattern",
                    "pattern": "[ .,!?]"
                }
            },
            "char_filter": {
                "emoticons": {
                    "type": "mapping",
                    "mappings": [
                        ":) => _happy_",
                        ":( => _sad_"
                    ]
                }
            },
            "filter": {
                "english_stop": {
                    "type": "stop",
                    "stopwords": "_english_"
                }
            }
        }
    },
)
print(resp)
resp1 = client.indices.analyze(
    index="my-index-000001",
    analyzer="my_custom_analyzer",
    text="I'm a :) person, and you?",
)
print(resp1)

response = client.indices.create(
  index: 'my-index-000001',
  body: {
    settings: {
      analysis: {
        analyzer: {
          my_custom_analyzer: {
            char_filter: [
              'emoticons'
            ],
            tokenizer: 'punctuation',
            filter: [
              'lowercase',
              'english_stop'
            ]
          }
        },
        tokenizer: {
          punctuation: {
            type: 'pattern',
            pattern: '[ .,!?]'
          }
        },
        char_filter: {
          emoticons: {
            type: 'mapping',
            mappings: [
              ':) => _happy_',
              ':( => _sad_'
            ]
          }
        },
        filter: {
          english_stop: {
            type: 'stop',
            stopwords: '_english_'
          }
        }
      }
    }
  }
)
puts response
response = client.indices.analyze(
  index: 'my-index-000001',
  body: {
    analyzer: 'my_custom_analyzer',
    text: "I'm a :) person, and you?"
  }
)
puts response

const response = await client.indices.create({
  index: "my-index-000001",
  settings: {
    analysis: {
      analyzer: {
        my_custom_analyzer: {
          char_filter: ["emoticons"],
          tokenizer: "punctuation",
          filter: ["lowercase", "english_stop"],
        },
      },
      tokenizer: {
        punctuation: {
          type: "pattern",
          pattern: "[ .,!?]",
        },
      },
      char_filter: {
        emoticons: {
          type: "mapping",
          mappings: [":) => _happy_", ":( => _sad_"],
        },
      },
      filter: {
        english_stop: {
          type: "stop",
          stopwords: "_english_",
        },
      },
    },
  },
});
console.log(response);
const response1 = await client.indices.analyze({
  index: "my-index-000001",
  analyzer: "my_custom_analyzer",
  text: "I'm a :) person, and you?",
});
console.log(response1);

PUT my-index-000001
{
  "settings": {
    "analysis": {
      "analyzer": {
        "my_custom_analyzer": { 
          "char_filter": [
            "emoticons"
          ],
          "tokenizer": "punctuation",
          "filter": [
            "lowercase",
            "english_stop"
          ]
        }
      },
      "tokenizer": {
        "punctuation": { 
          "type": "pattern",
          "pattern": "[ .,!?]"
        }
      },
      "char_filter": {
        "emoticons": { 
          "type": "mapping",
          "mappings": [
            ":) => _happy_",
            ":( => _sad_"
          ]
        }
      },
      "filter": {
        "english_stop": { 
          "type": "stop",
          "stopwords": "_english_"
        }
      }
    }
  }
}
POST my-index-000001/_analyze
{
  "analyzer": "my_custom_analyzer",
  "text": "I'm a :) person, and you?"
}

	Assigns the index a default custom analyzer, `my_custom_analyzer`. This analyzer uses a custom tokenizer, character filter, and token filter that are defined later in the request. This analyzer also omits the `type` parameter.
	Defines the custom `punctuation` tokenizer.
	Defines the custom `emoticons` character filter.
	Defines the custom `english_stop` token filter.

The above example produces the following terms:

[ i'm, _happy_, person, you ]