Create a custom analyzer

Create a custom analyzer

When the built-in analyzers do not fulfill your needs, you can create a custom analyzer that uses the appropriate combination of zero or more character filters, exactly one tokenizer, and zero or more token filters.

Configuration

The custom analyzer accepts the following parameters:

type

Analyzer type. Accepts built-in analyzer types. For custom analyzers, use custom or omit this parameter.

tokenizer

A built-in or customised tokenizer. (Required)

char_filter

An optional array of built-in or customised character filters.

filter

An optional array of built-in or customised token filters.

position_increment_gap

When indexing an array of text values, Elasticsearch inserts a fake “gap” between the last term of one value and the first term of the next value to ensure that a phrase query doesn’t match two terms from different array elements. Defaults to 100. See position_increment_gap for more.

Example configuration

Here is an example that combines the following:

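Character Filter: html_strip (HTML Strip Character Filter)

Tokenizer: standard (Standard Tokenizer)

Token Filters: lowercase (Lowercase Token Filter), asciifolding (ASCII-Folding Token Filter)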

  1. resp = client.indices.create(
  2. index="my-index-000001",
  3. settings={
  4. "analysis": {
  5. "analyzer": {
  6. "my_custom_analyzer": {
  7. "type": "custom",
  8. "tokenizer": "standard",
  9. "char_filter": [
  10. "html_strip"
  11. ],
  12. "filter": [
  13. "lowercase",
  14. "asciifolding"
  15. ]
  16. }
  17. }
  18. }
  19. },
  20. )
  21. print(resp)
  22. resp1 = client.indices.analyze(
  23. index="my-index-000001",
  24. analyzer="my_custom_analyzer",
  25. text="Is this <b>déjà vu</b>?",
  26. )
  27. print(resp1)
  1. response = client.indices.create(
  2. index: 'my-index-000001',
  3. body: {
  4. settings: {
  5. analysis: {
  6. analyzer: {
  7. my_custom_analyzer: {
  8. type: 'custom',
  9. tokenizer: 'standard',
  10. char_filter: [
  11. 'html_strip'
  12. ],
  13. filter: [
  14. 'lowercase',
  15. 'asciifolding'
  16. ]
  17. }
  18. }
  19. }
  20. }
  21. }
  22. )
  23. puts response
  24. response = client.indices.analyze(
  25. index: 'my-index-000001',
  26. body: {
  27. analyzer: 'my_custom_analyzer',
  28. text: 'Is this <b>déjà vu</b>?'
  29. }
  30. )
  31. puts response
  1. const response = await client.indices.create({
  2. index: "my-index-000001",
  3. settings: {
  4. analysis: {
  5. analyzer: {
  6. my_custom_analyzer: {
  7. type: "custom",
  8. tokenizer: "standard",
  9. char_filter: ["html_strip"],
  10. filter: ["lowercase", "asciifolding"],
  11. },
  12. },
  13. },
  14. },
  15. });
  16. console.log(response);
  17. const response1 = await client.indices.analyze({
  18. index: "my-index-000001",
  19. analyzer: "my_custom_analyzer",
  20. text: "Is this <b>déjà vu</b>?",
  21. });
  22. console.log(response1);
  1. PUT my-index-000001
  2. {
  3. "settings": {
  4. "analysis": {
  5. "analyzer": {
  6. "my_custom_analyzer": {
  7. "type": "custom",
  8. "tokenizer": "standard",
  9. "char_filter": [
  10. "html_strip"
  11. ],
  12. "filter": [
  13. "lowercase",
  14. "asciifolding"
  15. ]
  16. }
  17. }
  18. }
  19. }
  20. }
  21. POST my-index-000001/_analyze
  22. {
  23. "analyzer": "my_custom_analyzer",
  24. "text": "Is this <b>déjà vu</b>?"
  25. }

For custom analyzers, use a type of custom or omit the type parameter.

The above example produces the following terms:

  1. [ is, this, deja, vu ]

The previous example used a tokenizer, token filters, and character filters with their default configurations, but it is also possible to create configured versions of each and to use them in a custom analyzer.

Here is a more complicated example that combines the following:

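Character Filter: a mapping character filter (emoticons), configured to replace :) with _happy_ and :( with _sad_

Tokenizer: a pattern tokenizer (punctuation), configured to split on the characters matched by [ .,!?] (spaces and punctuation)

Token Filters: the lowercase token filter, plus a stop token filter (english_stop) configured to use the predefined _english_ stop word list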

Here is an example:

  1. resp = client.indices.create(
  2. index="my-index-000001",
  3. settings={
  4. "analysis": {
  5. "analyzer": {
  6. "my_custom_analyzer": {
  7. "char_filter": [
  8. "emoticons"
  9. ],
  10. "tokenizer": "punctuation",
  11. "filter": [
  12. "lowercase",
  13. "english_stop"
  14. ]
  15. }
  16. },
  17. "tokenizer": {
  18. "punctuation": {
  19. "type": "pattern",
  20. "pattern": "[ .,!?]"
  21. }
  22. },
  23. "char_filter": {
  24. "emoticons": {
  25. "type": "mapping",
  26. "mappings": [
  27. ":) => _happy_",
  28. ":( => _sad_"
  29. ]
  30. }
  31. },
  32. "filter": {
  33. "english_stop": {
  34. "type": "stop",
  35. "stopwords": "_english_"
  36. }
  37. }
  38. }
  39. },
  40. )
  41. print(resp)
  42. resp1 = client.indices.analyze(
  43. index="my-index-000001",
  44. analyzer="my_custom_analyzer",
  45. text="I'm a :) person, and you?",
  46. )
  47. print(resp1)
  1. response = client.indices.create(
  2. index: 'my-index-000001',
  3. body: {
  4. settings: {
  5. analysis: {
  6. analyzer: {
  7. my_custom_analyzer: {
  8. char_filter: [
  9. 'emoticons'
  10. ],
  11. tokenizer: 'punctuation',
  12. filter: [
  13. 'lowercase',
  14. 'english_stop'
  15. ]
  16. }
  17. },
  18. tokenizer: {
  19. punctuation: {
  20. type: 'pattern',
  21. pattern: '[ .,!?]'
  22. }
  23. },
  24. char_filter: {
  25. emoticons: {
  26. type: 'mapping',
  27. mappings: [
  28. ':) => _happy_',
  29. ':( => _sad_'
  30. ]
  31. }
  32. },
  33. filter: {
  34. english_stop: {
  35. type: 'stop',
  36. stopwords: '_english_'
  37. }
  38. }
  39. }
  40. }
  41. }
  42. )
  43. puts response
  44. response = client.indices.analyze(
  45. index: 'my-index-000001',
  46. body: {
  47. analyzer: 'my_custom_analyzer',
  48. text: "I'm a :) person, and you?"
  49. }
  50. )
  51. puts response
  1. const response = await client.indices.create({
  2. index: "my-index-000001",
  3. settings: {
  4. analysis: {
  5. analyzer: {
  6. my_custom_analyzer: {
  7. char_filter: ["emoticons"],
  8. tokenizer: "punctuation",
  9. filter: ["lowercase", "english_stop"],
  10. },
  11. },
  12. tokenizer: {
  13. punctuation: {
  14. type: "pattern",
  15. pattern: "[ .,!?]",
  16. },
  17. },
  18. char_filter: {
  19. emoticons: {
  20. type: "mapping",
  21. mappings: [":) => _happy_", ":( => _sad_"],
  22. },
  23. },
  24. filter: {
  25. english_stop: {
  26. type: "stop",
  27. stopwords: "_english_",
  28. },
  29. },
  30. },
  31. },
  32. });
  33. console.log(response);
  34. const response1 = await client.indices.analyze({
  35. index: "my-index-000001",
  36. analyzer: "my_custom_analyzer",
  37. text: "I'm a :) person, and you?",
  38. });
  39. console.log(response1);
  1. PUT my-index-000001
  2. {
  3. "settings": {
  4. "analysis": {
  5. "analyzer": {
  6. "my_custom_analyzer": {
  7. "char_filter": [
  8. "emoticons"
  9. ],
  10. "tokenizer": "punctuation",
  11. "filter": [
  12. "lowercase",
  13. "english_stop"
  14. ]
  15. }
  16. },
  17. "tokenizer": {
  18. "punctuation": {
  19. "type": "pattern",
  20. "pattern": "[ .,!?]"
  21. }
  22. },
  23. "char_filter": {
  24. "emoticons": {
  25. "type": "mapping",
  26. "mappings": [
  27. ":) => _happy_",
  28. ":( => _sad_"
  29. ]
  30. }
  31. },
  32. "filter": {
  33. "english_stop": {
  34. "type": "stop",
  35. "stopwords": "_english_"
  36. }
  37. }
  38. }
  39. }
  40. }
  41. POST my-index-000001/_analyze
  42. {
  43. "analyzer": "my_custom_analyzer",
  44. "text": "I'm a :) person, and you?"
  45. }

Assigns the index a default custom analyzer, my_custom_analyzer. This analyzer uses a custom tokenizer, character filter, and token filter that are defined later in the request. This analyzer also omits the type parameter.

Defines the custom punctuation tokenizer.

Defines the custom emoticons character filter.

Defines the custom english_stop token filter.

The above example produces the following terms:

  1. [ i'm, _happy_, person, you ]