Keyword marker token filter

Keyword marker token filter

Marks specified tokens as keywords, which are not stemmed.

The keyword_marker filter assigns specified tokens a keyword attribute of true. Stemmer token filters, such as stemmer or porter_stem, skip tokens with a keyword attribute of true.

To work properly, the keyword_marker filter must be listed before any stemmer token filters in the analyzer configuration.

The keyword_marker filter uses Lucene’s KeywordMarkerFilter.

Example

To see how the keyword_marker filter works, you first need to produce a token stream containing stemmed tokens.

The following analyze API request uses the stemmer filter to create stemmed tokens for the text "fox running and jumping".

  1. resp = client.indices.analyze(
  2. tokenizer="whitespace",
  3. filter=[
  4. "stemmer"
  5. ],
  6. text="fox running and jumping",
  7. )
  8. print(resp)
  1. response = client.indices.analyze(
  2. body: {
  3. tokenizer: 'whitespace',
  4. filter: [
  5. 'stemmer'
  6. ],
  7. text: 'fox running and jumping'
  8. }
  9. )
  10. puts response
  1. const response = await client.indices.analyze({
  2. tokenizer: "whitespace",
  3. filter: ["stemmer"],
  4. text: "fox running and jumping",
  5. });
  6. console.log(response);
  1. GET /_analyze
  2. {
  3. "tokenizer": "whitespace",
  4. "filter": [ "stemmer" ],
  5. "text": "fox running and jumping"
  6. }

The request produces the following tokens. Note that running was stemmed to run and jumping was stemmed to jump.

  1. [ fox, run, and, jump ]

To prevent jumping from being stemmed, add the keyword_marker filter before the stemmer filter in the previous analyze API request. Specify jumping in the keywords parameter of the keyword_marker filter.

  1. resp = client.indices.analyze(
  2. tokenizer="whitespace",
  3. filter=[
  4. {
  5. "type": "keyword_marker",
  6. "keywords": [
  7. "jumping"
  8. ]
  9. },
  10. "stemmer"
  11. ],
  12. text="fox running and jumping",
  13. )
  14. print(resp)
  1. response = client.indices.analyze(
  2. body: {
  3. tokenizer: 'whitespace',
  4. filter: [
  5. {
  6. type: 'keyword_marker',
  7. keywords: [
  8. 'jumping'
  9. ]
  10. },
  11. 'stemmer'
  12. ],
  13. text: 'fox running and jumping'
  14. }
  15. )
  16. puts response
  1. const response = await client.indices.analyze({
  2. tokenizer: "whitespace",
  3. filter: [
  4. {
  5. type: "keyword_marker",
  6. keywords: ["jumping"],
  7. },
  8. "stemmer",
  9. ],
  10. text: "fox running and jumping",
  11. });
  12. console.log(response);
  1. GET /_analyze
  2. {
  3. "tokenizer": "whitespace",
  4. "filter": [
  5. {
  6. "type": "keyword_marker",
  7. "keywords": [ "jumping" ]
  8. },
  9. "stemmer"
  10. ],
  11. "text": "fox running and jumping"
  12. }

The request produces the following tokens. running is still stemmed to run, but jumping is not stemmed.

  1. [ fox, run, and, jumping ]

To see the keyword attribute for these tokens, add the following arguments to the analyze API request:

  • explain: true
  • attributes: keyword
  1. resp = client.indices.analyze(
  2. tokenizer="whitespace",
  3. filter=[
  4. {
  5. "type": "keyword_marker",
  6. "keywords": [
  7. "jumping"
  8. ]
  9. },
  10. "stemmer"
  11. ],
  12. text="fox running and jumping",
  13. explain=True,
  14. attributes="keyword",
  15. )
  16. print(resp)
  1. response = client.indices.analyze(
  2. body: {
  3. tokenizer: 'whitespace',
  4. filter: [
  5. {
  6. type: 'keyword_marker',
  7. keywords: [
  8. 'jumping'
  9. ]
  10. },
  11. 'stemmer'
  12. ],
  13. text: 'fox running and jumping',
  14. explain: true,
  15. attributes: 'keyword'
  16. }
  17. )
  18. puts response
  1. const response = await client.indices.analyze({
  2. tokenizer: "whitespace",
  3. filter: [
  4. {
  5. type: "keyword_marker",
  6. keywords: ["jumping"],
  7. },
  8. "stemmer",
  9. ],
  10. text: "fox running and jumping",
  11. explain: true,
  12. attributes: "keyword",
  13. });
  14. console.log(response);
  1. GET /_analyze
  2. {
  3. "tokenizer": "whitespace",
  4. "filter": [
  5. {
  6. "type": "keyword_marker",
  7. "keywords": [ "jumping" ]
  8. },
  9. "stemmer"
  10. ],
  11. "text": "fox running and jumping",
  12. "explain": true,
  13. "attributes": "keyword"
  14. }

The API returns the following response. Note the jumping token has a keyword attribute of true.

  1. {
  2. "detail": {
  3. "custom_analyzer": true,
  4. "charfilters": [],
  5. "tokenizer": {
  6. "name": "whitespace",
  7. "tokens": [
  8. {
  9. "token": "fox",
  10. "start_offset": 0,
  11. "end_offset": 3,
  12. "type": "word",
  13. "position": 0
  14. },
  15. {
  16. "token": "running",
  17. "start_offset": 4,
  18. "end_offset": 11,
  19. "type": "word",
  20. "position": 1
  21. },
  22. {
  23. "token": "and",
  24. "start_offset": 12,
  25. "end_offset": 15,
  26. "type": "word",
  27. "position": 2
  28. },
  29. {
  30. "token": "jumping",
  31. "start_offset": 16,
  32. "end_offset": 23,
  33. "type": "word",
  34. "position": 3
  35. }
  36. ]
  37. },
  38. "tokenfilters": [
  39. {
  40. "name": "__anonymous__keyword_marker",
  41. "tokens": [
  42. {
  43. "token": "fox",
  44. "start_offset": 0,
  45. "end_offset": 3,
  46. "type": "word",
  47. "position": 0,
  48. "keyword": false
  49. },
  50. {
  51. "token": "running",
  52. "start_offset": 4,
  53. "end_offset": 11,
  54. "type": "word",
  55. "position": 1,
  56. "keyword": false
  57. },
  58. {
  59. "token": "and",
  60. "start_offset": 12,
  61. "end_offset": 15,
  62. "type": "word",
  63. "position": 2,
  64. "keyword": false
  65. },
  66. {
  67. "token": "jumping",
  68. "start_offset": 16,
  69. "end_offset": 23,
  70. "type": "word",
  71. "position": 3,
  72. "keyword": true
  73. }
  74. ]
  75. },
  76. {
  77. "name": "stemmer",
  78. "tokens": [
  79. {
  80. "token": "fox",
  81. "start_offset": 0,
  82. "end_offset": 3,
  83. "type": "word",
  84. "position": 0,
  85. "keyword": false
  86. },
  87. {
  88. "token": "run",
  89. "start_offset": 4,
  90. "end_offset": 11,
  91. "type": "word",
  92. "position": 1,
  93. "keyword": false
  94. },
  95. {
  96. "token": "and",
  97. "start_offset": 12,
  98. "end_offset": 15,
  99. "type": "word",
  100. "position": 2,
  101. "keyword": false
  102. },
  103. {
  104. "token": "jumping",
  105. "start_offset": 16,
  106. "end_offset": 23,
  107. "type": "word",
  108. "position": 3,
  109. "keyword": true
  110. }
  111. ]
  112. }
  113. ]
  114. }
  115. }

Configurable parameters

ignore_case

(Optional, Boolean) If true, matching for the keywords and keywords_path parameters ignores letter case. Defaults to false.

keywords

(Required*, array of strings) Array of keywords. Tokens that match these keywords are not stemmed.

This parameter, keywords_path, or keywords_pattern must be specified. You cannot specify this parameter and keywords_pattern.

keywords_path

(Required*, string) Path to a file that contains a list of keywords. Tokens that match these keywords are not stemmed.

This path must be absolute or relative to the config location, and the file must be UTF-8 encoded. Each word in the file must be separated by a line break.

This parameter, keywords, or keywords_pattern must be specified. You cannot specify this parameter and keywords_pattern.

keywords_pattern

(Required*, string) Java regular expression used to match tokens. Tokens that match this expression are marked as keywords and not stemmed.

This parameter, keywords, or keywords_path must be specified. You cannot specify this parameter and keywords or keywords_path.

Poorly written regular expressions can cause Elasticsearch to run slowly or result in stack overflow errors, causing the running node to suddenly exit.

Customize and add to an analyzer

To customize the keyword_marker filter, duplicate it to create the basis for a new custom token filter. You can modify the filter using its configurable parameters.

For example, the following create index API request uses a custom keyword_marker filter and the porter_stem filter to configure a new custom analyzer.

The custom keyword_marker filter marks tokens specified in the analysis/example_word_list.txt file as keywords. The porter_stem filter does not stem these tokens.

  1. resp = client.indices.create(
  2. index="my-index-000001",
  3. settings={
  4. "analysis": {
  5. "analyzer": {
  6. "my_custom_analyzer": {
  7. "type": "custom",
  8. "tokenizer": "standard",
  9. "filter": [
  10. "my_custom_keyword_marker_filter",
  11. "porter_stem"
  12. ]
  13. }
  14. },
  15. "filter": {
  16. "my_custom_keyword_marker_filter": {
  17. "type": "keyword_marker",
  18. "keywords_path": "analysis/example_word_list.txt"
  19. }
  20. }
  21. }
  22. },
  23. )
  24. print(resp)
  1. response = client.indices.create(
  2. index: 'my-index-000001',
  3. body: {
  4. settings: {
  5. analysis: {
  6. analyzer: {
  7. my_custom_analyzer: {
  8. type: 'custom',
  9. tokenizer: 'standard',
  10. filter: [
  11. 'my_custom_keyword_marker_filter',
  12. 'porter_stem'
  13. ]
  14. }
  15. },
  16. filter: {
  17. my_custom_keyword_marker_filter: {
  18. type: 'keyword_marker',
  19. keywords_path: 'analysis/example_word_list.txt'
  20. }
  21. }
  22. }
  23. }
  24. }
  25. )
  26. puts response
  1. const response = await client.indices.create({
  2. index: "my-index-000001",
  3. settings: {
  4. analysis: {
  5. analyzer: {
  6. my_custom_analyzer: {
  7. type: "custom",
  8. tokenizer: "standard",
  9. filter: ["my_custom_keyword_marker_filter", "porter_stem"],
  10. },
  11. },
  12. filter: {
  13. my_custom_keyword_marker_filter: {
  14. type: "keyword_marker",
  15. keywords_path: "analysis/example_word_list.txt",
  16. },
  17. },
  18. },
  19. },
  20. });
  21. console.log(response);
  1. PUT /my-index-000001
  2. {
  3. "settings": {
  4. "analysis": {
  5. "analyzer": {
  6. "my_custom_analyzer": {
  7. "type": "custom",
  8. "tokenizer": "standard",
  9. "filter": [
  10. "my_custom_keyword_marker_filter",
  11. "porter_stem"
  12. ]
  13. }
  14. },
  15. "filter": {
  16. "my_custom_keyword_marker_filter": {
  17. "type": "keyword_marker",
  18. "keywords_path": "analysis/example_word_list.txt"
  19. }
  20. }
  21. }
  22. }
  23. }