CJK bigram token filter

CJK bigram token filter

Forms bigrams out of CJK (Chinese, Japanese, and Korean) tokens.

This filter is included in Elasticsearch’s built-in CJK language analyzer. It uses Lucene’s CJKBigramFilter.

Example

The following analyze API request demonstrates how the CJK bigram token filter works.

  1. resp = client.indices.analyze(
  2. tokenizer="standard",
  3. filter=[
  4. "cjk_bigram"
  5. ],
  6. text="東京都は、日本の首都であり",
  7. )
  8. print(resp)
  1. response = client.indices.analyze(
  2. body: {
  3. tokenizer: 'standard',
  4. filter: [
  5. 'cjk_bigram'
  6. ],
  7. text: '東京都は、日本の首都であり'
  8. }
  9. )
  10. puts response
  1. const response = await client.indices.analyze({
  2. tokenizer: "standard",
  3. filter: ["cjk_bigram"],
  4. text: "東京都は、日本の首都であり",
  5. });
  6. console.log(response);
  1. GET /_analyze
  2. {
  3. "tokenizer" : "standard",
  4. "filter" : ["cjk_bigram"],
  5. "text" : "東京都は、日本の首都であり"
  6. }

The filter produces the following tokens:

  1. [ 東京, 京都, 都は, 日本, 本の, の首, 首都, 都で, であ, あり ]

Add to an analyzer

The following create index API request uses the CJK bigram token filter to configure a new custom analyzer.

  1. resp = client.indices.create(
  2. index="cjk_bigram_example",
  3. settings={
  4. "analysis": {
  5. "analyzer": {
  6. "standard_cjk_bigram": {
  7. "tokenizer": "standard",
  8. "filter": [
  9. "cjk_bigram"
  10. ]
  11. }
  12. }
  13. }
  14. },
  15. )
  16. print(resp)
  1. response = client.indices.create(
  2. index: 'cjk_bigram_example',
  3. body: {
  4. settings: {
  5. analysis: {
  6. analyzer: {
  7. standard_cjk_bigram: {
  8. tokenizer: 'standard',
  9. filter: [
  10. 'cjk_bigram'
  11. ]
  12. }
  13. }
  14. }
  15. }
  16. }
  17. )
  18. puts response
  1. const response = await client.indices.create({
  2. index: "cjk_bigram_example",
  3. settings: {
  4. analysis: {
  5. analyzer: {
  6. standard_cjk_bigram: {
  7. tokenizer: "standard",
  8. filter: ["cjk_bigram"],
  9. },
  10. },
  11. },
  12. },
  13. });
  14. console.log(response);
  1. PUT /cjk_bigram_example
  2. {
  3. "settings": {
  4. "analysis": {
  5. "analyzer": {
  6. "standard_cjk_bigram": {
  7. "tokenizer": "standard",
  8. "filter": [ "cjk_bigram" ]
  9. }
  10. }
  11. }
  12. }
  13. }

Configurable parameters

ignored_scripts

(Optional, array of character scripts) Array of character scripts for which to disable bigrams. Possible values:

  • han
  • hangul
  • hiragana
  • katakana

All non-CJK input is passed through unmodified.

output_unigrams

(Optional, Boolean) If true, emit tokens in both bigram and unigram form. If false, a CJK character is output in unigram form only when it has no adjacent characters with which to form a bigram. Defaults to false.

Customize

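To customize the CJK bigram token filter, duplicate it to create the basis for a new custom token filter. You can modify the filter using its configurable parameters.

For example, the following request creates a custom CJK bigram filter, han_bigrams_filter, that disables bigrams for the hangul, hiragana, and katakana scripts and sets output_unigrams to true, and then uses that filter in a custom analyzer named han_bigrams.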

  1. resp = client.indices.create(
  2. index="cjk_bigram_example",
  3. settings={
  4. "analysis": {
  5. "analyzer": {
  6. "han_bigrams": {
  7. "tokenizer": "standard",
  8. "filter": [
  9. "han_bigrams_filter"
  10. ]
  11. }
  12. },
  13. "filter": {
  14. "han_bigrams_filter": {
  15. "type": "cjk_bigram",
  16. "ignored_scripts": [
  17. "hangul",
  18. "hiragana",
  19. "katakana"
  20. ],
  21. "output_unigrams": True
  22. }
  23. }
  24. }
  25. },
  26. )
  27. print(resp)
  1. response = client.indices.create(
  2. index: 'cjk_bigram_example',
  3. body: {
  4. settings: {
  5. analysis: {
  6. analyzer: {
  7. han_bigrams: {
  8. tokenizer: 'standard',
  9. filter: [
  10. 'han_bigrams_filter'
  11. ]
  12. }
  13. },
  14. filter: {
  15. han_bigrams_filter: {
  16. type: 'cjk_bigram',
  17. ignored_scripts: [
  18. 'hangul',
  19. 'hiragana',
  20. 'katakana'
  21. ],
  22. output_unigrams: true
  23. }
  24. }
  25. }
  26. }
  27. }
  28. )
  29. puts response
  1. const response = await client.indices.create({
  2. index: "cjk_bigram_example",
  3. settings: {
  4. analysis: {
  5. analyzer: {
  6. han_bigrams: {
  7. tokenizer: "standard",
  8. filter: ["han_bigrams_filter"],
  9. },
  10. },
  11. filter: {
  12. han_bigrams_filter: {
  13. type: "cjk_bigram",
  14. ignored_scripts: ["hangul", "hiragana", "katakana"],
  15. output_unigrams: true,
  16. },
  17. },
  18. },
  19. },
  20. });
  21. console.log(response);
  1. PUT /cjk_bigram_example
  2. {
  3. "settings": {
  4. "analysis": {
  5. "analyzer": {
  6. "han_bigrams": {
  7. "tokenizer": "standard",
  8. "filter": [ "han_bigrams_filter" ]
  9. }
  10. },
  11. "filter": {
  12. "han_bigrams_filter": {
  13. "type": "cjk_bigram",
  14. "ignored_scripts": [
  15. "hangul",
  16. "hiragana",
  17. "katakana"
  18. ],
  19. "output_unigrams": true
  20. }
  21. }
  22. }
  23. }
  24. }