CJK bigram token filter
CJK bigram token filter
Forms bigrams out of CJK (Chinese, Japanese, and Korean) tokens.
This filter is included in Elasticsearch’s built-in CJK language analyzer. It uses Lucene’s CJKBigramFilter.
Example
The following analyze API request demonstrates how the CJK bigram token filter works.
resp = client.indices.analyze(
tokenizer="standard",
filter=[
"cjk_bigram"
],
text="東京都は、日本の首都であり",
)
print(resp)
response = client.indices.analyze(
body: {
tokenizer: 'standard',
filter: [
'cjk_bigram'
],
text: '東京都は、日本の首都であり'
}
)
puts response
const response = await client.indices.analyze({
tokenizer: "standard",
filter: ["cjk_bigram"],
text: "東京都は、日本の首都であり",
});
console.log(response);
GET /_analyze
{
"tokenizer" : "standard",
"filter" : ["cjk_bigram"],
"text" : "東京都は、日本の首都であり"
}
The filter produces the following tokens:
[ 東京, 京都, 都は, 日本, 本の, の首, 首都, 都で, であ, あり ]
Add to an analyzer
The following create index API request uses the CJK bigram token filter to configure a new custom analyzer.
resp = client.indices.create(
index="cjk_bigram_example",
settings={
"analysis": {
"analyzer": {
"standard_cjk_bigram": {
"tokenizer": "standard",
"filter": [
"cjk_bigram"
]
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'cjk_bigram_example',
body: {
settings: {
analysis: {
analyzer: {
standard_cjk_bigram: {
tokenizer: 'standard',
filter: [
'cjk_bigram'
]
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "cjk_bigram_example",
settings: {
analysis: {
analyzer: {
standard_cjk_bigram: {
tokenizer: "standard",
filter: ["cjk_bigram"],
},
},
},
},
});
console.log(response);
PUT /cjk_bigram_example
{
"settings": {
"analysis": {
"analyzer": {
"standard_cjk_bigram": {
"tokenizer": "standard",
"filter": [ "cjk_bigram" ]
}
}
}
}
}
Configurable parameters
ignored_scripts
(Optional, array of character scripts) Array of character scripts for which to disable bigrams. Possible values:
han
hangul
hiragana
katakana
All non-CJK input is passed through unmodified.
output_unigrams
(Optional, Boolean) If `true`, emit tokens in both bigram and unigram form. If `false`, a CJK character is output in unigram form when it has no adjacent characters. Defaults to `false`.
Customize
To customize the CJK bigram token filter, duplicate it to create the basis for a new custom token filter. You can modify the filter using its configurable parameters.
resp = client.indices.create(
index="cjk_bigram_example",
settings={
"analysis": {
"analyzer": {
"han_bigrams": {
"tokenizer": "standard",
"filter": [
"han_bigrams_filter"
]
}
},
"filter": {
"han_bigrams_filter": {
"type": "cjk_bigram",
"ignored_scripts": [
"hangul",
"hiragana",
"katakana"
],
"output_unigrams": True
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'cjk_bigram_example',
body: {
settings: {
analysis: {
analyzer: {
han_bigrams: {
tokenizer: 'standard',
filter: [
'han_bigrams_filter'
]
}
},
filter: {
han_bigrams_filter: {
type: 'cjk_bigram',
ignored_scripts: [
'hangul',
'hiragana',
'katakana'
],
output_unigrams: true
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "cjk_bigram_example",
settings: {
analysis: {
analyzer: {
han_bigrams: {
tokenizer: "standard",
filter: ["han_bigrams_filter"],
},
},
filter: {
han_bigrams_filter: {
type: "cjk_bigram",
ignored_scripts: ["hangul", "hiragana", "katakana"],
output_unigrams: true,
},
},
},
},
});
console.log(response);
PUT /cjk_bigram_example
{
"settings": {
"analysis": {
"analyzer": {
"han_bigrams": {
"tokenizer": "standard",
"filter": [ "han_bigrams_filter" ]
}
},
"filter": {
"han_bigrams_filter": {
"type": "cjk_bigram",
"ignored_scripts": [
"hangul",
"hiragana",
"katakana"
],
"output_unigrams": true
}
}
}
}
}