Pattern analyzer
The pattern analyzer uses a regular expression to split the text into terms. The regular expression should match the token separators, not the tokens themselves. The regular expression defaults to \W+ (that is, all non-word characters).
Beware of Pathological Regular Expressions
The pattern analyzer uses Java Regular Expressions.
A badly written regular expression could run very slowly or even throw a StackOverflowError and cause the node it is running on to exit suddenly.
Read more about pathological regular expressions and how to avoid them.
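This is not specific to Elasticsearch: any backtracking regex engine can be driven into exponential behavior. As a minimal sketch (in Python, whose re engine backtracks much like Java's), the nested quantifiers below force the engine to try exponentially many partitions of the input before concluding there is no match:

import re
import time

# Pathological pattern: (a+)+ nests two unbounded quantifiers, so on a
# non-matching input the engine backtracks through every way of
# partitioning the run of 'a's before failing.
evil = re.compile(r"(a+)+$")
text = "a" * 26 + "b"  # the trailing 'b' guarantees the match fails

start = time.time()
evil.search(text)
print(f"took {time.time() - start:.1f}s")  # runtime roughly doubles per extra 'a'

Used as an analyzer pattern, an expression like this would burn CPU on every document indexed with it.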
Example output
resp = client.indices.analyze(
analyzer="pattern",
text="The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
)
print(resp)
response = client.indices.analyze(
body: {
analyzer: 'pattern',
text: "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
)
puts response
const response = await client.indices.analyze({
analyzer: "pattern",
text: "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
});
console.log(response);
POST _analyze
{
"analyzer": "pattern",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
The above sentence would produce the following terms:
[ the, 2, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]
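As a rough local sanity check, the same tokenization can be approximated with Python's re module (Python's \W and Java's agree on plain ASCII text like this):

import re

# Split on runs of non-word characters and lowercase what remains;
# the trailing "." yields an empty string, which the filter drops.
text = "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
tokens = [t.lower() for t in re.split(r"\W+", text) if t]
print(tokens)
# ['the', '2', 'quick', 'brown', 'foxes', 'jumped', 'over', 'the',
#  'lazy', 'dog', 's', 'bone']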
Configuration
The pattern analyzer accepts the following parameters:

| Parameter | Description |
| pattern | A Java regular expression, defaults to \W+. |
| flags | Java regular expression flags. Flags should be pipe-separated, e.g. "CASE_INSENSITIVE|COMMENTS". |
| lowercase | Should terms be lowercased or not. Defaults to true. |
| stopwords | A pre-defined stop words list like _english_ or an array containing a list of stop words. Defaults to _none_. |
| stopwords_path | The path to a file containing stop words. |
See the Stop Token Filter for more information about stop word configuration.
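None of the examples on this page exercise flags or stopwords, so here is a hedged sketch of how they would be passed; the index and analyzer names are invented for illustration:

resp = client.indices.create(
    index="my-index-000002",  # hypothetical index name
    settings={
        "analysis": {
            "analyzer": {
                "my_pattern_analyzer": {  # hypothetical analyzer name
                    "type": "pattern",
                    "pattern": "\\W+",
                    "flags": "CASE_INSENSITIVE|COMMENTS",
                    "lowercase": True,
                    "stopwords": "_english_"
                }
            }
        }
    },
)
print(resp)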
Example configuration
In this example, we configure the pattern analyzer to split email addresses on non-word characters or on underscores (\W|_), and to lower-case the result:
resp = client.indices.create(
index="my-index-000001",
settings={
"analysis": {
"analyzer": {
"my_email_analyzer": {
"type": "pattern",
"pattern": "\\W|_",
"lowercase": True
}
}
}
},
)
print(resp)
resp1 = client.indices.analyze(
index="my-index-000001",
analyzer="my_email_analyzer",
text="John_Smith@foo-bar.com",
)
print(resp1)
response = client.indices.create(
index: 'my-index-000001',
body: {
settings: {
analysis: {
analyzer: {
my_email_analyzer: {
type: 'pattern',
pattern: '\\W|_',
lowercase: true
}
}
}
}
}
)
puts response
response = client.indices.analyze(
index: 'my-index-000001',
body: {
analyzer: 'my_email_analyzer',
text: 'John_Smith@foo-bar.com'
}
)
puts response
const response = await client.indices.create({
index: "my-index-000001",
settings: {
analysis: {
analyzer: {
my_email_analyzer: {
type: "pattern",
pattern: "\\W|_",
lowercase: true,
},
},
},
},
});
console.log(response);
const response1 = await client.indices.analyze({
index: "my-index-000001",
analyzer: "my_email_analyzer",
text: "John_Smith@foo-bar.com",
});
console.log(response1);
PUT my-index-000001
{
"settings": {
"analysis": {
"analyzer": {
"my_email_analyzer": {
"type": "pattern",
"pattern": "\\W|_",
"lowercase": true
}
}
}
}
}
POST my-index-000001/_analyze
{
"analyzer": "my_email_analyzer",
"text": "John_Smith@foo-bar.com"
}
The backslashes in the pattern need to be escaped when specifying the pattern as a JSON string.
The above example produces the following terms:
[ john, smith, foo, bar, com ]
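The same split is easy to verify locally with Python's re module (Java and Python agree on \W for this ASCII input):

import re

# \W|_ treats any non-word character or underscore as a separator, so
# the "_", "@", "-", and "." in the address all split the text.
tokens = [t.lower() for t in re.split(r"\W|_", "John_Smith@foo-bar.com") if t]
print(tokens)  # ['john', 'smith', 'foo', 'bar', 'com']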
CamelCase tokenizer
The following more complicated example splits CamelCase text into tokens:
resp = client.indices.create(
index="my-index-000001",
settings={
"analysis": {
"analyzer": {
"camel": {
"type": "pattern",
"pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
}
}
}
},
)
print(resp)
resp1 = client.indices.analyze(
index="my-index-000001",
analyzer="camel",
text="MooseX::FTPClass2_beta",
)
print(resp1)
response = client.indices.create(
index: 'my-index-000001',
body: {
settings: {
analysis: {
analyzer: {
camel: {
type: 'pattern',
pattern: '([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])'
}
}
}
}
}
)
puts response
response = client.indices.analyze(
index: 'my-index-000001',
body: {
analyzer: 'camel',
text: 'MooseX::FTPClass2_beta'
}
)
puts response
const response = await client.indices.create({
index: "my-index-000001",
settings: {
analysis: {
analyzer: {
camel: {
type: "pattern",
pattern:
"([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])",
},
},
},
},
});
console.log(response);
const response1 = await client.indices.analyze({
index: "my-index-000001",
analyzer: "camel",
text: "MooseX::FTPClass2_beta",
});
console.log(response1);
PUT my-index-000001
{
"settings": {
"analysis": {
"analyzer": {
"camel": {
"type": "pattern",
"pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
}
}
}
}
}
GET my-index-000001/_analyze
{
"analyzer": "camel",
"text": "MooseX::FTPClass2_beta"
}
The above example produces the following terms:
[ moose, x, ftp, class, 2, beta ]
The regex above is easier to understand as:
([^\p{L}\d]+) # swallow non letters and numbers,
| (?<=\D)(?=\d) # or non-number followed by number,
| (?<=\d)(?=\D) # or number followed by non-number,
| (?<=[ \p{L} && [^\p{Lu}]]) # or lower case
(?=\p{Lu}) # followed by upper case,
| (?<=\p{Lu}) # or upper case
(?=\p{Lu} # followed by upper case
[\p{L}&&[^\p{Lu}]] # then lower case
)
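Python's re module has no Java-style character-class intersection (&&), so the pattern cannot be ported verbatim; an ASCII-only approximation that replaces [\p{L}&&[^\p{Lu}]] with [a-z] and \p{Lu} with [A-Z] reproduces the example output:

import re

# ASCII-only approximation of the Java pattern above. The first branch
# consumes separator characters; the lookaround branches are zero-width
# split points between character classes.
camel = re.compile(
    r"[^a-zA-Z\d]+"               # swallow non letters and numbers,
    r"|(?<=\D)(?=\d)"             # or non-number followed by number,
    r"|(?<=\d)(?=\D)"             # or number followed by non-number,
    r"|(?<=[a-z])(?=[A-Z])"       # or lower case followed by upper case,
    r"|(?<=[A-Z])(?=[A-Z][a-z])"  # or upper case followed by upper, lower
)
tokens = [t.lower() for t in camel.split("MooseX::FTPClass2_beta") if t]
print(tokens)  # ['moose', 'x', 'ftp', 'class', '2', 'beta']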
Definition
The pattern analyzer consists of:
Tokenizer
- Pattern Tokenizer
Token Filters
- Lower Case Token Filter
- Stop Token Filter (disabled by default)
If you need to customize the pattern analyzer beyond the configuration parameters then you need to recreate it as a custom analyzer and modify it, usually by adding token filters. The following configuration recreates the built-in pattern analyzer; you can use it as a starting point for further customization:
resp = client.indices.create(
index="pattern_example",
settings={
"analysis": {
"tokenizer": {
"split_on_non_word": {
"type": "pattern",
"pattern": "\\W+"
}
},
"analyzer": {
"rebuilt_pattern": {
"tokenizer": "split_on_non_word",
"filter": [
"lowercase"
]
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'pattern_example',
body: {
settings: {
analysis: {
tokenizer: {
split_on_non_word: {
type: 'pattern',
pattern: '\\W+'
}
},
analyzer: {
rebuilt_pattern: {
tokenizer: 'split_on_non_word',
filter: [
'lowercase'
]
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "pattern_example",
settings: {
analysis: {
tokenizer: {
split_on_non_word: {
type: "pattern",
pattern: "\\W+",
},
},
analyzer: {
rebuilt_pattern: {
tokenizer: "split_on_non_word",
filter: ["lowercase"],
},
},
},
},
});
console.log(response);
PUT /pattern_example
{
"settings": {
"analysis": {
"tokenizer": {
"split_on_non_word": {
"type": "pattern",
"pattern": "\\W+"
}
},
"analyzer": {
"rebuilt_pattern": {
"tokenizer": "split_on_non_word",
"filter": [
"lowercase"
]
}
}
}
}
}
The default pattern is \W+, which splits on non-word characters.
You'd add other token filters after lowercase.
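For example, a hypothetical variant that appends the built-in stop token filter after lowercase (the index name here is invented) might look like:

resp = client.indices.create(
    index="pattern_example_with_stop",  # hypothetical index name
    settings={
        "analysis": {
            "tokenizer": {
                "split_on_non_word": {
                    "type": "pattern",
                    "pattern": "\\W+"
                }
            },
            "analyzer": {
                "rebuilt_pattern": {
                    "tokenizer": "split_on_non_word",
                    "filter": [
                        "lowercase",
                        "stop"  # added after "lowercase", per the note above
                    ]
                }
            }
        }
    },
)
print(resp)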