Pattern analyzer
The pattern analyzer uses a regular expression to split the text into terms. The regular expression should match the token separators, not the tokens themselves. The regular expression defaults to \W+ (that is, all non-word characters).
Beware of Pathological Regular Expressions
The pattern analyzer uses Java Regular Expressions.
A badly written regular expression could run very slowly or even throw a StackOverflowError and cause the node it is running on to exit suddenly.
Read more about pathological regular expressions and how to avoid them.
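This is not specific to Elasticsearch: any backtracking regex engine can be driven into exponential behavior. As a minimal sketch (in Python, whose re engine backtracks much like Java's), the nested quantifiers below force the engine to try exponentially many partitions of the input before concluding there is no match:

import re
import time

# Pathological pattern: (a+)+ nests two unbounded quantifiers, so on a
# non-matching input the engine backtracks through every way of
# partitioning the run of 'a's before failing.
evil = re.compile(r"(a+)+$")
text = "a" * 26 + "b"  # the trailing 'b' guarantees the match fails

start = time.time()
evil.search(text)
print(f"took {time.time() - start:.1f}s")  # runtime roughly doubles per extra 'a'

Used as an analyzer pattern, an expression like this would burn CPU on every document indexed with it.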
Example output
resp = client.indices.analyze(
analyzer="pattern",
text="The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
)
print(resp)
response = client.indices.analyze(
body: {
analyzer: 'pattern',
text: "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
)
puts response
const response = await client.indices.analyze({
analyzer: "pattern",
text: "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone.",
});
console.log(response);
POST _analyze
{
"analyzer": "pattern",
"text": "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
}
The above sentence would produce the following terms:
[ the, 2, quick, brown, foxes, jumped, over, the, lazy, dog, s, bone ]
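As a rough local sanity check, the same tokenization can be approximated with Python's re module (Python's \W and Java's agree on plain ASCII text like this):

import re

# Split on runs of non-word characters and lowercase what remains;
# the trailing "." yields an empty string, which the filter drops.
text = "The 2 QUICK Brown-Foxes jumped over the lazy dog's bone."
tokens = [t.lower() for t in re.split(r"\W+", text) if t]
print(tokens)
# ['the', '2', 'quick', 'brown', 'foxes', 'jumped', 'over', 'the',
#  'lazy', 'dog', 's', 'bone']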
Configuration
The pattern analyzer accepts the following parameters:

| Parameter | Description |
| pattern | A Java regular expression, defaults to \W+. |
| flags | Java regular expression flags. Flags should be pipe-separated, e.g. "CASE_INSENSITIVE|COMMENTS". |
| lowercase | Should terms be lowercased or not. Defaults to true. |
| stopwords | A pre-defined stop words list like _english_ or an array containing a list of stop words. Defaults to _none_. |
| stopwords_path | The path to a file containing stop words. |
See the Stop Token Filter for more information about stop word configuration.
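None of the examples on this page exercise flags or stopwords, so here is a hedged sketch of how they would be passed; the index and analyzer names are invented for illustration:

resp = client.indices.create(
    index="my-index-000002",  # hypothetical index name
    settings={
        "analysis": {
            "analyzer": {
                "my_pattern_analyzer": {  # hypothetical analyzer name
                    "type": "pattern",
                    "pattern": "\\W+",
                    "flags": "CASE_INSENSITIVE|COMMENTS",
                    "lowercase": True,
                    "stopwords": "_english_"
                }
            }
        }
    },
)
print(resp)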
Example configuration
In this example, we configure the pattern analyzer to split email addresses on non-word characters or on underscores (\W|_), and to lower-case the result:
resp = client.indices.create(
index="my-index-000001",
settings={
"analysis": {
"analyzer": {
"my_email_analyzer": {
"type": "pattern",
"pattern": "\\W|_",
"lowercase": True
}
}
}
},
)
print(resp)
resp1 = client.indices.analyze(
index="my-index-000001",
analyzer="my_email_analyzer",
text="John_Smith@foo-bar.com",
)
print(resp1)
response = client.indices.create(
index: 'my-index-000001',
body: {
settings: {
analysis: {
analyzer: {
my_email_analyzer: {
type: 'pattern',
pattern: '\\W|_',
lowercase: true
}
}
}
}
}
)
puts response
response = client.indices.analyze(
index: 'my-index-000001',
body: {
analyzer: 'my_email_analyzer',
text: 'John_Smith@foo-bar.com'
}
)
puts response
const response = await client.indices.create({
index: "my-index-000001",
settings: {
analysis: {
analyzer: {
my_email_analyzer: {
type: "pattern",
pattern: "\\W|_",
lowercase: true,
},
},
},
},
});
console.log(response);
const response1 = await client.indices.analyze({
index: "my-index-000001",
analyzer: "my_email_analyzer",
text: "John_Smith@foo-bar.com",
});
console.log(response1);
PUT my-index-000001
{
"settings": {
"analysis": {
"analyzer": {
"my_email_analyzer": {
"type": "pattern",
"pattern": "\\W|_",
"lowercase": true
}
}
}
}
}
POST my-index-000001/_analyze
{
"analyzer": "my_email_analyzer",
"text": "John_Smith@foo-bar.com"
}
The backslashes in the pattern need to be escaped when specifying the pattern as a JSON string.
The above example produces the following terms:
[ john, smith, foo, bar, com ]
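The same split is easy to verify locally with Python's re module (Java and Python agree on \W for this ASCII input):

import re

# \W|_ treats any non-word character or underscore as a separator, so
# the "_", "@", "-", and "." in the address all split the text.
tokens = [t.lower() for t in re.split(r"\W|_", "John_Smith@foo-bar.com") if t]
print(tokens)  # ['john', 'smith', 'foo', 'bar', 'com']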
CamelCase tokenizer
The following more complicated example splits CamelCase text into tokens:
resp = client.indices.create(
index="my-index-000001",
settings={
"analysis": {
"analyzer": {
"camel": {
"type": "pattern",
"pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
}
}
}
},
)
print(resp)
resp1 = client.indices.analyze(
index="my-index-000001",
analyzer="camel",
text="MooseX::FTPClass2_beta",
)
print(resp1)
response = client.indices.create(
index: 'my-index-000001',
body: {
settings: {
analysis: {
analyzer: {
camel: {
type: 'pattern',
pattern: '([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])'
}
}
}
}
}
)
puts response
response = client.indices.analyze(
index: 'my-index-000001',
body: {
analyzer: 'camel',
text: 'MooseX::FTPClass2_beta'
}
)
puts response
const response = await client.indices.create({
index: "my-index-000001",
settings: {
analysis: {
analyzer: {
camel: {
type: "pattern",
pattern:
"([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])",
},
},
},
},
});
console.log(response);
const response1 = await client.indices.analyze({
index: "my-index-000001",
analyzer: "camel",
text: "MooseX::FTPClass2_beta",
});
console.log(response1);
PUT my-index-000001
{
"settings": {
"analysis": {
"analyzer": {
"camel": {
"type": "pattern",
"pattern": "([^\\p{L}\\d]+)|(?<=\\D)(?=\\d)|(?<=\\d)(?=\\D)|(?<=[\\p{L}&&[^\\p{Lu}]])(?=\\p{Lu})|(?<=\\p{Lu})(?=\\p{Lu}[\\p{L}&&[^\\p{Lu}]])"
}
}
}
}
}
GET my-index-000001/_analyze
{
"analyzer": "camel",
"text": "MooseX::FTPClass2_beta"
}
The above example produces the following terms:
[ moose, x, ftp, class, 2, beta ]
The regex above is easier to understand as:
([^\p{L}\d]+) # swallow non letters and numbers,
| (?<=\D)(?=\d) # or non-number followed by number,
| (?<=\d)(?=\D) # or number followed by non-number,
| (?<=[ \p{L} && [^\p{Lu}]]) # or lower case
(?=\p{Lu}) # followed by upper case,
| (?<=\p{Lu}) # or upper case
(?=\p{Lu} # followed by upper case
[\p{L}&&[^\p{Lu}]] # then lower case
)
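Python's re module has no Java-style character-class intersection (&&), so the pattern cannot be ported verbatim; an ASCII-only approximation that replaces [\p{L}&&[^\p{Lu}]] with [a-z] and \p{Lu} with [A-Z] reproduces the example output:

import re

# ASCII-only approximation of the Java pattern above. The first branch
# consumes separator characters; the lookaround branches are zero-width
# split points between character classes.
camel = re.compile(
    r"[^a-zA-Z\d]+"               # swallow non letters and numbers,
    r"|(?<=\D)(?=\d)"             # or non-number followed by number,
    r"|(?<=\d)(?=\D)"             # or number followed by non-number,
    r"|(?<=[a-z])(?=[A-Z])"       # or lower case followed by upper case,
    r"|(?<=[A-Z])(?=[A-Z][a-z])"  # or upper case followed by upper, lower
)
tokens = [t.lower() for t in camel.split("MooseX::FTPClass2_beta") if t]
print(tokens)  # ['moose', 'x', 'ftp', 'class', '2', 'beta']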
Definition
The pattern analyzer consists of:
Tokenizer
- Pattern Tokenizer
Token Filters
- Lower Case Token Filter
- Stop Token Filter (disabled by default)
If you need to customize the pattern analyzer beyond the configuration parameters then you need to recreate it as a custom analyzer and modify it, usually by adding token filters. The following configuration recreates the built-in pattern analyzer; you can use it as a starting point for further customization:
resp = client.indices.create(
index="pattern_example",
settings={
"analysis": {
"tokenizer": {
"split_on_non_word": {
"type": "pattern",
"pattern": "\\W+"
}
},
"analyzer": {
"rebuilt_pattern": {
"tokenizer": "split_on_non_word",
"filter": [
"lowercase"
]
}
}
}
},
)
print(resp)
response = client.indices.create(
index: 'pattern_example',
body: {
settings: {
analysis: {
tokenizer: {
split_on_non_word: {
type: 'pattern',
pattern: '\\W+'
}
},
analyzer: {
rebuilt_pattern: {
tokenizer: 'split_on_non_word',
filter: [
'lowercase'
]
}
}
}
}
}
)
puts response
const response = await client.indices.create({
index: "pattern_example",
settings: {
analysis: {
tokenizer: {
split_on_non_word: {
type: "pattern",
pattern: "\\W+",
},
},
analyzer: {
rebuilt_pattern: {
tokenizer: "split_on_non_word",
filter: ["lowercase"],
},
},
},
},
});
console.log(response);
PUT /pattern_example
{
"settings": {
"analysis": {
"tokenizer": {
"split_on_non_word": {
"type": "pattern",
"pattern": "\\W+"
}
},
"analyzer": {
"rebuilt_pattern": {
"tokenizer": "split_on_non_word",
"filter": [
"lowercase"
]
}
}
}
}
}
The default pattern is \W+, which splits on non-word characters.
You'd add other token filters after lowercase.
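For example, a hypothetical variant that appends the built-in stop token filter after lowercase (the index name here is invented) might look like:

resp = client.indices.create(
    index="pattern_example_with_stop",  # hypothetical index name
    settings={
        "analysis": {
            "tokenizer": {
                "split_on_non_word": {
                    "type": "pattern",
                    "pattern": "\\W+"
                }
            },
            "analyzer": {
                "rebuilt_pattern": {
                    "tokenizer": "split_on_non_word",
                    "filter": [
                        "lowercase",
                        "stop"  # added after "lowercase", per the note above
                    ]
                }
            }
        }
    },
)
print(resp)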