scm-webapp/src/main/java/sonia/scm/search/NonNaturalLanguageAnalyzer.java

/*
 * MIT License
 *
 * Copyright (c) 2020-present Cloudogu GmbH and Contributors
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package sonia.scm.search;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;
import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;

public class NonNaturalLanguageAnalyzer extends Analyzer {

  @Override
  protected TokenStreamComponents createComponents(String fieldName) {
    StandardTokenizer tokenizer = new StandardTokenizer();
    TokenStream stream = new WordDelimiterGraphFilter(
      tokenizer,
      GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | PRESERVE_ORIGINAL,
      null // list of words which are protected from being delimited
    );
    stream = new LowerCaseFilter(stream);
    return new TokenStreamComponents(tokenizer, stream);
  }
}
Refactor Search API and allow analyzer per field (#1755) The Search api is now simpler, because it provides useful defaults. Only if you want to deviate from the defaults, you can set these values. This is mostly reached by using the builder pattern. Furthermore it is now possible to configure an analyzer per field. The default analyzer is still the one which is derived from the index options, but it is possible to configure a new indexer with the analyzer attribute of the indexed annotation. The attribute allows the configuration for code, identifiers and path. The current implementation uses the same analyzer code, identifiers and path. The new implemented splits tokens on more delimiters as the default analyzer e.g.: dots, underscores etc. Co-authored-by: René Pfeuffer <rene.pfeuffer@cloudogu.com> 2021-08-05 08:21:46 +02:00			`/*`
			`* MIT License`
			`*`
			`* Copyright (c) 2020-present Cloudogu GmbH and Contributors`
			`*`
			`* Permission is hereby granted, free of charge, to any person obtaining a copy`
			`* of this software and associated documentation files (the "Software"), to deal`
			`* in the Software without restriction, including without limitation the rights`
			`* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell`
			`* copies of the Software, and to permit persons to whom the Software is`
			`* furnished to do so, subject to the following conditions:`
			`*`
			`* The above copyright notice and this permission notice shall be included in all`
			`* copies or substantial portions of the Software.`
			`*`
			`* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR`
			`* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,`
			`* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE`
			`* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER`
			`* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,`
			`* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE`
			`* SOFTWARE.`
			`*/`

			`package sonia.scm.search;`

			`import org.apache.lucene.analysis.Analyzer;`
			`import org.apache.lucene.analysis.LowerCaseFilter;`
			`import org.apache.lucene.analysis.TokenStream;`
			`import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;`
			`import org.apache.lucene.analysis.standard.StandardTokenizer;`

			`import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS;`
			`import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.GENERATE_WORD_PARTS;`
			`import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.PRESERVE_ORIGINAL;`
			`import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE;`
			`import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.SPLIT_ON_NUMERICS;`

			`public class NonNaturalLanguageAnalyzer extends Analyzer {`

			`@Override`
			`protected TokenStreamComponents createComponents(String fieldName) {`
			`StandardTokenizer tokenizer = new StandardTokenizer();`
			`TokenStream stream = new WordDelimiterGraphFilter(`
			`tokenizer,`
			`GENERATE_WORD_PARTS \| GENERATE_NUMBER_PARTS \| SPLIT_ON_CASE_CHANGE \| SPLIT_ON_NUMERICS \| PRESERVE_ORIGINAL,`
			`null // list of words which are protected from being delimited`
			`);`
			`stream = new LowerCaseFilter(stream);`
			`return new TokenStreamComponents(tokenizer, stream);`
			`}`
			`}`