Author: hardy.ferentschik
Date: 2008-11-11 15:16:10 -0500 (Tue, 11 Nov 2008)
New Revision: 15549
Added:
search/trunk/src/test-resources/org/hibernate/search/test/analyzer/solr/synonyms.properties
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilter.java
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilterFactory.java
Modified:
search/trunk/ivy.xml
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/SolrAnalyzerTest.java
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/Team.java
Log:
HSEARCH-255 Added a whole bunch of tests for different tokenizers and filters
Modified: search/trunk/ivy.xml
===================================================================
--- search/trunk/ivy.xml 2008-11-11 13:32:00 UTC (rev 15548)
+++ search/trunk/ivy.xml 2008-11-11 20:16:10 UTC (rev 15549)
@@ -21,10 +21,12 @@
<dependency org="javax.transaction" name="jta"
rev="1.1" conf="default->default"/>
<dependency org="org.apache.lucene" name="lucene-core"
rev="2.4.0" conf="default->default"/>
<dependency org="org.apache.lucene" name="lucene-snowball"
rev="2.4.0" conf="default->default"/>
+        <dependency org="org.apache.lucene" name="lucene-analyzers" rev="2.4.0" conf="default->default"/> <!-- optional -->
+        <dependency org="org.apache.commons" name="commons-codec" rev="1.3" conf="default->default"/> <!-- optional -->
<dependency org="javax.jms" name="jms" rev="1.1"
conf="default->default"/> <!-- optional -->
<dependency org="javax.annotation" name="jsr250-api"
rev="1.0" conf="default->default"/> <!-- optional -->
-        <dependency org="org.apache.solr" name="solr-core" rev="1.3.0" conf="default->default"/>
-        <dependency org="org.apache.solr" name="solr-common" rev="1.3.0" conf="default->default"/>
+        <dependency org="org.apache.solr" name="solr-core" rev="1.3.0" conf="default->default"/> <!-- optional -->
+        <dependency org="org.apache.solr" name="solr-common" rev="1.3.0" conf="default->default"/> <!-- optional -->
<!-- transitive dependencies -->
<dependency org="antlr" name="antlr" rev="2.7.6"
conf="test->default"/>
@@ -46,4 +48,4 @@
<dependency org="org.apache.derby" name="derby"
rev="10.2.1.6" conf="test->default"/>
</dependencies>
-</ivy-module>
\ No newline at end of file
+</ivy-module>
Added:
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilter.java
===================================================================
--- search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilter.java	(rev 0)
+++ search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilter.java	2008-11-11 20:16:10 UTC (rev 15549)
@@ -0,0 +1,36 @@
+// $Id$
+package org.hibernate.search.test.analyzer.solr;
+
+import java.io.Reader;
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.search.Filter;
+
+
+/**
+ * A filter which will actually insert spaces. Most filters/tokenizers remove them, but for testing it is
+ * sometimes better to insert them again ;-)
+ *
+ * @author Hardy Ferentschik
+ */
+public class InsertWhitespaceFilter extends TokenFilter {
+ public InsertWhitespaceFilter(TokenStream in) {
+ super( in );
+ }
+
+ public Token next(final Token reusableToken) throws IOException {
+ Token nextToken = input.next( reusableToken );
+ if ( nextToken != null ) {
+ nextToken.setTermBuffer( " " + nextToken.term() + " " );
+ return nextToken;
+ }
+ else {
+ return null;
+ }
+ }
+}
\ No newline at end of file
Added:
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilterFactory.java
===================================================================
---
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilterFactory.java
(rev 0)
+++
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilterFactory.java 2008-11-11
20:16:10 UTC (rev 15549)
@@ -0,0 +1,30 @@
+// $Id$
+/*
+* JBoss, Home of Professional Open Source
+* Copyright 2008, Red Hat Middleware LLC, and individual contributors
+* by the @authors tag. See the copyright.txt in the distribution for a
+* full listing of individual contributors.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
http://www.apache.org/licenses/LICENSE-2.0
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.hibernate.search.test.analyzer.solr;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.analysis.BaseTokenFilterFactory;
+
+/**
+ * @author Hardy Ferentschik
+ */
+public class InsertWhitespaceFilterFactory extends BaseTokenFilterFactory {
+ public InsertWhitespaceFilter create(TokenStream input) {
+ return new InsertWhitespaceFilter( input );
+ }
+}
Property changes on:
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilterFactory.java
___________________________________________________________________
Name: svn:keywords
+ Id
Modified:
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/SolrAnalyzerTest.java
===================================================================
---
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/SolrAnalyzerTest.java 2008-11-11
13:32:00 UTC (rev 15548)
+++
search/trunk/src/test/org/hibernate/search/test/analyzer/solr/SolrAnalyzerTest.java 2008-11-11
20:16:10 UTC (rev 15549)
@@ -3,11 +3,14 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
import org.hibernate.Transaction;
import org.hibernate.search.FullTextSession;
import org.hibernate.search.Search;
import org.hibernate.search.test.SearchTestCase;
+import org.hibernate.search.test.util.AnalyzerUtils;
/**
* Tests the Solr analyzer creation framework.
@@ -62,6 +65,90 @@
fts.close();
}
+ /**
+ * Tests the analyzers defined on {@link Team}.
+ *
+ * @throws Exception in case the test fails.
+ */
+ public void testAnalyzers() throws Exception {
+ FullTextSession fts = Search.getFullTextSession( openSession() );
+
+		Analyzer analyzer = fts.getSearchFactory().getAnalyzer( "standard_analyzer" );
+ String text = "This is just FOOBAR's";
+ Token[] tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "This",
"is", "just", "FOOBAR" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "html_standard_analyzer" );
+ text = "This is <b>foo</b><i>bar's</i>";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "This",
"is", "foo", "bar" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "html_whitespace_analyzer" );
+ text = "This is <b>foo</b><i>bar's</i>";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "This",
"is", "foo", "bar's" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "trim_analyzer" );
+ text = " Kittens! ";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "kittens" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "length_analyzer" );
+ text = "ab abc abcd abcde abcdef";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "abc",
"abcd", "abcde" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "length_analyzer" );
+ text = "ab abc abcd abcde abcdef";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "abc",
"abcd", "abcde" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "porter_analyzer" );
+ text = "bikes bikes biking";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "bike",
"bike", "bike" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "word_analyzer" );
+ text = "CamelCase";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "Camel",
"Case" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "synonym_analyzer" );
+ text = "ipod cosmos";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual( tokens, new String[] { "ipod",
"i-pod", "universe", "cosmos" } );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "shingle_analyzer" );
+ text = "please divide this sentence into shingles";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual(
+ tokens,
+ new String[] {
+ "please",
+ "please divide",
+ "divide",
+ "divide this",
+ "this",
+ "this sentence",
+ "sentence",
+ "sentence into",
+ "into",
+ "into shingles",
+ "shingles"
+ }
+ );
+
+ analyzer = fts.getSearchFactory().getAnalyzer( "phonetic_analyzer" );
+ text = "The quick brown fox jumped over the lazy dogs";
+ tokens = AnalyzerUtils.tokensFromAnalysis( analyzer, "name", text );
+ AnalyzerUtils.displayTokens( analyzer, "name", text );
+ AnalyzerUtils.assertTokensEqual(
+ tokens, new String[] { "0", "KK", "BRN",
"FKS", "JMPT", "OFR", "0", "LS",
"TKS" }
+ );
+
+ fts.close();
+ }
+
protected Class[] getMappings() {
return new Class[] {
Team.class
Modified: search/trunk/src/test/org/hibernate/search/test/analyzer/solr/Team.java
===================================================================
--- search/trunk/src/test/org/hibernate/search/test/analyzer/solr/Team.java 2008-11-11
13:32:00 UTC (rev 15548)
+++ search/trunk/src/test/org/hibernate/search/test/analyzer/solr/Team.java 2008-11-11
20:16:10 UTC (rev 15549)
@@ -5,14 +5,26 @@
import javax.persistence.GeneratedValue;
import javax.persistence.Id;
+import org.apache.solr.analysis.HTMLStripStandardTokenizerFactory;
+import org.apache.solr.analysis.HTMLStripWhitespaceTokenizerFactory;
import org.apache.solr.analysis.ISOLatin1AccentFilterFactory;
+import org.apache.solr.analysis.LengthFilterFactory;
import org.apache.solr.analysis.LowerCaseFilterFactory;
+import org.apache.solr.analysis.LowerCaseTokenizerFactory;
+import org.apache.solr.analysis.PorterStemFilterFactory;
+import org.apache.solr.analysis.ShingleFilterFactory;
import org.apache.solr.analysis.SnowballPorterFilterFactory;
+import org.apache.solr.analysis.StandardFilterFactory;
import org.apache.solr.analysis.StandardTokenizerFactory;
import org.apache.solr.analysis.StopFilterFactory;
+import org.apache.solr.analysis.SynonymFilterFactory;
+import org.apache.solr.analysis.TrimFilterFactory;
+import org.apache.solr.analysis.WordDelimiterFilterFactory;
+import org.apache.solr.analysis.PhoneticFilterFactory;
import org.hibernate.search.annotations.Analyzer;
import org.hibernate.search.annotations.AnalyzerDef;
+import org.hibernate.search.annotations.AnalyzerDefs;
import org.hibernate.search.annotations.DocumentId;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Indexed;
@@ -25,20 +37,94 @@
*/
@Entity
@Indexed
-@AnalyzerDef(name = "customanalyzer",
- tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
- filters = {
- @TokenFilterDef(factory = ISOLatin1AccentFilterFactory.class),
- @TokenFilterDef(factory = LowerCaseFilterFactory.class),
- @TokenFilterDef(factory = StopFilterFactory.class, params = {
- @Parameter(name = "words",
- value =
"org/hibernate/search/test/analyzer/solr/stoplist.properties"),
- @Parameter(name = "ignoreCase", value = "true")
+@AnalyzerDefs({
+ @AnalyzerDef(name = "customanalyzer",
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = ISOLatin1AccentFilterFactory.class),
+ @TokenFilterDef(factory = LowerCaseFilterFactory.class),
+ @TokenFilterDef(factory = StopFilterFactory.class, params = {
+ @Parameter(name = "words",
+ value =
"org/hibernate/search/test/analyzer/solr/stoplist.properties"),
+ @Parameter(name = "ignoreCase", value = "true")
+ }),
+ @TokenFilterDef(factory = SnowballPorterFilterFactory.class, params = {
+ @Parameter(name = "language", value = "English")
+ })
}),
- @TokenFilterDef(factory = SnowballPorterFilterFactory.class, params = {
- @Parameter(name = "language", value = "English")
+
+ @AnalyzerDef(name = "standard_analyzer",
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = StandardFilterFactory.class)
+ }),
+
+ @AnalyzerDef(name = "html_standard_analyzer",
+ tokenizer = @TokenizerDef(factory = HTMLStripStandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = StandardFilterFactory.class)
+ }),
+
+ @AnalyzerDef(name = "html_whitespace_analyzer",
+ tokenizer = @TokenizerDef(factory = HTMLStripWhitespaceTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = StandardFilterFactory.class)
+ }),
+
+ @AnalyzerDef(name = "trim_analyzer",
+ tokenizer = @TokenizerDef(factory = LowerCaseTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = InsertWhitespaceFilterFactory.class),
+ @TokenFilterDef(factory = TrimFilterFactory.class)
+ }),
+
+ @AnalyzerDef(name = "length_analyzer",
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = LengthFilterFactory.class, params = {
+ @Parameter(name = "min", value = "3"),
+ @Parameter(name = "max", value = "5")
+ })
+ }),
+
+ @AnalyzerDef(name = "porter_analyzer",
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = PorterStemFilterFactory.class)
+ }),
+
+ @AnalyzerDef(name = "word_analyzer",
+ tokenizer = @TokenizerDef(factory = HTMLStripStandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = WordDelimiterFilterFactory.class, params = {
+ @Parameter(name = "splitOnCaseChange", value = "1")
+ })
+ }),
+
+ @AnalyzerDef(name = "synonym_analyzer",
+ tokenizer = @TokenizerDef(factory = HTMLStripStandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = SynonymFilterFactory.class, params = {
+ @Parameter(name = "synonyms",
+ value =
"org/hibernate/search/test/analyzer/solr/synonyms.properties")
+ })
+ }),
+
+ @AnalyzerDef(name = "shingle_analyzer",
+ tokenizer = @TokenizerDef(factory = HTMLStripStandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = ShingleFilterFactory.class)
+ }),
+
+ @AnalyzerDef(name = "phonetic_analyzer",
+ tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
+ filters = {
+ @TokenFilterDef(factory = PhoneticFilterFactory.class, params = {
+ @Parameter(name = "encoder", value = "Metaphone"),
+ @Parameter(name = "inject", value = "true")
+ })
})
- })
+})
public class Team {
@Id
@DocumentId
Added:
search/trunk/src/test-resources/org/hibernate/search/test/analyzer/solr/synonyms.properties
===================================================================
---
search/trunk/src/test-resources/org/hibernate/search/test/analyzer/solr/synonyms.properties
(rev 0)
+++
search/trunk/src/test-resources/org/hibernate/search/test/analyzer/solr/synonyms.properties 2008-11-11
20:16:10 UTC (rev 15549)
@@ -0,0 +1,2 @@
+ipod, i-pod
+universe , cosmos