Author: hardy.ferentschik
Date: 2010-09-17 05:15:09 -0400 (Fri, 17 Sep 2010)
New Revision: 20659
Added:
search/trunk/hibernate-search-solr-analyzers/
search/trunk/hibernate-search-solr-analyzers/pom.xml
search/trunk/hibernate-search-solr-analyzers/src/
search/trunk/hibernate-search-solr-analyzers/src/main/
search/trunk/hibernate-search-solr-analyzers/src/main/java/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicStemFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseCharFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenStreamFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BufferedTokenStream.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CJKTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CapitalizationFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CharFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ChineseFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ChineseTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DutchStemFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ElisionFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/FrenchStemFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/GermanStemFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripCharFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripReader.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ISOLatin1AccentFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeepWordFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeepWordFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeywordTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LengthFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LetterTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LowerCaseFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/MappingCharFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceCharFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternTokenizer.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PhoneticFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PhoneticFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PorterStemFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PositionFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReverseStringFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReversedWildcardFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianStemFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ShingleFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SolrAnalyzer.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StandardFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StandardTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StopFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymMap.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ThaiWordFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenizerChain.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TrimFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TrimFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterFilter.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterIterator.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/ResourceLoader.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/SolrException.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/util/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/util/StrUtils.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/ArraysUtils.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/CharArrayMap.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/Constants.java
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/plugin/
search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/plugin/ResourceLoaderAware.java
Modified:
search/trunk/hibernate-search-integrationtest/pom.xml
search/trunk/hibernate-search-testing/pom.xml
search/trunk/hibernate-search/pom.xml
search/trunk/hibernate-search/src/main/java/org/hibernate/search/Environment.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/bridge/LuceneOptions.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/engine/DocumentBuilderContainedEntity.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/ConfigContext.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/ImmutableSearchFactory.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/SolrAnalyzerBuilder.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/DelegateNamedAnalyzer.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/PassThroughAnalyzer.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/PluginLoader.java
search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/ScopedAnalyzer.java
search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/DoubleAnalyzerTest.java
search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/inheritance/ISOLatin1Analyzer.java
search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilter.java
search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/query/dsl/DSLTest.java
search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/query/dsl/Month.java
search/trunk/pom.xml
Log:
HSEARCH-593 updated all solr analyzers, deleted obsolete solr classes, cleanup of Search code
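
For illustration, a hedged sketch (not part of this commit) of how the new setting could be used: the property name comes from Environment.LUCENE_MATCH_VERSION below, and ConfigContext parses the value with Version.valueOf, so it must name an org.apache.lucene.util.Version constant.

	// assumes a plain Hibernate Configuration; any property source works the same way
	org.hibernate.cfg.Configuration cfg = new org.hibernate.cfg.Configuration();
	cfg.setProperty( "hibernate.search.lucene_version", "LUCENE_30" ); // LUCENE_30 is also the default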
Modified: search/trunk/hibernate-search/pom.xml
===================================================================
--- search/trunk/hibernate-search/pom.xml 2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/pom.xml 2010-09-17 09:15:09 UTC (rev 20659)
@@ -55,13 +55,9 @@
<artifactId>lucene-core</artifactId>
</dependency>
<dependency>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-core</artifactId>
+ <groupId>org.hibernate</groupId>
+ <artifactId>hibernate-search-solr-analyzers</artifactId>
</dependency>
- <!--<dependency>-->
- <!--<groupId>org.apache.lucene</groupId>-->
- <!--<artifactId>lucene-snowball</artifactId>-->
- <!--</dependency>-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/Environment.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/Environment.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/Environment.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -28,7 +28,7 @@
* @author Hardy Ferentschik
*/
public final class Environment {
-
+
private Environment() {
}
@@ -133,4 +133,9 @@
* If set to {@code true} the search statistic will be gathered.
*/
	public static final String GENERATE_STATS = "hibernate.search.generate_statistics";
+
+ /**
+ * The Lucene match version parameter. Needed since Lucene 3.x
+ */
+	public static final String LUCENE_MATCH_VERSION = "hibernate.search.lucene_version";
}
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/bridge/LuceneOptions.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/bridge/LuceneOptions.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/bridge/LuceneOptions.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -37,6 +37,8 @@
void addFieldToDocument(String name, String indexedString, Document document);
+ boolean isCompressed();
+
/**
* Might be removed in version 3.3 to better support Lucene 3
* which is missing COMPRESS Store Type.
@@ -62,6 +64,4 @@
* @deprecated likely to be removed in version 3.3, use #addFieldToDocument
*/
Float getBoost();
-
- boolean isCompressed();
}
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/engine/DocumentBuilderContainedEntity.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/engine/DocumentBuilderContainedEntity.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/engine/DocumentBuilderContainedEntity.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -255,8 +255,7 @@
}
else {
try {
-				return ( Analyzer ) PluginLoader.instanceFromConstructor( Analyzer.class, analyzerClass, Version.class,
-						Version.LUCENE_30, "");
+				return PluginLoader.analyzerInstanceFromClass( analyzerClass, context.getLuceneMatchVersion() );
}
catch ( ClassCastException e ) {
throw new SearchException(
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/ConfigContext.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/ConfigContext.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/ConfigContext.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -52,9 +52,10 @@
* @author Emmanuel Bernard
* @author Hardy Ferentschik
*/
-public class ConfigContext {
+public final class ConfigContext {
private static final Logger log = LoggerFactory.make();
+ private static final Version DEFAULT_LUCENE_MATCH_VERSION = Version.LUCENE_30;
	private final Map<String, AnalyzerDef> analyzerDefs = new HashMap<String, AnalyzerDef>();
	private final List<DelegateNamedAnalyzer> lazyAnalyzers = new ArrayList<DelegateNamedAnalyzer>();
@@ -63,7 +64,10 @@
private final boolean solrPresent;
private final boolean jpaPresent;
+ private final Version luceneMatchVersion;
+
public ConfigContext(SearchConfiguration cfg) {
+ luceneMatchVersion = getLuceneMatchVersion( cfg );
defaultAnalyzer = initAnalyzer( cfg );
defaultSimilarity = initSimilarity( cfg );
solrPresent = isPresent( "org.apache.solr.analysis.TokenizerFactory" );
@@ -110,14 +114,7 @@
else {
analyzerClass = StandardAnalyzer.class;
}
- Analyzer defaultAnalyzer = PluginLoader.instanceFromConstructor(
- Analyzer.class,
- analyzerClass,
- Version.class,
- Version.LUCENE_30,
- "Lucene analyzer"
- );
- return defaultAnalyzer;
+ return PluginLoader.analyzerInstanceFromClass( analyzerClass, luceneMatchVersion );
}
/**
@@ -150,6 +147,10 @@
return defaultSimilarity;
}
+ public Version getLuceneMatchVersion() {
+ return luceneMatchVersion;
+ }
+
public Map<String, Analyzer> initLazyAnalyzers() {
		Map<String, Analyzer> initializedAnalyzers = new HashMap<String, Analyzer>( analyzerDefs.size() );
@@ -186,26 +187,50 @@
"Use of @AnalyzerDef while Solr is not present in the classpath. Add
apache-solr-analyzer.jar"
);
}
+
// SolrAnalyzerBuilder references Solr classes.
		// InitContext should not (directly or indirectly) load a Solr class to avoid hard dependency
		// unless necessary
		// the current mechanism (check Solr class presence and call SolrAnalyzerBuilder if needed
// seems to be sufficient on Apple VM (derived from Sun's
- // TODO check on other VMs and be ready for a more reflexive approach
- return SolrAnalyzerBuilder.buildAnalyzer( analyzerDef );
+ return SolrAnalyzerBuilder.buildAnalyzer( analyzerDef, luceneMatchVersion );
}
public boolean isJpaPresent() {
return jpaPresent;
}
- private boolean isPresent(String classname) {
+ private boolean isPresent(String className) {
try {
- ReflectHelper.classForName( classname, ConfigContext.class );
+ ReflectHelper.classForName( className, ConfigContext.class );
return true;
}
catch ( Exception e ) {
return false;
}
}
+
+ private Version getLuceneMatchVersion(SearchConfiguration cfg) {
+ Version version;
+ String tmp = cfg.getProperty( Environment.LUCENE_MATCH_VERSION );
+ if ( StringHelper.isEmpty( tmp ) ) {
+ version = DEFAULT_LUCENE_MATCH_VERSION;
+ }
+ else {
+ try {
+ version = Version.valueOf( tmp );
+ }
+ catch ( IllegalArgumentException e ) {
+ StringBuilder msg = new StringBuilder( tmp );
+ msg.append( " is a invalid value for the Lucene match version. Possible values
are: " );
+ for ( Version v : Version.values() ) {
+ msg.append( v.toString() );
+ msg.append( ", " );
+ }
+ msg.delete( msg.lastIndexOf( "," ), msg.length() - 1 );
+ throw new SearchException( msg.toString() );
+ }
+ }
+ return version;
+ }
}
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/ImmutableSearchFactory.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/ImmutableSearchFactory.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/ImmutableSearchFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -127,7 +127,7 @@
this.readerProvider = cfg.readerProvider;
this.worker = cfg.worker;
this.statistics = new StatisticsImpl( this );
- String enableStats = configurationProperties.getProperty( Environment.JMX_ENABLED );
+		String enableStats = configurationProperties.getProperty( Environment.GENERATE_STATS );
if ( "true".equalsIgnoreCase( enableStats ) ) {
statistics.setStatisticsEnabled( true );
}
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/SolrAnalyzerBuilder.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/SolrAnalyzerBuilder.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/impl/SolrAnalyzerBuilder.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -34,7 +34,6 @@
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.analysis.TokenizerFactory;
import org.apache.solr.common.ResourceLoader;
-import org.apache.solr.schema.IndexSchema;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.hibernate.search.SearchException;
@@ -47,13 +46,15 @@
/**
 * Instances of this class are used to build Lucene analyzers which are defined using the solr <code>TokenFilterFactory</code>.
- * To make the dependency to the solr framework optional only this class has direct dependecies to solr. Solr dependencies
+ * To make the dependency to the solr framework optional only this class has direct dependencies to solr. Solr dependencies
 * are not supposed to be used anywhere else (except the actual configuration of the analyzers in the domain model).
*
* @author Emmanuel Bernard
* @author Hardy Ferentschik
*/
final class SolrAnalyzerBuilder {
+ private static final String SOLR_LUCENE_VERSION_PARAM = "luceneMatchVersion";
+
private SolrAnalyzerBuilder() {
}
@@ -61,13 +62,14 @@
	 * Builds a Lucene <code>Analyzer</code> from the specified <code>AnalyzerDef</code> annotation.
	 *
	 * @param analyzerDef The <code>AnalyzerDef</code> annotation as found in the annotated domain class.
+ * @param luceneMatchVersion The lucene version (required since Lucene 3.x)
*
* @return a Lucene <code>Analyzer</code>
*/
- public static Analyzer buildAnalyzer(AnalyzerDef analyzerDef) {
+	public static Analyzer buildAnalyzer(AnalyzerDef analyzerDef, Version luceneMatchVersion) {
TokenizerDef token = analyzerDef.tokenizer();
TokenizerFactory tokenFactory = ( TokenizerFactory ) instantiate( token.factory() );
- tokenFactory.init( getMapOfParameters( token.params() ) );
+ tokenFactory.init( getMapOfParameters( token.params(), luceneMatchVersion ) );
final int length = analyzerDef.filters().length;
final int charLength = analyzerDef.charFilters().length;
@@ -77,7 +79,7 @@
for ( int index = 0; index < length; index++ ) {
TokenFilterDef filterDef = analyzerDef.filters()[index];
filters[index] = ( TokenFilterFactory ) instantiate( filterDef.factory() );
- filters[index].init( getMapOfParameters( filterDef.params() ) );
+ filters[index].init( getMapOfParameters( filterDef.params(), luceneMatchVersion ) );
if ( filters[index] instanceof ResourceLoaderAware ) {
( ( ResourceLoaderAware ) filters[index] ).inform( resourceLoader );
}
@@ -85,7 +87,7 @@
for ( int index = 0; index < charFilters.length; index++ ) {
CharFilterDef charFilterDef = analyzerDef.charFilters()[index];
charFilters[index] = ( CharFilterFactory ) instantiate( charFilterDef.factory() );
- charFilters[index].init( getMapOfParameters( charFilterDef.params() ) );
+		charFilters[index].init( getMapOfParameters( charFilterDef.params(), luceneMatchVersion ) );
if ( charFilters[index] instanceof ResourceLoaderAware ) {
( ( ResourceLoaderAware ) charFilters[index] ).inform( resourceLoader );
}
@@ -103,17 +105,14 @@
catch ( InstantiationException e ) {
throw new SearchException( "Unable to instantiate class: " + clazz, e );
}
- catch ( Throwable e) {
- throw new SearchException( "foo");
- }
}
- private static Map<String, String> getMapOfParameters(Parameter[] params) {
+	private static Map<String, String> getMapOfParameters(Parameter[] params, Version luceneMatchVersion) {
		Map<String, String> mapOfParams = new HashMap<String, String>( params.length );
for ( Parameter param : params ) {
mapOfParams.put( param.name(), param.value() );
}
-		mapOfParams.put( IndexSchema.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_30.toString() );
+ mapOfParams.put( SOLR_LUCENE_VERSION_PARAM, luceneMatchVersion.toString() );
return Collections.unmodifiableMap( mapOfParams );
}
}
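
For illustration, a sketch (not part of this commit) of an @AnalyzerDef that SolrAnalyzerBuilder.buildAnalyzer consumes; the factories are among those repackaged in the new hibernate-search-solr-analyzers module, and the entity is hypothetical:

	@Entity
	@Indexed
	@AnalyzerDef(name = "customAnalyzer",
			tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
			filters = {
					@TokenFilterDef(factory = LowerCaseFilterFactory.class),
					@TokenFilterDef(factory = SnowballPorterFilterFactory.class,
							params = @Parameter(name = "language", value = "English"))
			})
	public class Book {
		// fields omitted
	}

With this commit the luceneMatchVersion parameter no longer needs to be declared in params; buildAnalyzer injects it into every factory.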
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/DelegateNamedAnalyzer.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/DelegateNamedAnalyzer.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/DelegateNamedAnalyzer.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -1,26 +1,25 @@
-/* $Id$
- *
+/*
* Hibernate, Relational Persistence for Idiomatic Java
- *
- * Copyright (c) 2009, Red Hat, Inc. and/or its affiliates or third-party contributors as
- * indicated by the @author tags or express copyright attribution
- * statements applied by the authors. All third-party contributions are
- * distributed under license by Red Hat, Inc.
- *
- * This copyrighted material is made available to anyone wishing to use, modify,
- * copy, or redistribute it subject to the terms and conditions of the GNU
- * Lesser General Public License, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this distribution; if not, write to:
- * Free Software Foundation, Inc.
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02110-1301 USA
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
*/
package org.hibernate.search.util;
@@ -31,10 +30,10 @@
import org.apache.lucene.analysis.TokenStream;
/**
- * delegate to a named analyzer
- * delegated Analyzers are lazily configured
+ * Delegate to a named analyzer. Delegated Analyzers are lazily configured.
*
* @author Emmanuel Bernard
+ * @author Hardy Ferentschik
*/
public final class DelegateNamedAnalyzer extends Analyzer {
private String name;
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/PassThroughAnalyzer.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/PassThroughAnalyzer.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/PassThroughAnalyzer.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -1,3 +1,26 @@
+/*
+ * Hibernate, Relational Persistence for Idiomatic Java
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
+ */
package org.hibernate.search.util;
import java.io.Reader;
@@ -5,13 +28,11 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.util.AttributeSource;
/**
* Analyzer that applies no operation whatsoever to the flux
* This is useful for queries operating on non tokenized fields.
- *
+ * <p/>
* TODO there is probably a way to make that much more efficient by
 * reimplementing TokenStream to take the Reader and pass through the flux as a single token
*
@@ -19,11 +40,9 @@
*/
public final class PassThroughAnalyzer extends Analyzer {
-
-
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
- return new PassThroughTokenizer(reader);
+ return new PassThroughTokenizer( reader );
}
private static class PassThroughTokenizer extends CharTokenizer {
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/PluginLoader.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/PluginLoader.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/PluginLoader.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -1,32 +1,34 @@
-/* $Id$
- *
+/*
* Hibernate, Relational Persistence for Idiomatic Java
- *
- * Copyright (c) 2009, Red Hat, Inc. and/or its affiliates or third-party contributors as
- * indicated by the @author tags or express copyright attribution
- * statements applied by the authors. All third-party contributions are
- * distributed under license by Red Hat, Inc.
- *
- * This copyrighted material is made available to anyone wishing to use, modify,
- * copy, or redistribute it subject to the terms and conditions of the GNU
- * Lesser General Public License, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this distribution; if not, write to:
- * Free Software Foundation, Inc.
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02110-1301 USA
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
*/
package org.hibernate.search.util;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.Version;
+
import org.hibernate.annotations.common.util.ReflectHelper;
import org.hibernate.search.SearchException;
@@ -35,14 +37,18 @@
* or from a class type.
* Uses reflection and throws SearchException(s) with proper descriptions of the error,
 * like the target class is missing a proper constructor, is an interface, is not found...
- *
+ *
* @author Sanne Grinovero
+ * @author Hardy Ferentschik
*/
public class PluginLoader {
-
+
+ private PluginLoader() {
+ }
+
/**
* Creates an instance of a target class designed by fully qualified name
- *
+ *
* @param <T> matches the type of targetSuperType: defines the return type
 * @param targetSuperType the return type of the function, the classNameToLoad will be checked
* to be assignable to this type.
@@ -50,135 +56,170 @@
* @param caller the class of the caller, needed for classloading purposes
 * @param componentDescription a meaningful description of the role the instance will have,
* used to enrich error messages to describe the context of the error
+ *
* @return a new instance of classNameToLoad
+ *
 * @throws SearchException wrapping other error types with a proper error message for all kind of problems, like
- * classNotFound, missing proper constructor, wrong type, security errors.
+ *                        classNotFound, missing proper constructor, wrong type, security errors.
*/
	public static <T> T instanceFromName(Class<T> targetSuperType, String classNameToLoad,
- Class<?> caller, String componentDescription) {
+ Class<?> caller, String componentDescription) {
final Class<?> clazzDef;
- try {
- clazzDef = ReflectHelper.classForName( classNameToLoad, caller );
- } catch (ClassNotFoundException e) {
- throw new SearchException( "Unable to find " + componentDescription +
- " implementation class: " + classNameToLoad, e );
- }
+ clazzDef = classForName( classNameToLoad, caller, componentDescription );
return instanceFromClass( targetSuperType, clazzDef, componentDescription );
}
-
+
/**
* Creates an instance of target class
+ *
* @param <T> the type of targetSuperType: defines the return type
 * @param targetSuperType the created instance will be checked to be assignable to this type
* @param classToLoad the class to be instantiated
* @param componentDescription a role name/description to contextualize error messages
+ *
* @return a new instance of classToLoad
+ *
 * @throws SearchException wrapping other error types with a proper error message for all kind of problems, like
- * missing proper constructor, wrong type, security errors.
+ * missing proper constructor, wrong type, security errors.
*/
@SuppressWarnings("unchecked")
	public static <T> T instanceFromClass(Class<T> targetSuperType, Class<?> classToLoad, String componentDescription) {
checkClassType( classToLoad, componentDescription );
- checkHasValidconstructor( classToLoad, componentDescription );
+ checkHasNoArgConstructor( classToLoad, componentDescription );
Object instance;
try {
- instance = classToLoad.newInstance();
+ instance = classToLoad.newInstance();
}
catch ( IllegalAccessException e ) {
throw new SearchException(
"Unable to instantiate " + componentDescription + " class: " +
classToLoad.getName() +
- ". Class or constructor is not accessible.", e );
+ ". Class or constructor is not accessible.", e
+ );
}
catch ( InstantiationException e ) {
throw new SearchException(
"Unable to instantiate " + componentDescription + " class: " +
classToLoad.getName() +
- ". Verify it has a no-args public constructor and is not abstract.", e );
+ ". Verify it has a no-args public constructor and is not abstract.", e
+ );
}
- if ( ! targetSuperType.isInstance( instance ) ) {
+ if ( !targetSuperType.isInstance( instance ) ) {
// have a proper error message according to interface implementation or subclassing
if ( targetSuperType.isInterface() ) {
throw new SearchException(
"Wrong configuration of " + componentDescription + ": class " +
classToLoad.getName()
- + " does not implement interface " + targetSuperType.getName() );
+ + " does not implement interface " + targetSuperType.getName()
+ );
}
else {
throw new SearchException(
"Wrong configuration of " + componentDescription + ": class " +
classToLoad.getName()
- + " is not a subtype of " + targetSuperType.getName() );
+ + " is not a subtype of " + targetSuperType.getName()
+ );
}
}
else {
- return (T) instance;
+ return ( T ) instance;
}
}
-	public static <T> T instanceFromConstructor(Class<T> targetSuperType, Class<?> classToLoad, Class<?> parameterType, Object parameterValue, String componentDescription) {
- checkClassType( classToLoad, componentDescription );
- //checkHasValidconstructor( classToLoad, componentDescription );
- Object instance = null;
+	public static Analyzer analyzerInstanceFromClass(Class<?> classToInstantiate, Version luceneMatchVersion) {
+ checkClassType( classToInstantiate, "analyzer" );
+ Analyzer analyzerInstance;
+
+ // try to get a constructor with a version parameter
+ Constructor constructor;
+ boolean useVersionParameter = true;
try {
- Constructor constructor = classToLoad.getConstructor( parameterType );
- instance = constructor.newInstance( parameterValue );
+ constructor = classToInstantiate.getConstructor( Version.class );
}
+ catch ( NoSuchMethodException e ) {
+ try {
+ constructor = classToInstantiate.getConstructor();
+ useVersionParameter = false;
+ }
+ catch ( NoSuchMethodException nsme ) {
+				StringBuilder msg = new StringBuilder( "Unable to instantiate analyzer class: " );
+				msg.append( classToInstantiate.getName() );
+				msg.append( ". Class neither has a default constructor nor a constructor with a Version parameter" );
+ throw new SearchException( msg.toString(), e );
+ }
+ }
+
+ try {
+ if ( useVersionParameter ) {
+ analyzerInstance = ( Analyzer ) constructor.newInstance( luceneMatchVersion );
+ }
+ else {
+ analyzerInstance = ( Analyzer ) constructor.newInstance();
+ }
+ }
catch ( IllegalAccessException e ) {
throw new SearchException(
- "Unable to instantiate " + componentDescription + " class: " +
classToLoad.getName() +
- ". Class or constructor is not accessible.", e );
+ "Unable to instantiate analyzer class: " + classToInstantiate.getName() +
+ ". Class or constructor is not accessible.", e
+ );
}
catch ( InstantiationException e ) {
throw new SearchException(
- "Unable to instantiate " + componentDescription + " class: " +
classToLoad.getName() +
- ". Verify it has a no-args public constructor and is not abstract.", e );
+ "Unable to instantiate analyzer class: " + classToInstantiate.getName() +
+ ". Verify it has a no-args public constructor and is not abstract.", e
+ );
}
- catch ( NoSuchMethodException e ) {
-		e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
- }
catch ( InvocationTargetException e ) {
-		e.printStackTrace(); //To change body of catch statement use File | Settings | File Templates.
+ throw new SearchException(
+ "Unable to instantiate analyzer class: " + classToInstantiate.getName() +
+ ". Verify it has a no-args public constructor and is not abstract.", e
+ );
}
- if ( ! targetSuperType.isInstance( instance ) ) {
- // have a proper error message according to interface implementation or subclassing
- if ( targetSuperType.isInterface() ) {
- throw new SearchException(
- "Wrong configuration of " + componentDescription + ": class " +
classToLoad.getName()
- + " does not implement interface " + targetSuperType.getName() );
- }
- else {
- throw new SearchException(
- "Wrong configuration of " + componentDescription + ": class " +
classToLoad.getName()
- + " is not a subtype of " + targetSuperType.getName() );
- }
- }
- else {
- return (T) instance;
- }
+ return analyzerInstance;
}
-
	private static void checkClassType(Class<?> classToLoad, String componentDescription) {
if ( classToLoad.isInterface() ) {
-			throw new SearchException( classToLoad.getName() + " defined for component " + componentDescription
- + " is an interface: implementation required." );
+ throw new SearchException(
+ classToLoad.getName() + " defined for component " + componentDescription
+ + " is an interface: implementation required."
+ );
}
}
/**
* Verifies if target class has a no-args constructor, and that it is
* accessible in current security manager.
+ *
* @param classToLoad the class type to check
 * @param componentDescription adds a meaningful description to the type to describe in the
- * exception message
+ * exception message
*/
-	public static void checkHasValidconstructor(Class<?> classToLoad, String componentDescription) {
+	private static void checkHasNoArgConstructor(Class<?> classToLoad, String componentDescription) {
try {
classToLoad.getConstructor();
- } catch (SecurityException e) {
-			throw new SearchException( classToLoad.getName() + " defined for component " + componentDescription
- + " could not be instantiated because of a security manager error", e );
- } catch (NoSuchMethodException e) {
-			throw new SearchException( classToLoad.getName() + " defined for component " + componentDescription
- + " is missing a no-arguments constructor" );
}
+ catch ( SecurityException e ) {
+ throw new SearchException(
+ classToLoad.getName() + " defined for component " + componentDescription
+ + " could not be instantiated because of a security manager error", e
+ );
+ }
+ catch ( NoSuchMethodException e ) {
+ throw new SearchException(
+ classToLoad.getName() + " defined for component " + componentDescription
+ + " is missing a no-arguments constructor"
+ );
+ }
}
+	private static Class<?> classForName(String classNameToLoad, Class<?> caller, String componentDescription) {
+ Class<?> clazzDef;
+ try {
+ clazzDef = ReflectHelper.classForName( classNameToLoad, caller );
+ }
+ catch ( ClassNotFoundException e ) {
+ throw new SearchException(
+ "Unable to find " + componentDescription +
+ " implementation class: " + classNameToLoad, e
+ );
+ }
+ return clazzDef;
+ }
}
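
For illustration, a hypothetical analyzer (not part of this commit) showing the constructor lookup order of the new analyzerInstanceFromClass: a Version constructor is preferred, and a no-args constructor is used only as a fallback.

	import java.io.Reader;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.standard.StandardTokenizer;
	import org.apache.lucene.util.Version;

	// hypothetical example class, not shipped with Hibernate Search
	public final class VersionAwareAnalyzer extends Analyzer {
		private final Version matchVersion;

		public VersionAwareAnalyzer(Version matchVersion) {
			this.matchVersion = matchVersion;
		}

		@Override
		public TokenStream tokenStream(String fieldName, Reader reader) {
			// tokenization rules follow the configured hibernate.search.lucene_version
			return new StandardTokenizer( matchVersion, reader );
		}
	}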
Modified: search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/ScopedAnalyzer.java
===================================================================
--- search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/ScopedAnalyzer.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/main/java/org/hibernate/search/util/ScopedAnalyzer.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -1,26 +1,25 @@
-/* $Id$
- *
+/*
* Hibernate, Relational Persistence for Idiomatic Java
- *
- * Copyright (c) 2009, Red Hat, Inc. and/or its affiliates or third-party contributors as
- * indicated by the @author tags or express copyright attribution
- * statements applied by the authors. All third-party contributions are
- * distributed under license by Red Hat, Inc.
- *
- * This copyrighted material is made available to anyone wishing to use, modify,
- * copy, or redistribute it subject to the terms and conditions of the GNU
- * Lesser General Public License, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
- * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
- * for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this distribution; if not, write to:
- * Free Software Foundation, Inc.
- * 51 Franklin Street, Fifth Floor
- * Boston, MA 02110-1301 USA
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
*/
package org.hibernate.search.util;
Modified: search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/DoubleAnalyzerTest.java
===================================================================
--- search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/DoubleAnalyzerTest.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/DoubleAnalyzerTest.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -80,5 +80,4 @@
tx.commit();
s.close();
}
-
}
Modified: search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/inheritance/ISOLatin1Analyzer.java
===================================================================
--- search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/inheritance/ISOLatin1Analyzer.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/inheritance/ISOLatin1Analyzer.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -29,7 +29,6 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.util.Version;
import org.hibernate.search.test.SearchTestCase;
@@ -38,9 +37,6 @@
*/
public final class ISOLatin1Analyzer extends Analyzer {
- public ISOLatin1Analyzer(Version version) {
- }
-
public TokenStream tokenStream(String s, Reader reader) {
		TokenStream result = new StandardTokenizer( SearchTestCase.getTargetLuceneVersion(), reader );
return new ASCIIFoldingFilter( result );
Modified: search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilter.java
===================================================================
--- search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilter.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/analyzer/solr/InsertWhitespaceFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -43,7 +43,7 @@
public InsertWhitespaceFilter(TokenStream in) {
super( in );
- termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ termAtt = addAttribute(TermAttribute.class);
}
@Override
Modified: search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/query/dsl/DSLTest.java
===================================================================
--- search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/query/dsl/DSLTest.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/query/dsl/DSLTest.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -1,3 +1,26 @@
+/*
+ * Hibernate, Relational Persistence for Idiomatic Java
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
+ */
package org.hibernate.search.test.query.dsl;
import java.util.Calendar;
@@ -383,12 +406,12 @@
query = monthQb.
phrase()
- .withSlop( 1 )
+ .withSlop( 3 )
.onField( "mythology" )
.sentence( "Month whitening" )
.createQuery();
-//		assertEquals( "test slop", 1, fts.createFullTextQuery( query, Month.class ).getResultSize() );
+		assertEquals( "test slop", 1, fts.createFullTextQuery( query, Month.class ).getResultSize() );
query = monthQb.
phrase()
Modified: search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/query/dsl/Month.java
===================================================================
--- search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/query/dsl/Month.java	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search/src/test/java/org/hibernate/search/test/query/dsl/Month.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -1,3 +1,27 @@
+/*
+ * Hibernate, Relational Persistence for Idiomatic Java
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
+ */
+
package org.hibernate.search.test.query.dsl;
import java.util.Date;
Modified: search/trunk/hibernate-search-integrationtest/pom.xml
===================================================================
--- search/trunk/hibernate-search-integrationtest/pom.xml	2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search-integrationtest/pom.xml	2010-09-17 09:15:09 UTC (rev 20659)
@@ -46,7 +46,7 @@
<!-- Hibernate -->
<dependency>
- <groupId>${groupId}</groupId>
+ <groupId>${project.groupId}</groupId>
<artifactId>hibernate-search</artifactId>
<version>${project.version}</version>
<scope>test</scope>
@@ -68,12 +68,6 @@
<version>3.12.0.GA</version>
<scope>test</scope>
</dependency>
- <dependency>
- <groupId>cglib</groupId>
- <artifactId>cglib</artifactId>
- <version>2.2</version>
- <scope>test</scope>
- </dependency>
<!-- test -->
<dependency>
Added: search/trunk/hibernate-search-solr-analyzers/pom.xml
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/pom.xml (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/pom.xml	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ * Hibernate, Relational Persistence for Idiomatic Java
+ *
+ * Copyright (c) 2009, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
+ -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>hibernate-search-parent</artifactId>
+ <groupId>org.hibernate</groupId>
+ <version>3.3.0-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+
+ <artifactId>hibernate-search-solr-analyzers</artifactId>
+
+ <name>Hibernate Search Analyzer Framework</name>
+ <description>Hibernate Search Analyzer Framework</description>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-analyzers</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-snowball</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-collation</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-spellchecker</artifactId>
+ </dependency>
+
+ <!-- Apache Commons -->
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <testResources>
+ <testResource>
+ <filtering>true</filtering>
+ <directory>src/test/resources</directory>
+ <includes>
+ <include>**/*.properties</include>
+ <include>**/*.xml</include>
+ </includes>
+ </testResource>
+ </testResources>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <forkMode>once</forkMode>
+                    <redirectTestOutputToFile>true</redirectTestOutputToFile>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-source-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+</project>
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.ASCIIFoldingFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory {
+ public ASCIIFoldingFilter create(TokenStream input) {
+ return new ASCIIFoldingFilter( input );
+ }
+}
+
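The factory above is stateless, so driving it by hand is straightforward. A minimal sketch outside any framework wiring, assuming the Lucene 2.9-era WhitespaceTokenizer and TermAttribute APIs (statements shown as they would appear inside a test method):

    import java.io.StringReader;
    import java.util.Collections;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // init() stores the (empty) argument map; create() wraps any TokenStream.
    ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory();
    factory.init( Collections.<String, String>emptyMap() );

    TokenStream stream = factory.create( new WhitespaceTokenizer( new StringReader( "café" ) ) );
    TermAttribute term = stream.addAttribute( TermAttribute.class );
    while ( stream.incrementToken() ) {
        System.out.println( term.term() ); // prints "cafe": diacritics folded to ASCII
    }
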
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,28 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
+
+public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory {
+
+ public ArabicLetterTokenizer create(Reader input) {
+ return new ArabicLetterTokenizer( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,32 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
+
+
+/**
+ *
+ *
+ **/
+public class ArabicNormalizationFilterFactory extends BaseTokenFilterFactory {
+
+ public ArabicNormalizationFilter create(TokenStream input) {
+ return new ArabicNormalizationFilter( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicStemFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicStemFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ArabicStemFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,32 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ar.ArabicStemFilter;
+
+
+/**
+ *
+ *
+ **/
+public class ArabicStemFilterFactory extends BaseTokenFilterFactory {
+
+ public ArabicStemFilter create(TokenStream input) {
+ return new ArabicStemFilter( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseCharFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseCharFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseCharFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * @version $Id: BaseCharFilterFactory.java 890798 2009-12-15 14:13:59Z koji $
+ * @since Solr 1.4
+ */
+public abstract class BaseCharFilterFactory implements CharFilterFactory {
+
+ public static final Logger log = LoggerFactory.getLogger( BaseCharFilterFactory.class );
+
+ /**
+ * The init args
+ */
+ protected Map<String, String> args;
+
+ public Map<String, String> getArgs() {
+ return args;
+ }
+
+ public void init(Map<String, String> args) {
+ this.args = args;
+ }
+
+ protected int getInt(String name) {
+ return getInt( name, -1, false );
+ }
+
+ protected int getInt(String name, int defaultVal) {
+ return getInt( name, defaultVal, true );
+ }
+
+ protected int getInt(String name, int defaultVal, boolean useDefault) {
+ String s = args.get( name );
+ if ( s == null ) {
+ if ( useDefault ) {
+ return defaultVal;
+ }
+ throw new RuntimeException( "Configuration Error: missing parameter '" + name + "'" );
+ }
+ return Integer.parseInt( s );
+ }
+}
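The getInt variants above encode a required-versus-optional parameter convention. A hypothetical subclass, purely to illustrate the two flavours:

    import java.util.Map;

    import org.apache.lucene.analysis.CharStream;

    // Hypothetical factory: "window" is mandatory, "overlap" defaults to 0.
    public class WindowCharFilterFactory extends BaseCharFilterFactory {
        private int window;
        private int overlap;

        @Override
        public void init(Map<String, String> args) {
            super.init( args );
            window = getInt( "window" );      // throws RuntimeException if the parameter is absent
            overlap = getInt( "overlap", 0 ); // falls back to the default
        }

        public CharStream create(CharStream input) {
            return input; // no-op body; only the parameter handling matters here
        }
    }
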
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Simple abstract implementation that handles init arg processing.
+ *
+ * @version $Id: BaseTokenFilterFactory.java 923109 2010-03-15 08:09:34Z uschindler $
+ */
+public abstract class BaseTokenFilterFactory extends BaseTokenStreamFactory implements TokenFilterFactory {
+ public static final Logger log = LoggerFactory.getLogger( BaseTokenFilterFactory.class );
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenStreamFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenStreamFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenStreamFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,141 @@
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.util.Version;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.Constants;
+
+
+/**
+ * Simple abstract implementation that handles init arg processing. It is not really
+ * a factory, as it implements no interface, but it removes code duplication
+ * in its subclasses.
+ *
+ * @version $Id: BaseTokenStreamFactory.java 929782 2010-04-01 02:15:27Z rmuir $
+ */
+abstract class BaseTokenStreamFactory {
+ /**
+ * The init args
+ */
+ protected Map<String, String> args;
+
+ /**
+ * the luceneVersion arg
+ */
+ protected Version luceneMatchVersion = null;
+
+ public void init(Map<String, String> args) {
+ this.args = args;
+ String matchVersion = args.get( Constants.LUCENE_MATCH_VERSION_PARAM );
+ if ( matchVersion != null ) {
+ luceneMatchVersion = parseLuceneVersionString( matchVersion );
+ }
+ }
+
+ public Map<String, String> getArgs() {
+ return args;
+ }
+
+ /**
+ * This method can be called in the {@link #create} method
+ * to inform the user that this factory requires a {@link #luceneMatchVersion} parameter.
+ */
+ protected final void assureMatchVersion() {
+ if ( luceneMatchVersion == null ) {
+ throw new RuntimeException(
+ "Configuration Error: Factory '" + this.getClass().getName() +
+ "' needs a 'luceneMatchVersion' parameter"
+ );
+ }
+ }
+
+ // TODO: move these somewhere that tokenizers and others
+ // can also use them...
+
+ protected int getInt(String name) {
+ return getInt( name, -1, false );
+ }
+
+ protected int getInt(String name, int defaultVal) {
+ return getInt( name, defaultVal, true );
+ }
+
+ protected int getInt(String name, int defaultVal, boolean useDefault) {
+ String s = args.get( name );
+ if ( s == null ) {
+ if ( useDefault ) {
+ return defaultVal;
+ }
+ throw new RuntimeException( "Configuration Error: missing parameter '" + name + "'" );
+ }
+ return Integer.parseInt( s );
+ }
+
+ protected boolean getBoolean(String name, boolean defaultVal) {
+ return getBoolean( name, defaultVal, true );
+ }
+
+ protected boolean getBoolean(String name, boolean defaultVal, boolean useDefault) {
+ String s = args.get( name );
+ if ( s == null ) {
+ if ( useDefault ) {
+ return defaultVal;
+ }
+ throw new RuntimeException( "Configuration Error: missing parameter '" + name + "'" );
+ }
+ return Boolean.parseBoolean( s );
+ }
+
+ protected CharArraySet getWordSet(ResourceLoader loader,
+ String wordFiles, boolean ignoreCase) throws IOException {
+ assureMatchVersion();
+ List<String> files = StrUtils.splitFileNames( wordFiles );
+ CharArraySet words = null;
+ if ( files.size() > 0 ) {
+ // default stopwords list has 35 or so words, but maybe don't make it that
+ // big to start
+ words = new CharArraySet( files.size() * 10, ignoreCase );
+ for ( String file : files ) {
+ List<String> wlist = loader.getLines( file.trim() );
+ words.addAll(
+ StopFilter.makeStopSet(
+ wlist,
+ ignoreCase
+ )
+ );
+ }
+ }
+ return words;
+ }
+
+ private Version parseLuceneVersionString(final String matchVersion) {
+ String parsedMatchVersion = matchVersion.toUpperCase( Locale.ENGLISH );
+
+ // be lenient with the supplied version parameter
+ parsedMatchVersion = parsedMatchVersion.replaceFirst( "^(\\d)\\.(\\d)$", "LUCENE_$1$2" );
+
+ final Version version;
+ try {
+ version = Version.valueOf( parsedMatchVersion );
+ }
+ catch ( IllegalArgumentException iae ) {
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR,
+ "Invalid luceneMatchVersion '" + matchVersion +
+ "', valid values are: " + Arrays.toString( Version.values() ) +
+ " or a string in format 'V.V'", iae, false
+ );
+ }
+
+ return version;
+ }
+}
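parseLuceneVersionString is deliberately lenient: a bare "2.9" is rewritten to the LUCENE_29 form before Version.valueOf is attempted. A short sketch of supplying the parameter to any concrete factory; someTokenizerFactory is a placeholder for whichever subclass is in use:

    import java.util.HashMap;
    import java.util.Map;

    import org.apache.solr.util.Constants;

    Map<String, String> args = new HashMap<String, String>();
    // "2.9" and "LUCENE_29" are equivalent after the lenient rewrite above
    args.put( Constants.LUCENE_MATCH_VERSION_PARAM, "2.9" );
    someTokenizerFactory.init( args ); // luceneMatchVersion is now Version.LUCENE_29
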
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BaseTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Simple abstract implementation that handles init arg processing.
+ *
+ * @version $Id: BaseTokenizerFactory.java 923109 2010-03-15 08:09:34Z uschindler $
+ */
+public abstract class BaseTokenizerFactory extends BaseTokenStreamFactory implements TokenizerFactory {
+ public static final Logger log = LoggerFactory.getLogger( BaseTokenizerFactory.class );
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.br.BrazilianStemFilter;
+
+public class BrazilianStemFilterFactory extends BaseTokenFilterFactory {
+ public BrazilianStemFilter create(TokenStream in) {
+ return new BrazilianStemFilter( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BufferedTokenStream.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BufferedTokenStream.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/BufferedTokenStream.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Handles input and output buffering of TokenStream
+ * <p/>
+ * <pre>
+ * // Example of a class implementing the rule "A" "B" => "Q" "B"
+ * class MyTokenStream extends BufferedTokenStream {
+ * public MyTokenStream(TokenStream input) {super(input);}
+ * protected Token process(Token t) throws IOException {
+ * if ("A".equals(t.termText())) {
+ * Token t2 = read();
+ *       if (t2!=null && "B".equals(t2.termText())) t.setTermText("Q");
+ * if (t2!=null) pushBack(t2);
+ * }
+ * return t;
+ * }
+ * }
+ * <p/>
+ * // Example of a class implementing "A" "B" => "A" "A" "B"
+ * class MyTokenStream extends BufferedTokenStream {
+ * public MyTokenStream(TokenStream input) {super(input);}
+ * protected Token process(Token t) throws IOException {
+ *     if ("A".equals(t.termText()) && "B".equals(peek(1).termText()))
+ * write((Token)t.clone());
+ * return t;
+ * }
+ * }
+ * </pre>
+ * <p/>
+ * NOTE: BufferedTokenStream does not clone() any Tokens. This is instead the
+ * responsibility of the implementing subclass. In the "A" "B" =>
"A" "A" "B"
+ * example above, the subclass must clone the additional "A" it creates.
+ *
+ * @deprecated This class does not support custom attributes. Extend TokenFilter instead,
+ * using {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState()}
+ * which support all attributes.
+ */
+@Deprecated
+public abstract class BufferedTokenStream extends TokenFilter {
+ // in the future, might be faster if we implemented as an array based CircularQueue
+ private final LinkedList<Token> inQueue = new LinkedList<Token>();
+ private final LinkedList<Token> outQueue = new LinkedList<Token>();
+
+ private final TermAttribute termAtt = addAttribute( TermAttribute.class );
+ private final OffsetAttribute offsetAtt = addAttribute( OffsetAttribute.class );
+ private final TypeAttribute typeAtt = addAttribute( TypeAttribute.class );
+ private final FlagsAttribute flagsAtt = addAttribute( FlagsAttribute.class );
+ private final PayloadAttribute payloadAtt = addAttribute( PayloadAttribute.class );
+ private final PositionIncrementAttribute posIncAtt = addAttribute( PositionIncrementAttribute.class );
+
+ public BufferedTokenStream(TokenStream input) {
+ super( input );
+ }
+
+ /**
+ * Process a token. Subclasses may read more tokens from the input stream,
+ * write more tokens to the output stream, or simply return the next token
+ * to be output. Subclasses may return null if the token is to be dropped.
+ * If a subclass writes tokens to the output stream and returns a
+ * non-null Token, the returned Token is considered to be at the head of
+ * the token output stream.
+ */
+ protected abstract Token process(Token t) throws IOException;
+
+ public final boolean incrementToken() throws IOException {
+ while ( true ) {
+ if ( !outQueue.isEmpty() ) {
+ return writeToken( outQueue.removeFirst() );
+ }
+ Token t = read();
+ if ( null == t ) {
+ return false;
+ }
+ Token out = process( t );
+ if ( null != out ) {
+ return writeToken( out );
+ }
+ // loop back to top in case process() put something on the output queue
+ }
+ }
+
+ /**
+ * Read a token from the buffered input stream.
+ *
+ * @return null at EOS
+ */
+ protected Token read() throws IOException {
+ if ( inQueue.isEmpty() ) {
+ Token t = readToken();
+ return t;
+ }
+ return inQueue.removeFirst();
+ }
+
+ /**
+ * Push a token back into the buffered input stream, such that it will
+ * be returned by a future call to <code>read()</code>
+ */
+ protected void pushBack(Token t) {
+ inQueue.addFirst( t );
+ }
+
+ /**
+ * Peek n tokens ahead in the buffered input stream, without modifying
+ * the stream.
+ *
+ * @param n Number of tokens into the input stream to peek, 1 based ...
+ * 0 is invalid
+ *
+ * @return a Token which exists in the input stream, any modifications
+ * made to this Token will be "real" if/when the Token is
+ * <code>read()</code> from the stream.
+ */
+ protected Token peek(int n) throws IOException {
+ int fillCount = n - inQueue.size();
+ for ( int i = 0; i < fillCount; i++ ) {
+ Token t = readToken();
+ if ( null == t ) {
+ return null;
+ }
+ inQueue.addLast( t );
+ }
+ return inQueue.get( n - 1 );
+ }
+
+ /**
+ * old api emulation for back compat
+ */
+ private Token readToken() throws IOException {
+ if ( !input.incrementToken() ) {
+ return null;
+ }
+ else {
+ Token token = new Token();
+ token.setTermBuffer( termAtt.termBuffer(), 0, termAtt.termLength() );
+ token.setOffset( offsetAtt.startOffset(), offsetAtt.endOffset() );
+ token.setType( typeAtt.type() );
+ token.setFlags( flagsAtt.getFlags() );
+ token.setPositionIncrement( posIncAtt.getPositionIncrement() );
+ token.setPayload( payloadAtt.getPayload() );
+ return token;
+ }
+ }
+
+ /**
+ * old api emulation for back compat
+ */
+ private boolean writeToken(Token token) throws IOException {
+ clearAttributes();
+ termAtt.setTermBuffer( token.termBuffer(), 0, token.termLength() );
+ offsetAtt.setOffset( token.startOffset(), token.endOffset() );
+ typeAtt.setType( token.type() );
+ flagsAtt.setFlags( token.getFlags() );
+ posIncAtt.setPositionIncrement( token.getPositionIncrement() );
+ payloadAtt.setPayload( token.getPayload() );
+ return true;
+ }
+
+ /**
+ * Write a token to the buffered output stream
+ */
+ protected void write(Token t) {
+ outQueue.addLast( t );
+ }
+
+ /**
+ * Provides direct Iterator access to the buffered output stream.
+ * Modifying any token in this Iterator will affect the resulting stream.
+ */
+ protected Iterable<Token> output() {
+ return outQueue;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ inQueue.clear();
+ outQueue.clear();
+ }
+
+}
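Since the class above is deprecated, here is a rough sketch of the replacement its @deprecated note recommends: a plain TokenFilter that buffers by capturing and restoring attribute state rather than copying Token instances. The example filter (hypothetical) stacks a duplicate of every token at the same position, assuming the Lucene 2.9 attribute API:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.util.AttributeSource;

    final class DuplicatingFilter extends TokenFilter {
        private final PositionIncrementAttribute posIncAtt = addAttribute( PositionIncrementAttribute.class );
        private AttributeSource.State saved;

        DuplicatingFilter(TokenStream input) {
            super( input );
        }

        @Override
        public boolean incrementToken() throws IOException {
            if ( saved != null ) {
                restoreState( saved );               // replay the captured token...
                posIncAtt.setPositionIncrement( 0 ); // ...stacked at the same position
                saved = null;
                return true;
            }
            if ( !input.incrementToken() ) {
                return false;
            }
            saved = captureState(); // all attributes survive, including custom ones
            return true;
        }
    }
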
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CJKTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CJKTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CJKTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.cjk.CJKTokenizer;
+
+public class CJKTokenizerFactory extends BaseTokenizerFactory {
+ public CJKTokenizer create(Reader in) {
+ return new CJKTokenizer( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CapitalizationFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CapitalizationFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CapitalizationFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,246 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+import java.util.StringTokenizer;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A filter to apply normal capitalization rules to Tokens. It will make the first letter
+ * capital and the rest lower case.
+ * <p/>
+ * This filter is particularly useful to build nice looking facet parameters. This filter
+ * is not appropriate if you intend to use a prefix query.
+ * <p/>
+ * The factory takes parameters:<br/>
+ * "onlyFirstWord" - should each word be capitalized or all of the words?<br/>
+ * "keep" - a keep word list. Each word that should be kept separated by whitespace.<br/>
+ * "keepIgnoreCase" - true or false. If true, the keep list will be considered case-insensitive.<br/>
+ * "forceFirstLetter" - Force the first letter to be capitalized even if it is in the keep list<br/>
+ * "okPrefix" - do not change word capitalization if a word begins with something in this list.
+ * For example, if "McK" is on the okPrefix list, the word "McKinley" should not be changed to
+ * "Mckinley"<br/>
+ * "minWordLength" - how long the word needs to be to get capitalization applied. If the
+ * minWordLength is 3, "and" > "And" but "or" stays "or"<br/>
+ * "maxWordCount" - if the token contains more than maxWordCount words, the capitalization is
+ * assumed to be correct.<br/>
+ *
+ * @version $Id: CapitalizationFilterFactory.java 891596 2009-12-17 09:19:06Z shalin $
+ * @since solr 1.3
+ */
+public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
+ public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
+ public static final String KEEP = "keep";
+ public static final String KEEP_IGNORE_CASE = "keepIgnoreCase";
+ public static final String OK_PREFIX = "okPrefix";
+ public static final String MIN_WORD_LENGTH = "minWordLength";
+ public static final String MAX_WORD_COUNT = "maxWordCount";
+ public static final String MAX_TOKEN_LENGTH = "maxTokenLength";
+ public static final String ONLY_FIRST_WORD = "onlyFirstWord";
+ public static final String FORCE_FIRST_LETTER = "forceFirstLetter";
+
+ //Map<String,String> keep = new HashMap<String, String>(); // not synchronized because it is only initialized once
+ CharArraySet keep;
+
+ Collection<char[]> okPrefix = Collections.emptyList(); // for Example: McK
+
+ int minWordLength = 0; // don't modify capitalization for words shorter than this
+ int maxWordCount = DEFAULT_MAX_WORD_COUNT;
+ int maxTokenLength = DEFAULT_MAX_WORD_COUNT;
+ boolean onlyFirstWord = true;
+ boolean forceFirstLetter = true; // make sure the first letter is capital even if it is in the keep list
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+
+ String k = args.get( KEEP );
+ if ( k != null ) {
+ StringTokenizer st = new StringTokenizer( k );
+ boolean ignoreCase = false;
+ String ignoreStr = args.get( KEEP_IGNORE_CASE );
+ if ( "true".equalsIgnoreCase( ignoreStr ) ) {
+ ignoreCase = true;
+ }
+ keep = new CharArraySet( 10, ignoreCase );
+ while ( st.hasMoreTokens() ) {
+ k = st.nextToken().trim();
+ keep.add( k.toCharArray() );
+ }
+ }
+
+ k = args.get( OK_PREFIX );
+ if ( k != null ) {
+ okPrefix = new ArrayList<char[]>();
+ StringTokenizer st = new StringTokenizer( k );
+ while ( st.hasMoreTokens() ) {
+ okPrefix.add( st.nextToken().trim().toCharArray() );
+ }
+ }
+
+ k = args.get( MIN_WORD_LENGTH );
+ if ( k != null ) {
+ minWordLength = Integer.valueOf( k );
+ }
+
+ k = args.get( MAX_WORD_COUNT );
+ if ( k != null ) {
+ maxWordCount = Integer.valueOf( k );
+ }
+
+ k = args.get( MAX_TOKEN_LENGTH );
+ if ( k != null ) {
+ maxTokenLength = Integer.valueOf( k );
+ }
+
+ k = args.get( ONLY_FIRST_WORD );
+ if ( k != null ) {
+ onlyFirstWord = Boolean.valueOf( k );
+ }
+
+ k = args.get( FORCE_FIRST_LETTER );
+ if ( k != null ) {
+ forceFirstLetter = Boolean.valueOf( k );
+ }
+ }
+
+
+ public void processWord(char[] buffer, int offset, int length, int wordCount) {
+ if ( length < 1 ) {
+ return;
+ }
+ if ( onlyFirstWord && wordCount > 0 ) {
+ for ( int i = 0; i < length; i++ ) {
+ buffer[offset + i] = Character.toLowerCase( buffer[offset + i] );
+
+ }
+ return;
+ }
+
+ if ( keep != null && keep.contains( buffer, offset, length ) ) {
+ if ( wordCount == 0 && forceFirstLetter ) {
+ buffer[offset] = Character.toUpperCase( buffer[offset] );
+ }
+ return;
+ }
+ if ( length < minWordLength ) {
+ return;
+ }
+ for ( char[] prefix : okPrefix ) {
+ if ( length >= prefix.length ) { //don't bother checking if the buffer length is less than the prefix
+ boolean match = true;
+ for ( int i = 0; i < prefix.length; i++ ) {
+ if ( prefix[i] != buffer[offset + i] ) {
+ match = false;
+ break;
+ }
+ }
+ if ( match == true ) {
+ return;
+ }
+ }
+ }
+
+ // We know it has at least one character
+ /*char[] chars = w.toCharArray();
+ StringBuilder word = new StringBuilder( w.length() );
+ word.append( Character.toUpperCase( chars[0] ) );*/
+ buffer[offset] = Character.toUpperCase( buffer[offset] );
+
+ for ( int i = 1; i < length; i++ ) {
+ buffer[offset + i] = Character.toLowerCase( buffer[offset + i] );
+ }
+ //return word.toString();
+ }
+
+ public CapitalizationFilter create(TokenStream input) {
+ return new CapitalizationFilter( input, this );
+ }
+}
+
+
+/**
+ * This relies on the Factory so that the difficult stuff does not need to be
+ * re-initialized each time the filter runs.
+ * <p/>
+ * This is package protected since it is not useful without the Factory
+ */
+class CapitalizationFilter extends TokenFilter {
+ private final CapitalizationFilterFactory factory;
+ private final TermAttribute termAtt;
+
+ public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
+ super( in );
+ this.factory = factory;
+ this.termAtt = ( TermAttribute ) addAttribute( TermAttribute.class );
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ char[] termBuffer = termAtt.termBuffer();
+ int termBufferLength = termAtt.termLength();
+ char[] backup = null;
+ if ( factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT ) {
+ //make a backup in case we exceed the word count
+ backup = new char[termBufferLength];
+ System.arraycopy( termBuffer, 0, backup, 0, termBufferLength );
+ }
+ if ( termBufferLength < factory.maxTokenLength ) {
+ int wordCount = 0;
+
+ int lastWordStart = 0;
+ for ( int i = 0; i < termBufferLength; i++ ) {
+ char c = termBuffer[i];
+ if ( c <= ' ' || c == '.' ) {
+ int len = i - lastWordStart;
+ if ( len > 0 ) {
+ factory.processWord( termBuffer, lastWordStart, len, wordCount++ );
+ lastWordStart = i + 1;
+ i++;
+ }
+ }
+ }
+
+ // process the last word
+ if ( lastWordStart < termBufferLength ) {
+ factory.processWord( termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++ );
+ }
+
+ if ( wordCount > factory.maxWordCount ) {
+ termAtt.setTermBuffer( backup, 0, termBufferLength );
+ }
+ }
+
+ return true;
+ }
+
+}
+
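A minimal configuration sketch for the factory above; the argument values are hypothetical and mirror the javadoc. With the default onlyFirstWord=true, a keyword token "hello world" comes out as "Hello world":

    import java.io.StringReader;
    import java.util.HashMap;
    import java.util.Map;

    import org.apache.lucene.analysis.KeywordTokenizer;
    import org.apache.lucene.analysis.TokenStream;

    Map<String, String> args = new HashMap<String, String>();
    args.put( CapitalizationFilterFactory.KEEP, "and or" );             // keep-list words stay as-is
    args.put( CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true" );

    CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
    factory.init( args );

    // KeywordTokenizer emits the whole input as one token; the filter then splits
    // the term buffer on whitespace/'.' and calls processWord() once per word.
    TokenStream stream = factory.create( new KeywordTokenizer( new StringReader( "hello world" ) ) );
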
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CharFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CharFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CharFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ * @version $Id: CharFilterFactory.java 794328 2009-07-15 17:21:04Z shalin $
+ * @since Solr 1.4
+ */
+public interface CharFilterFactory {
+ public void init(Map<String, String> args);
+
+ public Map<String, String> getArgs();
+
+ public CharStream create(CharStream input);
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ChineseFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ChineseFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ChineseFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.ChineseFilter;
+
+public class ChineseFilterFactory extends BaseTokenFilterFactory {
+ public ChineseFilter create(TokenStream in) {
+ return new ChineseFilter( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ChineseTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ChineseTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ChineseTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.cn.ChineseTokenizer;
+
+public class ChineseTokenizerFactory extends BaseTokenizerFactory {
+ public ChineseTokenizer create(Reader in) {
+ return new ChineseTokenizer( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CollationKeyFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CollationKeyFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CollationKeyFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,172 @@
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.text.Collator;
+import java.text.ParseException;
+import java.text.RuleBasedCollator;
+import java.util.Locale;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.collation.CollationKeyFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Factory for {@link CollationKeyFilter}.
+ * <p/>
+ * This factory can be created in two ways:
+ * <ul>
+ * <li>Based upon a system collator associated with a Locale.
+ * <li>Based upon a tailored ruleset.
+ * </ul>
+ * <p/>
+ * Using a System collator:
+ * <ul>
+ * <li>language: ISO-639 language code (mandatory)
+ * <li>country: ISO-3166 country code (optional)
+ * <li>variant: vendor or browser-specific code (optional)
+ * <li>strength: 'primary', 'secondary', 'tertiary', or 'identical' (optional)
+ * <li>decomposition: 'no', 'canonical', or 'full' (optional)
+ * </ul>
+ * <p/>
+ * Using a Tailored ruleset:
+ * <ul>
+ * <li>custom: UTF-8 text file containing rules supported by RuleBasedCollator (mandatory)
+ * <li>strength: 'primary', 'secondary', 'tertiary', or 'identical' (optional)
+ * <li>decomposition: 'no', 'canonical', or 'full' (optional)
+ * </ul>
+ *
+ * @see Collator
+ * @see Locale
+ * @see RuleBasedCollator
+ * @since solr 1.5
+ */
+public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements
ResourceLoaderAware {
+ private Collator collator;
+
+ public void inform(ResourceLoader loader) {
+ String custom = args.get( "custom" );
+ String language = args.get( "language" );
+ String country = args.get( "country" );
+ String variant = args.get( "variant" );
+ String strength = args.get( "strength" );
+ String decomposition = args.get( "decomposition" );
+
+ if ( custom == null && language == null ) {
+ throw new SolrException( ErrorCode.SERVER_ERROR, "Either custom or language is
required." );
+ }
+
+ if ( custom != null &&
+ ( language != null || country != null || variant != null ) ) {
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR, "Cannot specify both language and custom. "
+ + "To tailor rules for a built-in language, see the javadocs for
RuleBasedCollator. "
+ + "Then save the entire customized ruleset to a file, and use with the custom
parameter"
+ );
+ }
+
+ if ( language != null ) {
+ // create from a system collator, based on Locale.
+ collator = createFromLocale( language, country, variant );
+ }
+ else {
+ // create from a custom ruleset
+ collator = createFromRules( custom, loader );
+ }
+
+ // set the strength flag, otherwise it will be the default.
+ if ( strength != null ) {
+ if ( strength.equalsIgnoreCase( "primary" ) ) {
+ collator.setStrength( Collator.PRIMARY );
+ }
+ else if ( strength.equalsIgnoreCase( "secondary" ) ) {
+ collator.setStrength( Collator.SECONDARY );
+ }
+ else if ( strength.equalsIgnoreCase( "tertiary" ) ) {
+ collator.setStrength( Collator.TERTIARY );
+ }
+ else if ( strength.equalsIgnoreCase( "identical" ) ) {
+ collator.setStrength( Collator.IDENTICAL );
+ }
+ else {
+ throw new SolrException( ErrorCode.SERVER_ERROR, "Invalid strength: " +
strength );
+ }
+ }
+
+ // set the decomposition flag, otherwise it will be the default.
+ if ( decomposition != null ) {
+ if ( decomposition.equalsIgnoreCase( "no" ) ) {
+ collator.setDecomposition( Collator.NO_DECOMPOSITION );
+ }
+ else if ( decomposition.equalsIgnoreCase( "canonical" ) ) {
+ collator.setDecomposition( Collator.CANONICAL_DECOMPOSITION );
+ }
+ else if ( decomposition.equalsIgnoreCase( "full" ) ) {
+ collator.setDecomposition( Collator.FULL_DECOMPOSITION );
+ }
+ else {
+ throw new SolrException( ErrorCode.SERVER_ERROR, "Invalid decomposition: "
+ decomposition );
+ }
+ }
+ }
+
+ public TokenStream create(TokenStream input) {
+ return new CollationKeyFilter( input, collator );
+ }
+
+ /*
+ * Create a locale from language, with optional country and variant.
+ * Then return the appropriate collator for the locale.
+ */
+
+ private Collator createFromLocale(String language, String country, String variant) {
+ Locale locale;
+
+ if ( language != null && country == null && variant != null ) {
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR,
+ "To specify variant, country is required"
+ );
+ }
+ else if ( language != null && country != null && variant != null ) {
+ locale = new Locale( language, country, variant );
+ }
+ else if ( language != null && country != null ) {
+ locale = new Locale( language, country );
+ }
+ else {
+ locale = new Locale( language );
+ }
+
+ return Collator.getInstance( locale );
+ }
+
+ /*
+ * Read custom rules from a file, and create a RuleBasedCollator
+ * The file cannot support comments, as # might be in the rules!
+ */
+
+ private Collator createFromRules(String fileName, ResourceLoader loader) {
+ InputStream input = null;
+ try {
+ input = loader.openResource( fileName );
+ String rules = IOUtils.toString( input, "UTF-8" );
+ return new RuleBasedCollator( rules );
+ }
+ catch ( IOException e ) {
+ // io error
+ throw new RuntimeException( e );
+ }
+ catch ( ParseException e ) {
+ // invalid rules
+ throw new RuntimeException( e );
+ }
+ finally {
+ IOUtils.closeQuietly( input );
+ }
+ }
+}
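A locale-based configuration sketch for the factory above; the argument values are hypothetical, and resourceLoader stands for whatever ResourceLoader the surrounding framework supplies:

    import java.util.HashMap;
    import java.util.Map;

    Map<String, String> args = new HashMap<String, String>();
    args.put( "language", "sv" );      // ISO-639 code, mandatory for the locale variant
    args.put( "country", "SE" );       // optional
    args.put( "strength", "primary" ); // optional: ignore case and accent differences

    CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
    factory.init( args );
    factory.inform( resourceLoader ); // builds the Swedish collator configured above
    // factory.create( tokenStream ) now replaces each term with its collation key
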
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,282 @@
+/*
+ * Licensed under the Apache License,
+ * Version 2.0 (the "License"); you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Version;
+
+/*
+ * TODO: Consider implementing https://issues.apache.org/jira/browse/LUCENE-1688 changes to stop list and associated constructors
+ */
+
+/**
+ * Construct bigrams for frequently occurring terms while indexing. Single terms
+ * are still indexed too, with bigrams overlaid. This is achieved through the
+ * use of {@link PositionIncrementAttribute#setPositionIncrement(int)}. Bigrams have a type
+ * of {@link #GRAM_TYPE}. Example:
+ * <ul>
+ * <li>input: "the quick brown fox"</li>
+ * <li>output: |"the","the-quick"|"brown"|"fox"|</li>
+ * <li>"the-quick" has a position increment of 0, so it is in the same position
+ * as "the"; "the-quick" has a term.type() of "gram"</li>
+ * <p/>
+ * </ul>
+ */
+
+/*
+ * Constructors and makeCommonSet based on similar code in StopFilter
+ */
+public final class CommonGramsFilter extends TokenFilter {
+
+ static final String GRAM_TYPE = "gram";
+ private static final char SEPARATOR = '_';
+
+ private final CharArraySet commonWords;
+
+ private final StringBuilder buffer = new StringBuilder();
+
+ private final TermAttribute termAttribute = addAttribute( TermAttribute.class );
+ private final OffsetAttribute offsetAttribute = addAttribute( OffsetAttribute.class );
+ private final TypeAttribute typeAttribute = addAttribute( TypeAttribute.class );
+ private final PositionIncrementAttribute posIncAttribute = addAttribute(
PositionIncrementAttribute.class );
+
+ private int lastStartOffset;
+ private boolean lastWasCommon;
+ private State savedState;
+
+ /**
+ * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead
+ */
+ public CommonGramsFilter(TokenStream input, Set<?> commonWords) {
+ this( Version.LUCENE_29, input, commonWords );
+ }
+
+ /**
+ * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead
+ */
+ public CommonGramsFilter(TokenStream input, Set<?> commonWords, boolean ignoreCase) {
+ this( Version.LUCENE_29, input, commonWords, ignoreCase );
+ }
+
+ /**
+ * Construct a token stream filtering the given input using a Set of common
+ * words to create bigrams. Outputs both unigrams with position increment and
+ * bigrams with position increment 0 type=gram where one or both of the words
+ * in a potential bigram are in the set of common words .
+ *
+ * @param input TokenStream input in filter chain
+ * @param commonWords The set of common words.
+ */
+ public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords) {
+ this( matchVersion, input, commonWords, false );
+ }
+
+ /**
+ * Construct a token stream filtering the given input using a Set of common
+ * words to create bigrams, case-sensitive if ignoreCase is false (unless Set
+ * is CharArraySet). If <code>commonWords</code> is an instance of
+ * {@link CharArraySet} (true if <code>makeCommonSet()</code> was used to
+ * construct the set) it will be directly used and <code>ignoreCase</code>
+ * will be ignored since <code>CharArraySet</code> directly controls case
+ * sensitivity.
+ * <p/>
+ * If <code>commonWords</code> is not an instance of {@link CharArraySet}, a
+ * new CharArraySet will be constructed and <code>ignoreCase</code> will be
+ * used to specify the case sensitivity of that set.
+ *
+ * @param input TokenStream input in filter chain.
+ * @param commonWords The set of common words.
+ * @param ignoreCase -Ignore case when constructing bigrams for common words.
+ */
+ public CommonGramsFilter(Version matchVersion, TokenStream input, Set<?> commonWords, boolean ignoreCase) {
+ super( input );
+ if ( commonWords instanceof CharArraySet ) {
+ this.commonWords = ( CharArraySet ) commonWords;
+ }
+ else {
+ this.commonWords = new CharArraySet( commonWords.size(), ignoreCase );
+ this.commonWords.addAll( commonWords );
+ }
+ }
+
+ /**
+ * Construct a token stream filtering the given input using an Array of common
+ * words to create bigrams.
+ *
+ * @param input Tokenstream in filter chain
+ * @param commonWords words to be used in constructing bigrams
+ *
+ * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set)} instead.
+ */
+ @Deprecated
+ public CommonGramsFilter(TokenStream input, String[] commonWords) {
+ this( input, commonWords, false );
+ }
+
+ /**
+ * Construct a token stream filtering the given input using an Array of common
+ * words to create bigrams and is case-sensitive if ignoreCase is false.
+ *
+ * @param input Tokenstream in filter chain
+ * @param commonWords words to be used in constructing bigrams
+ * @param ignoreCase -Ignore case when constructing bigrams for common words.
+ *
+ * @deprecated Use {@link #CommonGramsFilter(Version, TokenStream, Set, boolean)} instead.
+ */
+ @Deprecated
+ public CommonGramsFilter(TokenStream input, String[] commonWords, boolean ignoreCase) {
+ super( input );
+ this.commonWords = makeCommonSet( commonWords, ignoreCase );
+ }
+
+ /**
+ * Build a CharArraySet from an array of common words, appropriate for passing
+ * into the CommonGramsFilter constructor. This permits this commonWords
+ * construction to be cached once when an Analyzer is constructed.
+ *
+ * @param commonWords Array of common words which will be converted into the CharArraySet
+ *
+ * @return CharArraySet of the given words, appropriate for passing into the CommonGramsFilter constructor
+ *
+ * @see #makeCommonSet(java.lang.String[], boolean) passing false to ignoreCase
+ * @deprecated create a CharArraySet with CharArraySet instead
+ */
+ @Deprecated
+ public static CharArraySet makeCommonSet(String[] commonWords) {
+ return makeCommonSet( commonWords, false );
+ }
+
+ /**
+ * Build a CharArraySet from an array of common words, appropriate for passing
+ * into the CommonGramsFilter constructor,case-sensitive if ignoreCase is
+ * false.
+ *
+ * @param commonWords Array of common words which will be converted into the CharArraySet
+ * @param ignoreCase If true, all words are lower cased first.
+ *
+ * @return a Set containing the words
+ *
+ * @deprecated create a CharArraySet with CharArraySet instead
+ */
+ @Deprecated
+ public static CharArraySet makeCommonSet(String[] commonWords, boolean ignoreCase) {
+ CharArraySet commonSet = new CharArraySet( commonWords.length, ignoreCase );
+ commonSet.addAll( Arrays.asList( commonWords ) );
+ return commonSet;
+ }
+
+ /**
+ * Inserts bigrams for common words into a token stream. For each input token,
+ * output the token. If the token and/or the following token are in the list
+ * of common words also output a bigram with position increment 0 and
+ * type="gram"
+ * <p/>
+ * TODO:Consider adding an option to not emit unigram stopwords
+ * as in CDL XTF BigramStopFilter, CommonGramsQueryFilter would need to be
+ * changed to work with this.
+ * <p/>
+ * TODO: Consider optimizing for the case of three
+ * commongrams, i.e. "man of the year" normally produces 3 bigrams: "man-of",
+ * "of-the", "the-year", but with proper management of positions we could
+ * eliminate the middle bigram "of-the" and save a disk seek and a whole set of
+ * position lookups.
+ */
+ public boolean incrementToken() throws IOException {
+ // get the next piece of input
+ if ( savedState != null ) {
+ restoreState( savedState );
+ savedState = null;
+ saveTermBuffer();
+ return true;
+ }
+ else if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ /* We build n-grams before and after stopwords.
+ * When valid, the buffer always contains at least the separator.
+ * If it's empty, there is nothing before this stopword.
+ */
+ if ( lastWasCommon || ( isCommon() && buffer.length() > 0 ) ) {
+ savedState = captureState();
+ gramToken();
+ return true;
+ }
+
+ saveTermBuffer();
+ return true;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ lastWasCommon = false;
+ savedState = null;
+ buffer.setLength( 0 );
+ }
+
+ // ================================================= Helper Methods ================================================
+
+ /**
+ * Determines if the current token is a common term
+ *
+ * @return {@code true} if the current token is a common term, {@code false} otherwise
+ */
+ private boolean isCommon() {
+ return commonWords != null && commonWords.contains( termAttribute.termBuffer(), 0, termAttribute.termLength() );
+ }
+
+ /**
+ * Saves this information to form the left part of a gram
+ */
+ private void saveTermBuffer() {
+ buffer.setLength( 0 );
+ buffer.append( termAttribute.termBuffer(), 0, termAttribute.termLength() );
+ buffer.append( SEPARATOR );
+ lastStartOffset = offsetAttribute.startOffset();
+ lastWasCommon = isCommon();
+ }
+
+ /**
+ * Constructs a compound token.
+ */
+ private void gramToken() {
+ buffer.append( termAttribute.termBuffer(), 0, termAttribute.termLength() );
+ int endOffset = offsetAttribute.endOffset();
+
+ clearAttributes();
+
+ int length = buffer.length();
+ char termText[] = termAttribute.termBuffer();
+ if ( length > termText.length ) {
+ termText = termAttribute.resizeTermBuffer( length );
+ }
+
+ buffer.getChars( 0, length, termText, 0 );
+ termAttribute.setTermLength( length );
+ posIncAttribute.setPositionIncrement( 0 );
+ offsetAttribute.setOffset( lastStartOffset, endOffset );
+ typeAttribute.setType( GRAM_TYPE );
+ buffer.setLength( 0 );
+ }
+}
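
For reference, a minimal driver for the filter above; a sketch only, not part of the changeset, assuming the Lucene 2.9 attribute API and the four-argument (Version-taking) constructor that the factory below invokes. The example class name is made up, and the expected output in the comment assumes '_' for the filter's SEPARATOR constant.

import java.io.StringReader;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.CommonGramsFilter;

public class CommonGramsFilterSketch {
    public static void main(String[] args) throws Exception {
        CharArraySet commonWords = new CharArraySet( 16, true ); // ignoreCase = true
        commonWords.add( "the" );
        commonWords.add( "in" );

        TokenStream ts = new CommonGramsFilter(
                Version.LUCENE_29,
                new WhitespaceTokenizer( new StringReader( "the rain in spain" ) ),
                commonWords, true
        );
        TermAttribute term = ( TermAttribute ) ts.addAttribute( TermAttribute.class );
        TypeAttribute type = ( TypeAttribute ) ts.addAttribute( TypeAttribute.class );
        while ( ts.incrementToken() ) {
            // expected: the, the_rain (gram), rain, rain_in (gram), in, in_spain (gram), spain
            System.out.println( term.term() + " (" + type.type() + ")" );
        }
    }
}
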
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Constructs a CommonGramsFilter
+ */
+
+/*
+ * This is pretty close to a straight copy from StopFilterFactory
+ */
+public class CommonGramsFilterFactory extends BaseTokenFilterFactory implements
+ ResourceLoaderAware {
+
+ public void inform(ResourceLoader loader) {
+ String commonWordFiles = args.get( "words" );
+ ignoreCase = getBoolean( "ignoreCase", false );
+
+ if ( commonWordFiles != null ) {
+ try {
+ commonWords = getWordSet( loader, commonWordFiles, ignoreCase );
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ else {
+ commonWords = ( CharArraySet ) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+ }
+ }
+
+ // Force the use of a char array set, as it is the most performant, although this may break things if Lucene ever goes away from it. See SOLR-1095
+ private CharArraySet commonWords;
+ private boolean ignoreCase;
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+
+ public Set<?> getCommonWords() {
+ return commonWords;
+ }
+
+ public CommonGramsFilter create(TokenStream input) {
+ CommonGramsFilter commonGrams = new CommonGramsFilter( luceneMatchVersion, input, commonWords, ignoreCase );
+ return commonGrams;
+ }
+}
+
+
+
\ No newline at end of file
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsQueryFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsQueryFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsQueryFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+
+/**
+ * Wrap a CommonGramsFilter optimizing phrase queries by only returning single
+ * words when they are not a member of a bigram.
+ * <p/>
+ * Example:
+ * <ul>
+ * <li>query input to CommonGramsFilter: "the rain in spain falls mainly"
+ * <li>output of CommonGramsFilter/input to CommonGramsQueryFilter:
+ * |"the", "the-rain"|"rain", "rain-in"|"in", "in-spain"|"spain"|"falls"|"mainly"
+ * <li>output of CommonGramsQueryFilter: "the-rain", "rain-in", "in-spain",
+ * "falls", "mainly"
+ * </ul>
+ */
+
+/*
+ * TODO: When org.apache.solr.analysis.BufferedTokenStream is changed to use the
+ * 2.9 lucene TokenStream api, make necessary changes here.
+ * See: http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/apache/lucene/analysis/TokenStream.html
+ * and http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/package.html?revision=718798
+ */
+public class CommonGramsQueryFilter extends BufferedTokenStream {
+ private Token prev;
+
+ /**
+ * Constructor
+ *
+ * @param input must be a CommonGramsFilter!
+ */
+
+ public CommonGramsQueryFilter(CommonGramsFilter input) {
+ super( input );
+ prev = new Token();
+ }
+
+ public void reset() throws IOException {
+ super.reset();
+ prev = new Token();
+ }
+
+ /**
+ * Output bigrams whenever possible to optimize queries. Only output unigrams
+ * when they are not a member of a bigram. Example:
+ * <ul>
+ * <li>input: "the rain in spain falls mainly"
+ * <li>output:"the-rain", "rain-in" ,"in-spain",
"falls", "mainly"
+ */
+
+ public Token process(Token token) throws IOException {
+ Token next = peek( 1 );
+ /*
+ * Deal with last token (next=null when current token is the last word). The last
+ * token will be a unigram. If the previous token was a bigram, then we already
+ * output the last token as part of the bigram and should not additionally
+ * output the unigram. <p> Example: If the end of the input to the
+ * CommonGramsFilter is "...the plain" <ul> <li>current token = "plain"</li>
+ * <li>next token = null</li> <li>previous token = "the-plain" (bigram)</li>
+ * <li>Since the word "plain" was already output as part of the bigram we
+ * don't output it.</li> </ul> Example: If the end of the input to the
+ * CommonGramsFilter is "falls mainly" <ul> <li>current token = "mainly"</li>
+ * <li>next token = null</li> <li>previous token = "falls" (unigram)</li>
+ * <li>Since we haven't yet output the current token, we output it.</li> </ul>
+ */
+
+ // Deal with special case of last token
+ if ( next == null ) {
+ if ( prev == null ) {
+ // This is the first and only token i.e. one word query
+ return token;
+ }
+ if ( prev != null && prev.type() != "gram" ) {
+ // If previous token was a unigram, output the current token
+ return token;
+ }
+ else {
+ // If previous token was a bigram, we already output it and this token
+ // was output as part of the bigram so we are done.
+ return null;
+ }
+ }
+
+ /*
+ * Possible cases are:
+ *   |token |next
+ *  1|word  |gram
+ *  2|word  |word
+ * The CommonGramsFilter we are wrapping always outputs the unigram word prior
+ * to outputting an optional bigram: "the sound of" gets output as |"the",
+ * "the_sound"|"sound", "sound_of". For case 1 we consume the gram from the
+ * input stream and output it rather than the current token. This means that
+ * the call to super.next(), which reads a token from input and passes it on
+ * to this process method, will always get a token of type word.
+ */
+ if ( next != null && next.type() == "gram" ) {
+ // consume "next" token from list and output it
+ token = read();
+ // use this to clone the token because clone requires all these args but
+ // won't take the token.type
+ // see http://hudson.zones.apache.org/hudson/job/Lucene-trunk/javadoc//all/org/a...
+ prev.reinit(
+ token.termBuffer(), 0, token.termLength(), token.startOffset(),
+ token.endOffset(), token.type()
+ );
+ token.setPositionIncrement( 1 );
+ return token;
+ }
+
+ // if the next token is not a bigram, then output the token
+ // see note above regarding this method of copying token to prev
+ prev.reinit(
+ token.termBuffer(), 0, token.termLength(), token.startOffset(),
+ token.endOffset(), token.type()
+ );
+ assert token.type() == "word";
+ return token;
+ }
+}
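
At query time the same chain is wrapped as below; a sketch only, assuming the deprecated Token-based next() API that this copy of BufferedTokenStream still exposes (the class name is made up):

import java.io.StringReader;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.CommonGramsFilter;
import org.apache.solr.analysis.CommonGramsQueryFilter;

public class CommonGramsQuerySketch {
    public static void main(String[] args) throws Exception {
        CharArraySet commonWords = new CharArraySet( 16, true );
        commonWords.add( "the" );
        commonWords.add( "in" );

        CommonGramsQueryFilter ts = new CommonGramsQueryFilter(
                new CommonGramsFilter(
                        Version.LUCENE_29,
                        new WhitespaceTokenizer( new StringReader( "the rain in spain" ) ),
                        commonWords, true
                )
        );
        Token token;
        while ( ( token = ts.next() ) != null ) {
            // per the javadoc above, only grams survive where a word participates
            // in one: the_rain, rain_in, in_spain
            System.out.println( token.term() );
        }
    }
}
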
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * Construct CommonGramsQueryFilter
+ * <p/>
+ * This is pretty close to a straight copy from StopFilterFactory
+ */
+public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory
+ implements ResourceLoaderAware {
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ assureMatchVersion();
+ }
+
+ public void inform(ResourceLoader loader) {
+ String commonWordFiles = args.get( "words" );
+ ignoreCase = getBoolean( "ignoreCase", false );
+
+ if ( commonWordFiles != null ) {
+ try {
+ commonWords = getWordSet( loader, commonWordFiles, ignoreCase );
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ else {
+ commonWords = ( CharArraySet ) StopAnalyzer.ENGLISH_STOP_WORDS_SET;
+ }
+ }
+
+ // Force the use of a char array set, as it is the most performant, although
+ // this may break things if Lucene ever goes away from it. See SOLR-1095
+ private CharArraySet commonWords;
+
+ private boolean ignoreCase;
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+
+ public Set<?> getCommonWords() {
+ return commonWords;
+ }
+
+ /**
+ * Create a CommonGramsFilter and wrap it with a CommonGramsQueryFilter
+ */
+ public CommonGramsQueryFilter create(TokenStream input) {
+ CommonGramsFilter commonGrams = new CommonGramsFilter(
+ luceneMatchVersion, input, commonWords,
+ ignoreCase
+ );
+ CommonGramsQueryFilter commonGramsQuery = new CommonGramsQueryFilter(
+ commonGrams
+ );
+ return commonGramsQuery;
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,72 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
+import org.apache.lucene.analysis.payloads.FloatEncoder;
+import org.apache.lucene.analysis.payloads.IdentityEncoder;
+import org.apache.lucene.analysis.payloads.IntegerEncoder;
+import org.apache.lucene.analysis.payloads.PayloadEncoder;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+public class DelimitedPayloadTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ public static final String ENCODER_ATTR = "encoder";
+ public static final String DELIMITER_ATTR = "delimiter";
+
+ private PayloadEncoder encoder;
+ private char delimiter = '|';
+
+ public DelimitedPayloadTokenFilter create(TokenStream input) {
+ return new DelimitedPayloadTokenFilter( input, delimiter, encoder );
+ }
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ }
+
+ public void inform(ResourceLoader loader) {
+ String encoderClass = args.get( ENCODER_ATTR );
+ if ( encoderClass.equals( "float" ) ) {
+ encoder = new FloatEncoder();
+ }
+ else if ( encoderClass.equals( "integer" ) ) {
+ encoder = new IntegerEncoder();
+ }
+ else if ( encoderClass.equals( "identity" ) ) {
+ encoder = new IdentityEncoder();
+ }
+ else {
+ encoder = ( PayloadEncoder ) loader.newInstance( encoderClass );
+ }
+
+ String delim = args.get( DELIMITER_ATTR );
+ if ( delim != null ) {
+ if ( delim.length() == 1 ) {
+ delimiter = delim.charAt( 0 );
+ }
+ else {
+ throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "Delimiter must be one character only" );
+ }
+ }
+ }
+}
\ No newline at end of file
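
The delimiter/encoder pairing can be exercised directly against the Lucene payload classes imported above; a sketch only (PayloadHelper is the standard Lucene payload decoding helper; the class name is made up):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.payloads.DelimitedPayloadTokenFilter;
import org.apache.lucene.analysis.payloads.FloatEncoder;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class DelimitedPayloadSketch {
    public static void main(String[] args) throws Exception {
        // "word|payload" pairs; '|' matches the factory's default delimiter
        TokenStream ts = new DelimitedPayloadTokenFilter(
                new WhitespaceTokenizer( new StringReader( "hello|1.5 world|2.0" ) ),
                '|', new FloatEncoder()
        );
        TermAttribute term = ( TermAttribute ) ts.addAttribute( TermAttribute.class );
        PayloadAttribute payload = ( PayloadAttribute ) ts.addAttribute( PayloadAttribute.class );
        while ( ts.incrementToken() ) {
            float value = PayloadHelper.decodeFloat( payload.getPayload().getData() );
            System.out.println( term.term() + " -> " + value ); // hello -> 1.5, world -> 2.0
        }
    }
}

Note that inform() above dereferences args.get( ENCODER_ATTR ) unconditionally, so the "encoder" argument is effectively required.
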
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
+import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ private Set dictionary;
+ private String dictFile;
+ private int minWordSize;
+ private int minSubwordSize;
+ private int maxSubwordSize;
+ private boolean onlyLongestMatch;
+
+ public void init(Map<String, String> args) {
+ super.init( args );
+ dictFile = args.get( "dictionary" );
+ if ( null == dictFile ) {
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR,
+ "Missing required parameter: dictionary"
+ );
+ }
+
+ minWordSize = getInt( "minWordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE );
+ minSubwordSize = getInt( "minSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE );
+ maxSubwordSize = getInt( "maxSubwordSize", CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE );
+ onlyLongestMatch = getBoolean( "onlyLongestMatch", true );
+ }
+
+ public void inform(ResourceLoader loader) {
+ try {
+ List<String> wlist = loader.getLines( dictFile );
+ dictionary = StopFilter.makeStopSet( ( String[] ) wlist.toArray( new String[0] ), false );
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+
+ public DictionaryCompoundWordTokenFilter create(TokenStream input) {
+ return new DictionaryCompoundWordTokenFilter(
+ input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch
+ );
+ }
+}
+
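
A sketch of the decompounding filter this factory builds, with the dictionary assembled the same way inform() does (the dictionary words and class name are made up; the size arguments are illustrative):

import java.io.StringReader;
import java.util.Set;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class CompoundWordSketch {
    public static void main(String[] args) throws Exception {
        Set dictionary = StopFilter.makeStopSet(
                new String[] { "donau", "dampf", "schiff" }, false
        );
        TokenStream ts = new DictionaryCompoundWordTokenFilter(
                new WhitespaceTokenizer( new StringReader( "donaudampfschiff" ) ),
                dictionary, 5, 2, 15, false
        );
        TermAttribute term = ( TermAttribute ) ts.addAttribute( TermAttribute.class );
        while ( ts.incrementToken() ) {
            // expected: the compound itself plus each dictionary subword it contains:
            // donaudampfschiff, donau, dampf, schiff
            System.out.println( term.term() );
        }
    }
}
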
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DoubleMetaphoneFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+public class DoubleMetaphoneFilter extends TokenFilter {
+
+ private static final String TOKEN_TYPE = "DoubleMetaphone";
+
+ private final LinkedList<State> remainingTokens = new LinkedList<State>();
+ private final DoubleMetaphone encoder = new DoubleMetaphone();
+ private final boolean inject;
+ private final TermAttribute termAtt;
+ private final PositionIncrementAttribute posAtt;
+
+ protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
+ super( input );
+ this.encoder.setMaxCodeLen( maxCodeLength );
+ this.inject = inject;
+ this.termAtt = ( TermAttribute ) addAttribute( TermAttribute.class );
+ this.posAtt = ( PositionIncrementAttribute ) addAttribute( PositionIncrementAttribute.class );
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ for (; ; ) {
+
+ if ( !remainingTokens.isEmpty() ) {
+ // clearAttributes(); // not currently necessary
+ restoreState( remainingTokens.removeFirst() );
+ return true;
+ }
+
+ if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ int len = termAtt.termLength();
+ if ( len == 0 ) {
+ return true;
+ } // pass through zero length terms
+
+ int firstAlternativeIncrement = inject ? 0 : posAtt.getPositionIncrement();
+
+ String v = new String( termAtt.termBuffer(), 0, len );
+ String primaryPhoneticValue = encoder.doubleMetaphone( v );
+ String alternatePhoneticValue = encoder.doubleMetaphone( v, true );
+
+ // a flag to lazily save state if needed... this avoids a save/restore when only
+ // one token will be generated.
+ boolean saveState = inject;
+
+ if ( primaryPhoneticValue != null && primaryPhoneticValue.length() > 0
+ && !primaryPhoneticValue.equals( v ) ) {
+ if ( saveState ) {
+ remainingTokens.addLast( captureState() );
+ }
+ posAtt.setPositionIncrement( firstAlternativeIncrement );
+ firstAlternativeIncrement = 0;
+ termAtt.setTermBuffer( primaryPhoneticValue );
+ saveState = true;
+ }
+
+ if ( alternatePhoneticValue != null && alternatePhoneticValue.length() > 0
+ && !alternatePhoneticValue.equals( primaryPhoneticValue )
+ && !primaryPhoneticValue.equals( v ) ) {
+ if ( saveState ) {
+ remainingTokens.addLast( captureState() );
+ saveState = false;
+ }
+ posAtt.setPositionIncrement( firstAlternativeIncrement );
+ termAtt.setTermBuffer( alternatePhoneticValue );
+ saveState = true;
+ }
+
+ // Just one token to return, so no need to capture/restore
+ // any state, simply return it.
+ if ( remainingTokens.isEmpty() ) {
+ return true;
+ }
+
+ if ( saveState ) {
+ remainingTokens.addLast( captureState() );
+ }
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ remainingTokens.clear();
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+
+public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory {
+ public static final String INJECT = "inject";
+ public static final String MAX_CODE_LENGTH = "maxCodeLength";
+
+ public static final int DEFAULT_MAX_CODE_LENGTH = 4;
+
+ private boolean inject = true;
+ private int maxCodeLength = DEFAULT_MAX_CODE_LENGTH;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+
+ inject = getBoolean( INJECT, true );
+
+ if ( args.get( MAX_CODE_LENGTH ) != null ) {
+ maxCodeLength = Integer.parseInt( args.get( MAX_CODE_LENGTH ) );
+ }
+ }
+
+ public DoubleMetaphoneFilter create(TokenStream input) {
+ return new DoubleMetaphoneFilter( input, maxCodeLength, inject );
+ }
+}
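
Since the filter's constructor above is protected, the factory is the natural entry point; a sketch only, assuming init() needs nothing beyond the two documented arguments (the class name is made up; codes per commons-codec's DoubleMetaphone):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.DoubleMetaphoneFilterFactory;

public class DoubleMetaphoneSketch {
    public static void main(String[] args) throws Exception {
        DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
        Map<String, String> init = new HashMap<String, String>();
        init.put( "inject", "true" );        // keep original tokens alongside the codes
        init.put( "maxCodeLength", "4" );
        factory.init( init );

        TokenStream ts = factory.create(
                new WhitespaceTokenizer( new StringReader( "smith" ) )
        );
        TermAttribute term = ( TermAttribute ) ts.addAttribute( TermAttribute.class );
        while ( ts.incrementToken() ) {
            // with inject=true the original precedes its codes, e.g. smith, SM0, XMT
            System.out.println( term.term() );
        }
    }
}
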
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DutchStemFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DutchStemFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/DutchStemFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.nl.DutchStemFilter;
+
+public class DutchStemFilterFactory extends BaseTokenFilterFactory {
+ public DutchStemFilter create(TokenStream _in) {
+ return new DutchStemFilter( _in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,55 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
+
+/**
+ * Creates new instances of {@link EdgeNGramTokenFilter}.
+ */
+public class EdgeNGramFilterFactory extends BaseTokenFilterFactory {
+ private int maxGramSize = 0;
+
+ private int minGramSize = 0;
+
+ private String side;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ String maxArg = args.get( "maxGramSize" );
+ maxGramSize = ( maxArg != null ? Integer.parseInt( maxArg )
+ : EdgeNGramTokenFilter.DEFAULT_MAX_GRAM_SIZE );
+
+ String minArg = args.get( "minGramSize" );
+ minGramSize = ( minArg != null ? Integer.parseInt( minArg )
+ : EdgeNGramTokenFilter.DEFAULT_MIN_GRAM_SIZE );
+
+ side = args.get( "side" );
+ if ( side == null ) {
+ side = EdgeNGramTokenFilter.Side.FRONT.getLabel();
+ }
+ }
+
+ public EdgeNGramTokenFilter create(TokenStream input) {
+ return new EdgeNGramTokenFilter( input, side, minGramSize, maxGramSize );
+ }
+}
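
A sketch of what the filter produces for a typical configuration (the class name is made up; "front", 1 and 3 stand in for the side/minGramSize/maxGramSize init args):

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class EdgeNGramSketch {
    public static void main(String[] args) throws Exception {
        TokenStream ts = new EdgeNGramTokenFilter(
                new WhitespaceTokenizer( new StringReader( "hello" ) ),
                "front", 1, 3
        );
        TermAttribute term = ( TermAttribute ) ts.addAttribute( TermAttribute.class );
        while ( ts.incrementToken() ) {
            System.out.println( term.term() ); // h, he, hel
        }
    }
}
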
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,53 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.Map;
+
+import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
+
+/**
+ * Creates new instances of {@link EdgeNGramTokenizer}.
+ */
+public class EdgeNGramTokenizerFactory extends BaseTokenizerFactory {
+ private int maxGramSize = 0;
+
+ private int minGramSize = 0;
+
+ private String side;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ String maxArg = args.get( "maxGramSize" );
+ maxGramSize = ( maxArg != null ? Integer.parseInt( maxArg ) : EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE );
+
+ String minArg = args.get( "minGramSize" );
+ minGramSize = ( minArg != null ? Integer.parseInt( minArg ) : EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE );
+
+ side = args.get( "side" );
+ if ( side == null ) {
+ side = EdgeNGramTokenizer.Side.FRONT.getLabel();
+ }
+ }
+
+ public EdgeNGramTokenizer create(Reader input) {
+ return new EdgeNGramTokenizer( input, side, minGramSize, maxGramSize );
+ }
+}
Property changes on: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java
___________________________________________________________________
Name: svn:executable
+ *
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ElisionFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ElisionFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ElisionFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.fr.ElisionFilter;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+
+ private Set articles;
+
+ public void inform(ResourceLoader loader) {
+ String articlesFile = args.get( "articles" );
+
+ if ( articlesFile != null ) {
+ try {
+ List<String> wlist = loader.getLines( articlesFile );
+ articles = StopFilter.makeStopSet( ( String[] ) wlist.toArray( new String[0] ), false );
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ else {
+ throw new RuntimeException( "No articles specified for ElisionFilterFactory" );
+ }
+ }
+
+ public ElisionFilter create(TokenStream input) {
+ return new ElisionFilter( input, articles );
+ }
+}
+
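
A sketch of the elision behaviour this factory configures, with the articles set built the same way inform() does (the article list and class name are made up):

import java.io.StringReader;
import java.util.Set;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class ElisionSketch {
    public static void main(String[] args) throws Exception {
        Set articles = StopFilter.makeStopSet( new String[] { "l", "d" }, false );
        TokenStream ts = new ElisionFilter(
                new WhitespaceTokenizer( new StringReader( "l'avion d'eau" ) ),
                articles
        );
        TermAttribute term = ( TermAttribute ) ts.addAttribute( TermAttribute.class );
        while ( ts.incrementToken() ) {
            System.out.println( term.term() ); // avion, eau
        }
    }
}
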
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/EnglishPorterFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * @version $Id: EnglishPorterFilterFactory.java 804726 2009-08-16 17:28:58Z yonik $
+ * @deprecated Use SnowballPorterFilterFactory with language="English" instead
+ */
+public class EnglishPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ public static final String PROTECTED_TOKENS = "protected";
+
+ public void inform(ResourceLoader loader) {
+ String wordFiles = args.get( PROTECTED_TOKENS );
+ if ( wordFiles != null ) {
+ try {
+ File protectedWordFiles = new File( wordFiles );
+ if ( protectedWordFiles.exists() ) {
+ List<String> wlist = loader.getLines( wordFiles );
+ //This cast is safe in Lucene
+ protectedWords = new CharArraySet( wlist, false );
+ //No need to go through StopFilter as before, since it just uses a List internally
+ }
+ else {
+ List<String> files = StrUtils.splitFileNames( wordFiles );
+ for ( String file : files ) {
+ List<String> wlist = loader.getLines( file.trim() );
+ if ( protectedWords == null ) {
+ protectedWords = new CharArraySet( wlist, false );
+ }
+ else {
+ protectedWords.addAll( wlist );
+ }
+ }
+ }
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ }
+
+ private CharArraySet protectedWords = null;
+
+ public EnglishPorterFilter create(TokenStream input) {
+ return new EnglishPorterFilter( input, protectedWords );
+ }
+
+}
+
+
+/**
+ * English Porter2 filter that doesn't use reflection to
+ * adapt lucene to the snowball stemmer code.
+ */
+@Deprecated
+class EnglishPorterFilter extends SnowballPorterFilter {
+ public EnglishPorterFilter(TokenStream source, CharArraySet protWords) {
+ super( source, new org.tartarus.snowball.ext.EnglishStemmer(), protWords );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/FrenchStemFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/FrenchStemFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/FrenchStemFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.fr.FrenchStemFilter;
+
+public class FrenchStemFilterFactory extends BaseTokenFilterFactory {
+ public FrenchStemFilter create(TokenStream in) {
+ return new FrenchStemFilter( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/GermanStemFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/GermanStemFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/GermanStemFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.de.GermanStemFilter;
+
+public class GermanStemFilterFactory extends BaseTokenFilterFactory {
+ public GermanStemFilter create(TokenStream in) {
+ return new GermanStemFilter( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,50 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.el.GreekLowerCaseFilter;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
+/**
+ * Factory for {@link GreekLowerCaseFilter}
+ */
+public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory {
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ assureMatchVersion();
+ if ( args.containsKey( "charset" ) ) {
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR,
+ "The charset parameter is no longer supported. "
+ + "Please process your documents as Unicode instead."
+ );
+ }
+ }
+
+ public GreekLowerCaseFilter create(TokenStream in) {
+ return new GreekLowerCaseFilter( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripCharFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripCharFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripCharFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,1957 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashMap;
+import java.util.Set;
+
+import org.apache.lucene.analysis.BaseCharFilter;
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
+ *
+ * @version $Id: HTMLStripCharFilter.java 826299 2009-10-17 19:56:01Z yonik $
+ */
+public class HTMLStripCharFilter extends BaseCharFilter {
+ private int readAheadLimit = DEFAULT_READ_AHEAD;
+ private int safeReadAheadLimit = readAheadLimit - 3;
+ private int numWhitespace = 0;
+ private int numRead = 0;
+ private int numEaten = 0;
+ private int numReturned = 0;
+ private int lastMark;
+ private Set<String> escapedTags;
+
+ // pushback buffer
+ private final StringBuilder pushed = new StringBuilder();
+ private static final int EOF = -1;
+ private static final int MISMATCH = -2;
+
+ private static final int MATCH = -3;
+ // temporary buffer
+ private final StringBuilder sb = new StringBuilder();
+ public static final int DEFAULT_READ_AHEAD = 8192;
+
+
+ public static void main(String[] args) throws IOException {
+ Reader in = new HTMLStripCharFilter(
+ CharReader.get( new InputStreamReader( System.in ) )
+ );
+ int ch;
+ while ( ( ch = in.read() ) != -1 ) {
+ System.out.print( ( char ) ch );
+ }
+ }
+
+ public HTMLStripCharFilter(CharStream source) {
+ super( source.markSupported() ? source : CharReader.get( new BufferedReader( source ) ) );
+ }
+
+ public HTMLStripCharFilter(CharStream source, Set<String> escapedTags) {
+ this( source );
+ this.escapedTags = escapedTags;
+ }
+
+ public HTMLStripCharFilter(CharStream source, Set<String> escapedTags, int readAheadLimit) {
+ this( source );
+ this.escapedTags = escapedTags;
+ this.readAheadLimit = readAheadLimit;
+ safeReadAheadLimit = readAheadLimit - 3;
+ }
+
+ public int getReadAheadLimit() {
+ return readAheadLimit;
+ }
+
+ private int next() throws IOException {
+ int len = pushed.length();
+ if ( len > 0 ) {
+ int ch = pushed.charAt( len - 1 );
+ pushed.setLength( len - 1 );
+ return ch;
+ }
+ numRead++;
+ return input.read();
+ }
+
+ private int nextSkipWS() throws IOException {
+ int ch = next();
+ while ( isSpace( ch ) ) {
+ ch = next();
+ }
+ return ch;
+ }
+
+ private int peek() throws IOException {
+ int len = pushed.length();
+ if ( len > 0 ) {
+ return pushed.charAt( len - 1 );
+ }
+ int ch = input.read();
+ push( ch );
+ return ch;
+ }
+
+ private void push(int ch) {
+ pushed.append( ( char ) ch );
+ }
+
+
+ private boolean isSpace(int ch) {
+ switch ( ch ) {
+ case ' ':
+ case '\n':
+ case '\r':
+ case '\t':
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ private boolean isHex(int ch) {
+ return ( ch >= '0' && ch <= '9' ) ||
+ ( ch >= 'A' && ch <= 'F' ) ||
+ ( ch >= 'a' && ch <= 'f' );
+ }
+
+ private boolean isAlpha(int ch) {
+ return ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z';
+ }
+
+ private boolean isDigit(int ch) {
+ return ch >= '0' && ch <= '9';
+ }
+
+ /**
+ * From HTML 4.0
+ * [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
+ * [5] Name ::= (Letter | '_' | ':') (NameChar)*
+ * [6] Names ::= Name (#x20 Name)*
+ * [7] Nmtoken ::= (NameChar)+
+ * [8] Nmtokens ::= Nmtoken (#x20 Nmtoken)*
+ */
+
+ // should I include all id chars allowable by HTML/XML here?
+ // including accented chars, ':', etc?
+ private boolean isIdChar(int ch) {
+ // return Character.isUnicodeIdentifierPart(ch);
+ // isUnicodeIdentifierPart doesn't include '-'... should I still
+ // use it and add in '-',':',etc?
+ return isAlpha( ch ) || isDigit( ch ) || ch == '.' ||
+ ch == '-' || ch == '_' || ch == ':'
+ || Character.isLetter( ch );
+
+ }
+
+ private boolean isFirstIdChar(int ch) {
+ return Character.isUnicodeIdentifierStart( ch );
+ // return isAlpha(ch) || ch=='_' || Character.isLetter(ch);
+ }
+
+
+ private void saveState() throws IOException {
+ lastMark = numRead;
+ input.mark( readAheadLimit );
+ }
+
+ private void restoreState() throws IOException {
+ input.reset();
+ pushed.setLength( 0 );
+ }
+
+ private int readNumericEntity() throws IOException {
+ // "&#" has already been read at this point
+ int eaten = 2;
+
+ // is this decimal, hex, or nothing at all.
+ int ch = next();
+ int base = 10;
+ boolean invalid = false;
+ sb.setLength( 0 );
+
+ if ( isDigit( ch ) ) {
+ // decimal character entity
+ sb.append( ( char ) ch );
+ for ( int i = 0; i < 10; i++ ) {
+ ch = next();
+ if ( isDigit( ch ) ) {
+ sb.append( ( char ) ch );
+ }
+ else {
+ break;
+ }
+ }
+ }
+ else if ( ch == 'x' ) {
+ eaten++;
+ // hex character entity
+ base = 16;
+ sb.setLength( 0 );
+ for ( int i = 0; i < 10; i++ ) {
+ ch = next();
+ if ( isHex( ch ) ) {
+ sb.append( ( char ) ch );
+ }
+ else {
+ break;
+ }
+ }
+ }
+ else {
+ return MISMATCH;
+ }
+
+
+ // In older HTML, an entity may not have always been terminated
+ // with a semicolon. We'll also treat EOF or whitespace as terminating
+ // the entity.
+ try {
+ if ( ch == ';' || ch == -1 ) {
+ // do not account for the eaten ";" due to the fact that we do output a char
+ numWhitespace = sb.length() + eaten;
+ return Integer.parseInt( sb.toString(), base );
+ }
+
+ // if whitespace terminated the entity, we need to return
+ // that whitespace on the next call to read().
+ if ( isSpace( ch ) ) {
+ push( ch );
+ numWhitespace = sb.length() + eaten;
+ return Integer.parseInt( sb.toString(), base );
+ }
+ }
+ catch ( NumberFormatException e ) {
+ return MISMATCH;
+ }
+
+ // Not an entity...
+ return MISMATCH;
+ }
+
+ private int readEntity() throws IOException {
+ int ch = next();
+ if ( ch == '#' ) {
+ return readNumericEntity();
+ }
+
+ //read an entity reference
+
+ // for an entity reference, require the ';' for safety.
+ // otherwise we may try and convert part of some company
+ // names to an entity. "Alpha&Beta Corp" for instance.
+ //
+ // TODO: perhaps I should special case some of the
+ // more common ones like &amp; to make the ';' optional...
+
+ sb.setLength( 0 );
+ sb.append( ( char ) ch );
+
+ for ( int i = 0; i < safeReadAheadLimit; i++ ) {
+ ch = next();
+ if ( Character.isLetter( ch ) ) {
+ sb.append( ( char ) ch );
+ }
+ else {
+ break;
+ }
+ }
+
+ if ( ch == ';' ) {
+ String entity = sb.toString();
+ Character entityChar = entityTable.get( entity );
+ if ( entityChar != null ) {
+ numWhitespace = entity.length() + 1;
+ return entityChar.charValue();
+ }
+ }
+
+ return MISMATCH;
+ }
+
+ /**
+ * valid comments according to HTML specs
+ * <!-- Hello -->
+ * <!-- Hello -- -- Hello-->
+ * <!---->
+ * <!------ Hello -->
+ * <!>
+ * <!------> Hello -->
+ * <p/>
+ * #comments inside of an entity decl:
+ * <!ENTITY amp CDATA "&" -- ampersand, U+0026 ISOnum -->
+ * <p/>
+ * Turns out, IE & mozilla don't parse comments correctly.
+ * Since this is meant to be a practical stripper, I'll just
+ * try and duplicate what the browsers do.
+ * <p/>
+ * <!-- (stuff_including_markup)* -->
+ * <!FOO (stuff, not including markup) >
+ * <! (stuff, not including markup)* >
+ * <p/>
+ * <p/>
+ */
+
+ private int readBang(boolean inScript) throws IOException {
+ // at this point, "<!" has been read
+ int ret = readComment( inScript );
+ if ( ret == MATCH ) {
+ return MATCH;
+ }
+
+ if ( ( numRead - lastMark ) < safeReadAheadLimit || peek() == '>' ) {
+
+ int ch = next();
+ if ( ch == '>' ) {
+ return MATCH;
+ }
+
+ // if it starts with <! and isn't a comment,
+ // simply read until ">"
+ //since we did readComment already, it may be the case that we are already deep into the read ahead buffer
+ //so, we may need to abort sooner
+ while ( ( numRead - lastMark ) < safeReadAheadLimit ) {
+ ch = next();
+ if ( ch == '>' ) {
+ return MATCH;
+ }
+ else if ( ch < 0 ) {
+ return MISMATCH;
+ }
+ }
+ }
+ return MISMATCH;
+ }
+
+ // tries to read comments the way browsers do, not
+ // strictly by the standards.
+ //
+ // GRRRR. it turns out that in the wild, a <script> can have a HTML comment
+ // that contains a script that contains a quoted comment.
+ // <script><!-- document.write("<!--embedded comment-->") --></script>
+ //
+
+ private int readComment(boolean inScript) throws IOException {
+ // at this point "<!" has been read
+ int ch = next();
+ if ( ch != '-' ) {
+ // not a comment
+ push( ch );
+ return MISMATCH;
+ }
+
+ ch = next();
+ if ( ch != '-' ) {
+ // not a comment
+ push( ch );
+ push( '-' );
+ return MISMATCH;
+ }
+ /*two extra calls to next() here, so make sure we don't read past our mark*/
+ while ( ( numRead - lastMark ) < safeReadAheadLimit - 3 ) {
+ ch = next();
+ if ( ch < 0 ) {
+ return MISMATCH;
+ }
+ if ( ch == '-' ) {
+ ch = next();
+ if ( ch < 0 ) {
+ return MISMATCH;
+ }
+ if ( ch != '-' ) {
+ push( ch );
+ continue;
+ }
+
+ ch = next();
+ if ( ch < 0 ) {
+ return MISMATCH;
+ }
+ if ( ch != '>' ) {
+ push( ch );
+ push( '-' );
+ continue;
+ }
+
+ return MATCH;
+ }
+ else if ( ( ch == '\'' || ch == '"' ) && inScript ) {
+ push( ch );
+ int ret = readScriptString();
+ // if this wasn't a string, there's not much we can do
+ // at this point without having a stack of stream states in
+ // order to "undo" just the latest.
+ }
+ else if ( ch == '<' ) {
+ eatSSI();
+ }
+
+ }
+ return MISMATCH;
+
+ }
+
+
+ private int readTag() throws IOException {
+ // at this point '<' has already been read
+ int ch = next();
+ if ( !isAlpha( ch ) ) {
+ push( ch );
+ return MISMATCH;
+ }
+
+ sb.setLength( 0 );
+ sb.append( ( char ) ch );
+ while ( ( numRead - lastMark ) < safeReadAheadLimit ) {
+
+ ch = next();
+ if ( isIdChar( ch ) ) {
+ sb.append( ( char ) ch );
+ }
+ else if ( ch == '/' ) {
+ // Hmmm, a tag can close with "/>" as well as "/ >"
+ // read end tag '/>' or '/ >', etc
+ return nextSkipWS() == '>' ? MATCH : MISMATCH;
+ }
+ else {
+ break;
+ }
+ }
+ if ( escapedTags != null && escapedTags.contains( sb.toString() ) ) {
+ //if this is a reservedTag, then keep it
+ return MISMATCH;
+ }
+ // After the tag id, there needs to be either whitespace or
+ // '>'
+ if ( !( ch == '>' || isSpace( ch ) ) ) {
+ return MISMATCH;
+ }
+
+ if ( ch != '>' ) {
+ // process attributes
+ while ( ( numRead - lastMark ) < safeReadAheadLimit ) {
+ ch = next();
+ if ( isSpace( ch ) ) {
+ continue;
+ }
+ else if ( isFirstIdChar( ch ) ) {
+ push( ch );
+ int ret = readAttr2();
+ if ( ret == MISMATCH ) {
+ return ret;
+ }
+ }
+ else if ( ch == '/' ) {
+ // read end tag '/>' or '/ >', etc
+ return nextSkipWS() == '>' ? MATCH : MISMATCH;
+ }
+ else if ( ch == '>' ) {
+ break;
+ }
+ else {
+ return MISMATCH;
+ }
+
+ }
+ if ( ( numRead - lastMark ) >= safeReadAheadLimit ) {
+ return MISMATCH;//exit out if we exceeded the buffer
+ }
+ }
+
+ // We only get to this point after we have read the
+ // entire tag. Now let's see if it's a special tag.
+ String name = sb.toString();
+ if ( name.equalsIgnoreCase( "script" ) || name.equalsIgnoreCase( "style" ) ) {
+ // The content of script and style elements is
+ // CDATA in HTML 4 but PCDATA in XHTML.
+
+ /* From HTML4:
+ Although the STYLE and SCRIPT elements use CDATA for their data model,
+ for these elements, CDATA must be handled differently by user agents.
+ Markup and entities must be treated as raw text and passed to the application
+ as is. The first occurrence of the character sequence "</" (end-tag open
+ delimiter) is treated as terminating the end of the element's content. In
+ valid documents, this would be the end tag for the element.
+ */
+
+ // discard everything until endtag is hit (except
+ // if it occurs in a comment.
+
+ // reset the stream mark to here, since we know that we successfully matched
+ // a tag, and if we can't find the end tag, this is where we will want
+ // to roll back to.
+ saveState();
+ pushed.setLength( 0 );
+ return findEndTag();
+ }
+ return MATCH;
+ }
+
+
+ // find an end tag, but beware of comments...
+ // <script><!-- </script> -->foo</script>
+ // beware markup in script strings: </script>...document.write("</script>")foo</script>
+ // TODO: do I need to worry about CDATA sections "<![CDATA[" ?
+
+ int findEndTag() throws IOException {
+
+ while ( ( numRead - lastMark ) < safeReadAheadLimit ) {
+ int ch = next();
+ if ( ch == '<' ) {
+ ch = next();
+ // skip looking for end-tag in comments
+ if ( ch == '!' ) {
+ int ret = readBang( true );
+ if ( ret == MATCH ) {
+ continue;
+ }
+ // yikes... what now? It wasn't a comment, but I can't get
+ // back to the state I was at. Just continue from where I
+ // am I guess...
+ continue;
+ }
+ // did we match "</"
+ if ( ch != '/' ) {
+ push( ch );
+ continue;
+ }
+ int ret = readName( false );
+ if ( ret == MISMATCH ) {
+ return MISMATCH;
+ }
+ ch = nextSkipWS();
+ if ( ch != '>' ) {
+ return MISMATCH;
+ }
+ return MATCH;
+ }
+ else if ( ch == '\'' || ch == '"' ) {
+ // read javascript string to avoid a false match.
+ push( ch );
+ int ret = readScriptString();
+ // what to do about a non-match (non-terminated string?)
+ // play it safe and index the rest of the data I guess...
+ if ( ret == MISMATCH ) {
+ return MISMATCH;
+ }
+ }
+ else if ( ch < 0 ) {
+ return MISMATCH;
+ }
+
+ }
+ return MISMATCH;
+ }
+
+
+ // read a string escaped by backslashes
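+ // (for example, the javascript literals 'it\'s' and "say \"hi\"" are each
+ // consumed as a single string; the examples are illustrative, not from the original)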
+
+ private int readScriptString() throws IOException {
+ int quoteChar = next();
+ if ( quoteChar != '\'' && quoteChar != '"' ) {
+ return MISMATCH;
+ }
+
+ while ( ( numRead - lastMark ) < safeReadAheadLimit ) {
+ int ch = next();
+ if ( ch == quoteChar ) {
+ return MATCH;
+ }
+ else if ( ch == '\\' ) {
+ ch = next();
+ }
+ else if ( ch < 0 ) {
+ return MISMATCH;
+ }
+ else if ( ch == '<' ) {
+ eatSSI();
+ }
+
+ }
+ return MISMATCH;
+ }
+
+
+ private int readName(boolean checkEscaped) throws IOException {
+ StringBuilder builder = ( checkEscaped && escapedTags != null ) ? new StringBuilder() : null;
+ int ch = next();
+ if ( builder != null ) {
+ builder.append( ( char ) ch );
+ }
+ if ( !isFirstIdChar( ch ) ) {
+ return MISMATCH;
+ }
+ ch = next();
+ if ( builder != null ) {
+ builder.append( ( char ) ch );
+ }
+ while ( isIdChar( ch ) ) {
+ ch = next();
+ if ( builder != null ) {
+ builder.append( ( char ) ch );
+ }
+ }
+ if ( ch != -1 ) {
+ push( ch );
+
+ }
+ //strip off the trailing >
+ if ( builder != null && escapedTags.contains( builder.substring( 0, builder.length() - 1 ) ) ) {
+ return MISMATCH;
+ }
+ return MATCH;
+ }
+
+ /**
+ * [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
+ *                 | "'" ([^<&'] | Reference)* "'"
+ * <p/>
+ * need to also handle unquoted attributes, and attributes w/o values:
+ * <td id=msviGlobalToolbar height="22" nowrap align=left>
+ * <p/>
+ */
+
+ // This reads attributes and attempts to handle any
+ // embedded server side includes that would otherwise
+ // mess up the quote handling.
+ // <a href="a/<!--#echo "path"-->">
+ private int readAttr2() throws IOException {
+ if ( ( numRead - lastMark < safeReadAheadLimit ) ) {
+ int ch = next();
+ if ( !isFirstIdChar( ch ) ) {
+ return MISMATCH;
+ }
+ ch = next();
+ while ( isIdChar( ch ) && ( ( numRead - lastMark ) < safeReadAheadLimit ) ) {
+ ch = next();
+ }
+ if ( isSpace( ch ) ) {
+ ch = nextSkipWS();
+ }
+
+ // attributes may not have a value at all!
+ // if (ch != '=') return MISMATCH;
+ if ( ch != '=' ) {
+ push( ch );
+ return MATCH;
+ }
+
+ int quoteChar = nextSkipWS();
+
+ if ( quoteChar == '"' || quoteChar == '\'' ) {
+ while ( ( numRead - lastMark ) < safeReadAheadLimit ) {
+ ch = next();
+ if ( ch < 0 ) {
+ return MISMATCH;
+ }
+ else if ( ch == '<' ) {
+ eatSSI();
+ }
+ else if ( ch == quoteChar ) {
+ return MATCH;
+ //} else if (ch=='<') {
+ // return MISMATCH;
+ }
+
+ }
+ }
+ else {
+ // unquoted attribute
+ while ( ( numRead - lastMark ) < safeReadAheadLimit ) {
+ ch = next();
+ if ( ch < 0 ) {
+ return MISMATCH;
+ }
+ else if ( isSpace( ch ) ) {
+ push( ch );
+ return MATCH;
+ }
+ else if ( ch == '>' ) {
+ push( ch );
+ return MATCH;
+ }
+ else if ( ch == '<' ) {
+ eatSSI();
+ }
+
+ }
+ }
+ }
+ return MISMATCH;
+ }
+
+ // skip past server side include
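+ // e.g. a directive like <!--#include virtual="header.html" --> (illustrative example)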
+
+ private int eatSSI() throws IOException {
+ // at this point, only a "<" was read.
+ // on a mismatch, push back the last char so that if it was
+ // a quote that closes the attribute, it will be re-read and matched.
+ int ch = next();
+ if ( ch != '!' ) {
+ push( ch );
+ return MISMATCH;
+ }
+ ch = next();
+ if ( ch != '-' ) {
+ push( ch );
+ return MISMATCH;
+ }
+ ch = next();
+ if ( ch != '-' ) {
+ push( ch );
+ return MISMATCH;
+ }
+ ch = next();
+ if ( ch != '#' ) {
+ push( ch );
+ return MISMATCH;
+ }
+
+ push( '#' );
+ push( '-' );
+ push( '-' );
+ return readComment( false );
+ }
+
+ private int readProcessingInstruction() throws IOException {
+ // "<?" has already been read
+ while ( ( numRead - lastMark ) < safeReadAheadLimit ) {
+ int ch = next();
+ if ( ch == '?' && peek() == '>' ) {
+ next();
+ return MATCH;
+ }
+ else if ( ch == -1 ) {
+ return MISMATCH;
+ }
+
+ }
+ return MISMATCH;
+ }
+
+
+ public int read() throws IOException {
+ // TODO: Do we ever want to preserve CDATA sections?
+ // where do we have to worry about them?
+ // <![ CDATA [ unescaped markup ]]>
+ if ( numWhitespace > 0 ) {
+ numEaten += numWhitespace;
+ addOffCorrectMap( numReturned, numEaten );
+ numWhitespace = 0;
+ }
+ numReturned++;
+ //do not limit this one by the READAHEAD
+ while ( true ) {
+ int lastNumRead = numRead;
+ int ch = next();
+
+ switch ( ch ) {
+ case '&':
+ saveState();
+ ch = readEntity();
+ if ( ch >= 0 ) {
+ return ch;
+ }
+ if ( ch == MISMATCH ) {
+ restoreState();
+
+ return '&';
+ }
+ break;
+
+ case '<':
+ saveState();
+ ch = next();
+ int ret = MISMATCH;
+ if ( ch == '!' ) {
+ ret = readBang( false );
+ }
+ else if ( ch == '/' ) {
+ ret = readName( true );
+ if ( ret == MATCH ) {
+ ch = nextSkipWS();
+ ret = ch == '>' ? MATCH : MISMATCH;
+ }
+ }
+ else if ( isAlpha( ch ) ) {
+ push( ch );
+ ret = readTag();
+ }
+ else if ( ch == '?' ) {
+ ret = readProcessingInstruction();
+ }
+
+ // matched something to be discarded, so break
+ // from this case and continue in the loop
+ if ( ret == MATCH ) {
+ //break;//was
+ //return whitespace from
+ numWhitespace = ( numRead - lastNumRead ) - 1;//tack on the -1 since we are returning a space right now
+ return ' ';
+ }
+
+ // didn't match any HTML constructs, so roll back
+ // the stream state and just return '<'
+ restoreState();
+ return '<';
+
+ default:
+ return ch;
+ }
+
+ }
+
+
+ }
+
+ public int read(char cbuf[], int off, int len) throws IOException {
+ int i = 0;
+ for ( i = 0; i < len; i++ ) {
+ int ch = read();
+ if ( ch == -1 ) {
+ break;
+ }
+ cbuf[off++] = ( char ) ch;
+ }
+ if ( i == 0 ) {
+ if ( len == 0 ) {
+ return 0;
+ }
+ return -1;
+ }
+ return i;
+ }
+
+ public void close() throws IOException {
+ input.close();
+ }
+
+
+ private static final HashMap<String, Character> entityTable;
+
+ static {
+ entityTable = new HashMap<String, Character>();
+ // entityName and entityVal generated from the python script
+ // included in comments at the end of this file.
+ final String[] entityName = {
+ "zwnj",
+ "aring",
+ "gt",
+ "yen",
+ "ograve",
+ "Chi",
+ "delta",
+ "rang",
+ "sup",
+ "trade",
+ "Ntilde",
+ "xi",
+ "upsih",
+ "nbsp",
+ "Atilde",
+ "radic",
+ "otimes",
+ "aelig",
+ "oelig",
+ "equiv",
+ "ni",
+ "infin",
+ "Psi",
+ "auml",
+ "cup",
+ "Epsilon",
+ "otilde",
+ "lt",
+ "Icirc",
+ "Eacute",
+ "Lambda",
+ "sbquo",
+ "Prime",
+ "prime",
+ "psi",
+ "Kappa",
+ "rsaquo",
+ "Tau",
+ "uacute",
+ "ocirc",
+ "lrm",
+ "zwj",
+ "cedil",
+ "Alpha",
+ "not",
+ "amp",
+ "AElig",
+ "oslash",
+ "acute",
+ "lceil",
+ "alefsym",
+ "laquo",
+ "shy",
+ "loz",
+ "ge",
+ "Igrave",
+ "nu",
+ "Ograve",
+ "lsaquo",
+ "sube",
+ "euro",
+ "rarr",
+ "sdot",
+ "rdquo",
+ "Yacute",
+ "lfloor",
+ "lArr",
+ "Auml",
+ "Dagger",
+ "brvbar",
+ "Otilde",
+ "szlig",
+ "clubs",
+ "diams",
+ "agrave",
+ "Ocirc",
+ "Iota",
+ "Theta",
+ "Pi",
+ "zeta",
+ "Scaron",
+ "frac14",
+ "egrave",
+ "sub",
+ "iexcl",
+ "frac12",
+ "ordf",
+ "sum",
+ "prop",
+ "Uuml",
+ "ntilde",
+ "atilde",
+ "asymp",
+ "uml",
+ "prod",
+ "nsub",
+ "reg",
+ "rArr",
+ "Oslash",
+ "emsp",
+ "THORN",
+ "yuml",
+ "aacute",
+ "Mu",
+ "hArr",
+ "le",
+ "thinsp",
+ "dArr",
+ "ecirc",
+ "bdquo",
+ "Sigma",
+ "Aring",
+ "tilde",
+ "nabla",
+ "mdash",
+ "uarr",
+ "times",
+ "Ugrave",
+ "Eta",
+ "Agrave",
+ "chi",
+ "real",
+ "circ",
+ "eth",
+ "rceil",
+ "iuml",
+ "gamma",
+ "lambda",
+ "harr",
+ "Egrave",
+ "frac34",
+ "dagger",
+ "divide",
+ "Ouml",
+ "image",
+ "ndash",
+ "hellip",
+ "igrave",
+ "Yuml",
+ "ang",
+ "alpha",
+ "frasl",
+ "ETH",
+ "lowast",
+ "Nu",
+ "plusmn",
+ "bull",
+ "sup1",
+ "sup2",
+ "sup3",
+ "Aacute",
+ "cent",
+ "oline",
+ "Beta",
+ "perp",
+ "Delta",
+ "there4",
+ "pi",
+ "iota",
+ "empty",
+ "euml",
+ "notin",
+ "iacute",
+ "para",
+ "epsilon",
+ "weierp",
+ "OElig",
+ "uuml",
+ "larr",
+ "icirc",
+ "Upsilon",
+ "omicron",
+ "upsilon",
+ "copy",
+ "Iuml",
+ "Oacute",
+ "Xi",
+ "kappa",
+ "ccedil",
+ "Ucirc",
+ "cap",
+ "mu",
+ "scaron",
+ "lsquo",
+ "isin",
+ "Zeta",
+ "minus",
+ "deg",
+ "and",
+ "tau",
+ "pound",
+ "curren",
+ "int",
+ "ucirc",
+ "rfloor",
+ "ensp",
+ "crarr",
+ "ugrave",
+ "exist",
+ "cong",
+ "theta",
+ "oplus",
+ "permil",
+ "Acirc",
+ "piv",
+ "Euml",
+ "Phi",
+ "Iacute",
+ "quot",
+ "Uacute",
+ "Omicron",
+ "ne",
+ "iquest",
+ "eta",
+ "rsquo",
+ "yacute",
+ "Rho",
+ "darr",
+ "Ecirc",
+ "Omega",
+ "acirc",
+ "sim",
+ "phi",
+ "sigmaf",
+ "macr",
+ "thetasym",
+ "Ccedil",
+ "ordm",
+ "uArr",
+ "forall",
+ "beta",
+ "fnof",
+ "rho",
+ "micro",
+ "eacute",
+ "omega",
+ "middot",
+ "Gamma",
+ "rlm",
+ "lang",
+ "spades",
+ "supe",
+ "thorn",
+ "ouml",
+ "or",
+ "raquo",
+ "part",
+ "sect",
+ "ldquo",
+ "hearts",
+ "sigma",
+ "oacute"
+ };
+ final char[] entityVal = {
+ 8204,
+ 229,
+ 62,
+ 165,
+ 242,
+ 935,
+ 948,
+ 9002,
+ 8835,
+ 8482,
+ 209,
+ 958,
+ 978,
+ 160,
+ 195,
+ 8730,
+ 8855,
+ 230,
+ 339,
+ 8801,
+ 8715,
+ 8734,
+ 936,
+ 228,
+ 8746,
+ 917,
+ 245,
+ 60,
+ 206,
+ 201,
+ 923,
+ 8218,
+ 8243,
+ 8242,
+ 968,
+ 922,
+ 8250,
+ 932,
+ 250,
+ 244,
+ 8206,
+ 8205,
+ 184,
+ 913,
+ 172,
+ 38,
+ 198,
+ 248,
+ 180,
+ 8968,
+ 8501,
+ 171,
+ 173,
+ 9674,
+ 8805,
+ 204,
+ 957,
+ 210,
+ 8249,
+ 8838,
+ 8364,
+ 8594,
+ 8901,
+ 8221,
+ 221,
+ 8970,
+ 8656,
+ 196,
+ 8225,
+ 166,
+ 213,
+ 223,
+ 9827,
+ 9830,
+ 224,
+ 212,
+ 921,
+ 920,
+ 928,
+ 950,
+ 352,
+ 188,
+ 232,
+ 8834,
+ 161,
+ 189,
+ 170,
+ 8721,
+ 8733,
+ 220,
+ 241,
+ 227,
+ 8776,
+ 168,
+ 8719,
+ 8836,
+ 174,
+ 8658,
+ 216,
+ 8195,
+ 222,
+ 255,
+ 225,
+ 924,
+ 8660,
+ 8804,
+ 8201,
+ 8659,
+ 234,
+ 8222,
+ 931,
+ 197,
+ 732,
+ 8711,
+ 8212,
+ 8593,
+ 215,
+ 217,
+ 919,
+ 192,
+ 967,
+ 8476,
+ 710,
+ 240,
+ 8969,
+ 239,
+ 947,
+ 955,
+ 8596,
+ 200,
+ 190,
+ 8224,
+ 247,
+ 214,
+ 8465,
+ 8211,
+ 8230,
+ 236,
+ 376,
+ 8736,
+ 945,
+ 8260,
+ 208,
+ 8727,
+ 925,
+ 177,
+ 8226,
+ 185,
+ 178,
+ 179,
+ 193,
+ 162,
+ 8254,
+ 914,
+ 8869,
+ 916,
+ 8756,
+ 960,
+ 953,
+ 8709,
+ 235,
+ 8713,
+ 237,
+ 182,
+ 949,
+ 8472,
+ 338,
+ 252,
+ 8592,
+ 238,
+ 933,
+ 959,
+ 965,
+ 169,
+ 207,
+ 211,
+ 926,
+ 954,
+ 231,
+ 219,
+ 8745,
+ 956,
+ 353,
+ 8216,
+ 8712,
+ 918,
+ 8722,
+ 176,
+ 8743,
+ 964,
+ 163,
+ 164,
+ 8747,
+ 251,
+ 8971,
+ 8194,
+ 8629,
+ 249,
+ 8707,
+ 8773,
+ 952,
+ 8853,
+ 8240,
+ 194,
+ 982,
+ 203,
+ 934,
+ 205,
+ 34,
+ 218,
+ 927,
+ 8800,
+ 191,
+ 951,
+ 8217,
+ 253,
+ 929,
+ 8595,
+ 202,
+ 937,
+ 226,
+ 8764,
+ 966,
+ 962,
+ 175,
+ 977,
+ 199,
+ 186,
+ 8657,
+ 8704,
+ 946,
+ 402,
+ 961,
+ 181,
+ 233,
+ 969,
+ 183,
+ 915,
+ 8207,
+ 9001,
+ 9824,
+ 8839,
+ 254,
+ 246,
+ 8744,
+ 187,
+ 8706,
+ 167,
+ 8220,
+ 9829,
+ 963,
+ 243
+ };
+ for ( int i = 0; i < entityName.length; i++ ) {
+ entityTable.put( entityName[i], new Character( entityVal[i] ) );
+ }
+ // special-case nbsp to a simple space instead of 0xa0
+ entityTable.put( "nbsp", new Character( ' ' ) );
+ }
+
+}
+
+/********************* htmlentity.py **********************
+ # a simple python script to generate an HTML entity table
+ # from text taken from http://www.w3.org/TR/REC-html40/sgml/entities.html
+
+ text="""
+ 24 Character entity references in HTML 4
+
+ Contents
+
+ 1. Introduction to character entity references
+ 2. Character entity references for ISO 8859-1 characters
+ 1. The list of characters
+ 3. Character entity references for symbols, mathematical symbols, and Greek letters
+ 1. The list of characters
+ 4. Character entity references for markup-significant and internationalization characters
+ 1. The list of characters
+
+ 24.1 Introduction to character entity references
+ A character entity reference is an SGML construct that references a character of the document character set.
+
+ This version of HTML supports several sets of character entity references:
+
+ * ISO 8859-1 (Latin-1) characters In accordance with section 14 of [RFC1866], the set of Latin-1 entities has been extended by this specification to cover the whole right part of ISO-8859-1 (all code positions with the high-order bit set), including the already commonly used &nbsp;, © and ®. The names of the entities are taken from the appendices of SGML (defined in [ISO8879]).
+ * symbols, mathematical symbols, and Greek letters. These characters may be represented by glyphs in the Adobe font "Symbol".
+ * markup-significant and internationalization characters (e.g., for bidirectional text).
+
+ The following sections present the complete lists of character entity references. Although, by convention, [ISO10646] the comments following each entry are usually written with uppercase letters, we have converted them to lowercase in this specification for reasons of readability.
+ 24.2 Character entity references for ISO 8859-1 characters
+
+ The character entity references in this section produce characters whose numeric equivalents should already be supported by conforming HTML 2.0 user agents. Thus, the character entity reference &divide; is a more convenient form than &#247; for obtaining the division sign.
+
+ To support these named entities, user agents need only recognize the entity names and convert them to characters that lie within the repertoire of [ISO88591].
+
+ Character 65533 (FFFD hexadecimal) is the last valid character in UCS-2. 65534 (FFFE hexadecimal) is unassigned and reserved as the byte-swapped version of ZERO WIDTH NON-BREAKING SPACE for byte-order detection purposes. 65535 (FFFF hexadecimal) is unassigned.
+ 24.2.1 The list of characters
+
+ <!-- Portions (c) International Organization for Standardization 1986
+ Permission to copy in any form is granted for use with
+ conforming SGML systems and applications as defined in
+ ISO 8879, provided this notice is included in all copies.
+ -->
+ <!-- Character entity set. Typical invocation:
+ <!ENTITY % HTMLlat1 PUBLIC
+ "-//W3C//ENTITIES Latin 1//EN//HTML">
+ %HTMLlat1;
+ -->
+
+ <!ENTITY nbsp CDATA " " -- no-break space = non-breaking space,
+ U+00A0 ISOnum -->
+ <!ENTITY iexcl CDATA "¡" -- inverted exclamation mark, U+00A1
ISOnum -->
+ <!ENTITY cent CDATA "¢" -- cent sign, U+00A2 ISOnum -->
+ <!ENTITY pound CDATA "£" -- pound sign, U+00A3 ISOnum -->
+ <!ENTITY curren CDATA "¤" -- currency sign, U+00A4 ISOnum -->
+ <!ENTITY yen CDATA "¥" -- yen sign = yuan sign, U+00A5 ISOnum
-->
+ <!ENTITY brvbar CDATA "¦" -- broken bar = broken vertical bar,
+ U+00A6 ISOnum -->
+ <!ENTITY sect CDATA "§" -- section sign, U+00A7 ISOnum -->
+ <!ENTITY uml CDATA "¨" -- diaeresis = spacing diaeresis,
+ U+00A8 ISOdia -->
+ <!ENTITY copy CDATA "©" -- copyright sign, U+00A9 ISOnum -->
+ <!ENTITY ordf CDATA "ª" -- feminine ordinal indicator, U+00AA
ISOnum -->
+ <!ENTITY laquo CDATA "«" -- left-pointing double angle quotation
mark
+ = left pointing guillemet, U+00AB ISOnum -->
+ <!ENTITY not CDATA "¬" -- not sign, U+00AC ISOnum -->
+ <!ENTITY shy CDATA "­" -- soft hyphen = discretionary hyphen,
+ U+00AD ISOnum -->
+ <!ENTITY reg CDATA "®" -- registered sign = registered trade
mark sign,
+ U+00AE ISOnum -->
+ <!ENTITY macr CDATA "¯" -- macron = spacing macron = overline
+ = APL overbar, U+00AF ISOdia -->
+ <!ENTITY deg CDATA "°" -- degree sign, U+00B0 ISOnum -->
+ <!ENTITY plusmn CDATA "±" -- plus-minus sign = plus-or-minus
sign,
+ U+00B1 ISOnum -->
+ <!ENTITY sup2 CDATA "²" -- superscript two = superscript digit
two
+ = squared, U+00B2 ISOnum -->
+ <!ENTITY sup3 CDATA "³" -- superscript three = superscript digit
three
+ = cubed, U+00B3 ISOnum -->
+ <!ENTITY acute CDATA "´" -- acute accent = spacing acute,
+ U+00B4 ISOdia -->
+ <!ENTITY micro CDATA "µ" -- micro sign, U+00B5 ISOnum -->
+ <!ENTITY para CDATA "¶" -- pilcrow sign = paragraph sign,
+ U+00B6 ISOnum -->
+ <!ENTITY middot CDATA "·" -- middle dot = Georgian comma
+ = Greek middle dot, U+00B7 ISOnum -->
+ <!ENTITY cedil CDATA "¸" -- cedilla = spacing cedilla, U+00B8
ISOdia -->
+ <!ENTITY sup1 CDATA "¹" -- superscript one = superscript digit
one,
+ U+00B9 ISOnum -->
+ <!ENTITY ordm CDATA "º" -- masculine ordinal indicator,
+ U+00BA ISOnum -->
+ <!ENTITY raquo CDATA "»" -- right-pointing double angle quotation
mark
+ = right pointing guillemet, U+00BB ISOnum -->
+ <!ENTITY frac14 CDATA "¼" -- vulgar fraction one quarter
+ = fraction one quarter, U+00BC ISOnum -->
+ <!ENTITY frac12 CDATA "½" -- vulgar fraction one half
+ = fraction one half, U+00BD ISOnum -->
+ <!ENTITY frac34 CDATA "¾" -- vulgar fraction three quarters
+ = fraction three quarters, U+00BE ISOnum -->
+ <!ENTITY iquest CDATA "¿" -- inverted question mark
+ = turned question mark, U+00BF ISOnum -->
+ <!ENTITY Agrave CDATA "À" -- latin capital letter A with grave
+ = latin capital letter A grave,
+ U+00C0 ISOlat1 -->
+ <!ENTITY Aacute CDATA "Á" -- latin capital letter A with acute,
+ U+00C1 ISOlat1 -->
+ <!ENTITY Acirc CDATA "Â" -- latin capital letter A with
circumflex,
+ U+00C2 ISOlat1 -->
+ <!ENTITY Atilde CDATA "Ã" -- latin capital letter A with tilde,
+ U+00C3 ISOlat1 -->
+ <!ENTITY Auml CDATA "Ä" -- latin capital letter A with
diaeresis,
+ U+00C4 ISOlat1 -->
+ <!ENTITY Aring CDATA "Å" -- latin capital letter A with ring
above
+ = latin capital letter A ring,
+ U+00C5 ISOlat1 -->
+ <!ENTITY AElig CDATA "Æ" -- latin capital letter AE
+ = latin capital ligature AE,
+ U+00C6 ISOlat1 -->
+ <!ENTITY Ccedil CDATA "Ç" -- latin capital letter C with cedilla,
+ U+00C7 ISOlat1 -->
+ <!ENTITY Egrave CDATA "È" -- latin capital letter E with grave,
+ U+00C8 ISOlat1 -->
+ <!ENTITY Eacute CDATA "É" -- latin capital letter E with acute,
+ U+00C9 ISOlat1 -->
+ <!ENTITY Ecirc CDATA "Ê" -- latin capital letter E with
circumflex,
+ U+00CA ISOlat1 -->
+ <!ENTITY Euml CDATA "Ë" -- latin capital letter E with
diaeresis,
+ U+00CB ISOlat1 -->
+ <!ENTITY Igrave CDATA "Ì" -- latin capital letter I with grave,
+ U+00CC ISOlat1 -->
+ <!ENTITY Iacute CDATA "Í" -- latin capital letter I with acute,
+ U+00CD ISOlat1 -->
+ <!ENTITY Icirc CDATA "Î" -- latin capital letter I with
circumflex,
+ U+00CE ISOlat1 -->
+ <!ENTITY Iuml CDATA "Ï" -- latin capital letter I with
diaeresis,
+ U+00CF ISOlat1 -->
+ <!ENTITY ETH CDATA "Ð" -- latin capital letter ETH, U+00D0
ISOlat1 -->
+ <!ENTITY Ntilde CDATA "Ñ" -- latin capital letter N with tilde,
+ U+00D1 ISOlat1 -->
+ <!ENTITY Ograve CDATA "Ò" -- latin capital letter O with grave,
+ U+00D2 ISOlat1 -->
+ <!ENTITY Oacute CDATA "Ó" -- latin capital letter O with acute,
+ U+00D3 ISOlat1 -->
+ <!ENTITY Ocirc CDATA "Ô" -- latin capital letter O with
circumflex,
+ U+00D4 ISOlat1 -->
+ <!ENTITY Otilde CDATA "Õ" -- latin capital letter O with tilde,
+ U+00D5 ISOlat1 -->
+ <!ENTITY Ouml CDATA "Ö" -- latin capital letter O with
diaeresis,
+ U+00D6 ISOlat1 -->
+ <!ENTITY times CDATA "×" -- multiplication sign, U+00D7 ISOnum
-->
+ <!ENTITY Oslash CDATA "Ø" -- latin capital letter O with stroke
+ = latin capital letter O slash,
+ U+00D8 ISOlat1 -->
+ <!ENTITY Ugrave CDATA "Ù" -- latin capital letter U with grave,
+ U+00D9 ISOlat1 -->
+ <!ENTITY Uacute CDATA "Ú" -- latin capital letter U with acute,
+ U+00DA ISOlat1 -->
+ <!ENTITY Ucirc CDATA "Û" -- latin capital letter U with
circumflex,
+ U+00DB ISOlat1 -->
+ <!ENTITY Uuml CDATA "Ü" -- latin capital letter U with
diaeresis,
+ U+00DC ISOlat1 -->
+ <!ENTITY Yacute CDATA "Ý" -- latin capital letter Y with acute,
+ U+00DD ISOlat1 -->
+ <!ENTITY THORN CDATA "Þ" -- latin capital letter THORN,
+ U+00DE ISOlat1 -->
+ <!ENTITY szlig CDATA "ß" -- latin small letter sharp s =
ess-zed,
+ U+00DF ISOlat1 -->
+ <!ENTITY agrave CDATA "à" -- latin small letter a with grave
+ = latin small letter a grave,
+ U+00E0 ISOlat1 -->
+ <!ENTITY aacute CDATA "á" -- latin small letter a with acute,
+ U+00E1 ISOlat1 -->
+ <!ENTITY acirc CDATA "â" -- latin small letter a with
circumflex,
+ U+00E2 ISOlat1 -->
+ <!ENTITY atilde CDATA "ã" -- latin small letter a with tilde,
+ U+00E3 ISOlat1 -->
+ <!ENTITY auml CDATA "ä" -- latin small letter a with diaeresis,
+ U+00E4 ISOlat1 -->
+ <!ENTITY aring CDATA "å" -- latin small letter a with ring above
+ = latin small letter a ring,
+ U+00E5 ISOlat1 -->
+ <!ENTITY aelig CDATA "æ" -- latin small letter ae
+ = latin small ligature ae, U+00E6 ISOlat1 -->
+ <!ENTITY ccedil CDATA "ç" -- latin small letter c with cedilla,
+ U+00E7 ISOlat1 -->
+ <!ENTITY egrave CDATA "è" -- latin small letter e with grave,
+ U+00E8 ISOlat1 -->
+ <!ENTITY eacute CDATA "é" -- latin small letter e with acute,
+ U+00E9 ISOlat1 -->
+ <!ENTITY ecirc CDATA "ê" -- latin small letter e with
circumflex,
+ U+00EA ISOlat1 -->
+ <!ENTITY euml CDATA "ë" -- latin small letter e with diaeresis,
+ U+00EB ISOlat1 -->
+ <!ENTITY igrave CDATA "ì" -- latin small letter i with grave,
+ U+00EC ISOlat1 -->
+ <!ENTITY iacute CDATA "í" -- latin small letter i with acute,
+ U+00ED ISOlat1 -->
+ <!ENTITY icirc CDATA "î" -- latin small letter i with
circumflex,
+ U+00EE ISOlat1 -->
+ <!ENTITY iuml CDATA "ï" -- latin small letter i with diaeresis,
+ U+00EF ISOlat1 -->
+ <!ENTITY eth CDATA "ð" -- latin small letter eth, U+00F0
ISOlat1 -->
+ <!ENTITY ntilde CDATA "ñ" -- latin small letter n with tilde,
+ U+00F1 ISOlat1 -->
+ <!ENTITY ograve CDATA "ò" -- latin small letter o with grave,
+ U+00F2 ISOlat1 -->
+ <!ENTITY oacute CDATA "ó" -- latin small letter o with acute,
+ U+00F3 ISOlat1 -->
+ <!ENTITY ocirc CDATA "ô" -- latin small letter o with
circumflex,
+ U+00F4 ISOlat1 -->
+ <!ENTITY otilde CDATA "õ" -- latin small letter o with tilde,
+ U+00F5 ISOlat1 -->
+ <!ENTITY ouml CDATA "ö" -- latin small letter o with diaeresis,
+ U+00F6 ISOlat1 -->
+ <!ENTITY divide CDATA "÷" -- division sign, U+00F7 ISOnum -->
+ <!ENTITY oslash CDATA "ø" -- latin small letter o with stroke,
+ = latin small letter o slash,
+ U+00F8 ISOlat1 -->
+ <!ENTITY ugrave CDATA "ù" -- latin small letter u with grave,
+ U+00F9 ISOlat1 -->
+ <!ENTITY uacute CDATA "ú" -- latin small letter u with acute,
+ U+00FA ISOlat1 -->
+ <!ENTITY ucirc CDATA "û" -- latin small letter u with
circumflex,
+ U+00FB ISOlat1 -->
+ <!ENTITY uuml CDATA "ü" -- latin small letter u with diaeresis,
+ U+00FC ISOlat1 -->
+ <!ENTITY yacute CDATA "ý" -- latin small letter y with acute,
+ U+00FD ISOlat1 -->
+ <!ENTITY thorn CDATA "þ" -- latin small letter thorn,
+ U+00FE ISOlat1 -->
+ <!ENTITY yuml CDATA "ÿ" -- latin small letter y with diaeresis,
+ U+00FF ISOlat1 -->
+
+ 24.3 Character entity references for symbols, mathematical symbols, and Greek letters
+
+ The character entity references in this section produce characters that may be represented by glyphs in the widely available Adobe Symbol font, including Greek characters, various bracketing symbols, and a selection of mathematical operators such as gradient, product, and summation symbols.
+
+ To support these entities, user agents may support full [ISO10646] or use other means. Display of glyphs for these characters may be obtained by being able to display the relevant [ISO10646] characters or by other means, such as internally mapping the listed entities, numeric character references, and characters to the appropriate position in some font that contains the requisite glyphs.
+
+ When to use Greek entities. This entity set contains all the letters used in modern Greek. However, it does not include Greek punctuation, precomposed accented characters nor the non-spacing accents (tonos, dialytika) required to compose them. There are no archaic letters, Coptic-unique letters, or precomposed letters for Polytonic Greek. The entities defined here are not intended for the representation of modern Greek text and would not be an efficient representation; rather, they are intended for occasional Greek letters used in technical and mathematical works.
+ 24.3.1 The list of characters
+
+ <!-- Mathematical, Greek and Symbolic characters for HTML -->
+
+ <!-- Character entity set. Typical invocation:
+ <!ENTITY % HTMLsymbol PUBLIC
+ "-//W3C//ENTITIES Symbols//EN//HTML">
+ %HTMLsymbol; -->
+
+ <!-- Portions (c) International Organization for Standardization 1986:
+ Permission to copy in any form is granted for use with
+ conforming SGML systems and applications as defined in
+ ISO 8879, provided this notice is included in all copies.
+ -->
+
+ <!-- Relevant ISO entity set is given unless names are newly introduced.
+ New names (i.e., not in ISO 8879 list) do not clash with any
+ existing ISO 8879 entity names. ISO 10646 character numbers
+ are given for each character, in hex. CDATA values are decimal
+ conversions of the ISO 10646 values and refer to the document
+ character set. Names are ISO 10646 names.
+
+ -->
+
+ <!-- Latin Extended-B -->
+ <!ENTITY fnof CDATA "ƒ" -- latin small f with hook = function
+ = florin, U+0192 ISOtech -->
+
+ <!-- Greek -->
+ <!ENTITY Alpha CDATA "Α" -- greek capital letter alpha, U+0391
-->
+ <!ENTITY Beta CDATA "Β" -- greek capital letter beta, U+0392
-->
+ <!ENTITY Gamma CDATA "Γ" -- greek capital letter gamma,
+ U+0393 ISOgrk3 -->
+ <!ENTITY Delta CDATA "Δ" -- greek capital letter delta,
+ U+0394 ISOgrk3 -->
+ <!ENTITY Epsilon CDATA "Ε" -- greek capital letter epsilon,
U+0395 -->
+ <!ENTITY Zeta CDATA "Ζ" -- greek capital letter zeta, U+0396
-->
+ <!ENTITY Eta CDATA "Η" -- greek capital letter eta, U+0397
-->
+ <!ENTITY Theta CDATA "Θ" -- greek capital letter theta,
+ U+0398 ISOgrk3 -->
+ <!ENTITY Iota CDATA "Ι" -- greek capital letter iota, U+0399
-->
+ <!ENTITY Kappa CDATA "Κ" -- greek capital letter kappa, U+039A
-->
+ <!ENTITY Lambda CDATA "Λ" -- greek capital letter lambda,
+ U+039B ISOgrk3 -->
+ <!ENTITY Mu CDATA "Μ" -- greek capital letter mu, U+039C
-->
+ <!ENTITY Nu CDATA "Ν" -- greek capital letter nu, U+039D
-->
+ <!ENTITY Xi CDATA "Ξ" -- greek capital letter xi, U+039E
ISOgrk3 -->
+ <!ENTITY Omicron CDATA "Ο" -- greek capital letter omicron,
U+039F -->
+ <!ENTITY Pi CDATA "Π" -- greek capital letter pi, U+03A0
ISOgrk3 -->
+ <!ENTITY Rho CDATA "Ρ" -- greek capital letter rho, U+03A1
-->
+ <!-- there is no Sigmaf, and no U+03A2 character either -->
+ <!ENTITY Sigma CDATA "Σ" -- greek capital letter sigma,
+ U+03A3 ISOgrk3 -->
+ <!ENTITY Tau CDATA "Τ" -- greek capital letter tau, U+03A4
-->
+ <!ENTITY Upsilon CDATA "Υ" -- greek capital letter upsilon,
+ U+03A5 ISOgrk3 -->
+ <!ENTITY Phi CDATA "Φ" -- greek capital letter phi,
+ U+03A6 ISOgrk3 -->
+ <!ENTITY Chi CDATA "Χ" -- greek capital letter chi, U+03A7
-->
+ <!ENTITY Psi CDATA "Ψ" -- greek capital letter psi,
+ U+03A8 ISOgrk3 -->
+ <!ENTITY Omega CDATA "Ω" -- greek capital letter omega,
+ U+03A9 ISOgrk3 -->
+
+ <!ENTITY alpha CDATA "α" -- greek small letter alpha,
+ U+03B1 ISOgrk3 -->
+ <!ENTITY beta CDATA "β" -- greek small letter beta, U+03B2
ISOgrk3 -->
+ <!ENTITY gamma CDATA "γ" -- greek small letter gamma,
+ U+03B3 ISOgrk3 -->
+ <!ENTITY delta CDATA "δ" -- greek small letter delta,
+ U+03B4 ISOgrk3 -->
+ <!ENTITY epsilon CDATA "ε" -- greek small letter epsilon,
+ U+03B5 ISOgrk3 -->
+ <!ENTITY zeta CDATA "ζ" -- greek small letter zeta, U+03B6
ISOgrk3 -->
+ <!ENTITY eta CDATA "η" -- greek small letter eta, U+03B7
ISOgrk3 -->
+ <!ENTITY theta CDATA "θ" -- greek small letter theta,
+ U+03B8 ISOgrk3 -->
+ <!ENTITY iota CDATA "ι" -- greek small letter iota, U+03B9
ISOgrk3 -->
+ <!ENTITY kappa CDATA "κ" -- greek small letter kappa,
+ U+03BA ISOgrk3 -->
+ <!ENTITY lambda CDATA "λ" -- greek small letter lambda,
+ U+03BB ISOgrk3 -->
+ <!ENTITY mu CDATA "μ" -- greek small letter mu, U+03BC
ISOgrk3 -->
+ <!ENTITY nu CDATA "ν" -- greek small letter nu, U+03BD
ISOgrk3 -->
+ <!ENTITY xi CDATA "ξ" -- greek small letter xi, U+03BE
ISOgrk3 -->
+ <!ENTITY omicron CDATA "ο" -- greek small letter omicron, U+03BF
NEW -->
+ <!ENTITY pi CDATA "π" -- greek small letter pi, U+03C0
ISOgrk3 -->
+ <!ENTITY rho CDATA "ρ" -- greek small letter rho, U+03C1
ISOgrk3 -->
+ <!ENTITY sigmaf CDATA "ς" -- greek small letter final sigma,
+ U+03C2 ISOgrk3 -->
+ <!ENTITY sigma CDATA "σ" -- greek small letter sigma,
+ U+03C3 ISOgrk3 -->
+ <!ENTITY tau CDATA "τ" -- greek small letter tau, U+03C4
ISOgrk3 -->
+ <!ENTITY upsilon CDATA "υ" -- greek small letter upsilon,
+ U+03C5 ISOgrk3 -->
+ <!ENTITY phi CDATA "φ" -- greek small letter phi, U+03C6
ISOgrk3 -->
+ <!ENTITY chi CDATA "χ" -- greek small letter chi, U+03C7
ISOgrk3 -->
+ <!ENTITY psi CDATA "ψ" -- greek small letter psi, U+03C8
ISOgrk3 -->
+ <!ENTITY omega CDATA "ω" -- greek small letter omega,
+ U+03C9 ISOgrk3 -->
+ <!ENTITY thetasym CDATA "ϑ" -- greek small letter theta symbol,
+ U+03D1 NEW -->
+ <!ENTITY upsih CDATA "ϒ" -- greek upsilon with hook symbol,
+ U+03D2 NEW -->
+ <!ENTITY piv CDATA "ϖ" -- greek pi symbol, U+03D6 ISOgrk3
-->
+
+ <!-- General Punctuation -->
+ <!ENTITY bull CDATA "•" -- bullet = black small circle,
+ U+2022 ISOpub -->
+ <!-- bullet is NOT the same as bullet operator, U+2219 -->
+ <!ENTITY hellip CDATA "…" -- horizontal ellipsis = three dot
leader,
+ U+2026 ISOpub -->
+ <!ENTITY prime CDATA "′" -- prime = minutes = feet, U+2032
ISOtech -->
+ <!ENTITY Prime CDATA "″" -- double prime = seconds = inches,
+ U+2033 ISOtech -->
+ <!ENTITY oline CDATA "‾" -- overline = spacing overscore,
+ U+203E NEW -->
+ <!ENTITY frasl CDATA "⁄" -- fraction slash, U+2044 NEW -->
+
+ <!-- Letterlike Symbols -->
+ <!ENTITY weierp CDATA "℘" -- script capital P = power set
+ = Weierstrass p, U+2118 ISOamso -->
+ <!ENTITY image CDATA "ℑ" -- blackletter capital I = imaginary
part,
+ U+2111 ISOamso -->
+ <!ENTITY real CDATA "ℜ" -- blackletter capital R = real part
symbol,
+ U+211C ISOamso -->
+ <!ENTITY trade CDATA "™" -- trade mark sign, U+2122 ISOnum
-->
+ <!ENTITY alefsym CDATA "ℵ" -- alef symbol = first transfinite
cardinal,
+ U+2135 NEW -->
+ <!-- alef symbol is NOT the same as hebrew letter alef,
+ U+05D0 although the same glyph could be used to depict both characters -->
+
+ <!-- Arrows -->
+ <!ENTITY larr CDATA "←" -- leftwards arrow, U+2190 ISOnum
-->
+ <!ENTITY uarr CDATA "↑" -- upwards arrow, U+2191
ISOnum-->
+ <!ENTITY rarr CDATA "→" -- rightwards arrow, U+2192 ISOnum
-->
+ <!ENTITY darr CDATA "↓" -- downwards arrow, U+2193 ISOnum
-->
+ <!ENTITY harr CDATA "↔" -- left right arrow, U+2194 ISOamsa
-->
+ <!ENTITY crarr CDATA "↵" -- downwards arrow with corner
leftwards
+ = carriage return, U+21B5 NEW -->
+ <!ENTITY lArr CDATA "⇐" -- leftwards double arrow, U+21D0
ISOtech -->
+ <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
arrow
+ but also does not have any other character for that function. So ? lArr can
+ be used for 'is implied by' as ISOtech suggests -->
+ <!ENTITY uArr CDATA "⇑" -- upwards double arrow, U+21D1
ISOamsa -->
+ <!ENTITY rArr CDATA "⇒" -- rightwards double arrow,
+ U+21D2 ISOtech -->
+ <!-- ISO 10646 does not say this is the 'implies' character but does not
have
+ another character with this function so ?
+ rArr can be used for 'implies' as ISOtech suggests -->
+ <!ENTITY dArr CDATA "⇓" -- downwards double arrow, U+21D3
ISOamsa -->
+ <!ENTITY hArr CDATA "⇔" -- left right double arrow,
+ U+21D4 ISOamsa -->
+
+ <!-- Mathematical Operators -->
+ <!ENTITY forall CDATA "∀" -- for all, U+2200 ISOtech -->
+ <!ENTITY part CDATA "∂" -- partial differential, U+2202
ISOtech -->
+ <!ENTITY exist CDATA "∃" -- there exists, U+2203 ISOtech
-->
+ <!ENTITY empty CDATA "∅" -- empty set = null set = diameter,
+ U+2205 ISOamso -->
+ <!ENTITY nabla CDATA "∇" -- nabla = backward difference,
+ U+2207 ISOtech -->
+ <!ENTITY isin CDATA "∈" -- element of, U+2208 ISOtech -->
+ <!ENTITY notin CDATA "∉" -- not an element of, U+2209 ISOtech
-->
+ <!ENTITY ni CDATA "∋" -- contains as member, U+220B ISOtech
-->
+ <!-- should there be a more memorable name than 'ni'? -->
+ <!ENTITY prod CDATA "∏" -- n-ary product = product sign,
+ U+220F ISOamsb -->
+ <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
though
+ the same glyph might be used for both -->
+ <!ENTITY sum CDATA "∑" -- n-ary sumation, U+2211 ISOamsb
-->
+ <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
+ though the same glyph might be used for both -->
+ <!ENTITY minus CDATA "−" -- minus sign, U+2212 ISOtech -->
+ <!ENTITY lowast CDATA "∗" -- asterisk operator, U+2217 ISOtech
-->
+ <!ENTITY radic CDATA "√" -- square root = radical sign,
+ U+221A ISOtech -->
+ <!ENTITY prop CDATA "∝" -- proportional to, U+221D ISOtech
-->
+ <!ENTITY infin CDATA "∞" -- infinity, U+221E ISOtech -->
+ <!ENTITY ang CDATA "∠" -- angle, U+2220 ISOamso -->
+ <!ENTITY and CDATA "∧" -- logical and = wedge, U+2227
ISOtech -->
+ <!ENTITY or CDATA "∨" -- logical or = vee, U+2228 ISOtech
-->
+ <!ENTITY cap CDATA "∩" -- intersection = cap, U+2229 ISOtech
-->
+ <!ENTITY cup CDATA "∪" -- union = cup, U+222A ISOtech
-->
+ <!ENTITY int CDATA "∫" -- integral, U+222B ISOtech -->
+ <!ENTITY there4 CDATA "∴" -- therefore, U+2234 ISOtech -->
+ <!ENTITY sim CDATA "∼" -- tilde operator = varies with =
similar to,
+ U+223C ISOtech -->
+ <!-- tilde operator is NOT the same character as the tilde, U+007E,
+ although the same glyph might be used to represent both -->
+ <!ENTITY cong CDATA "≅" -- approximately equal to, U+2245
ISOtech -->
+ <!ENTITY asymp CDATA "≈" -- almost equal to = asymptotic to,
+ U+2248 ISOamsr -->
+ <!ENTITY ne CDATA "≠" -- not equal to, U+2260 ISOtech
-->
+ <!ENTITY equiv CDATA "≡" -- identical to, U+2261 ISOtech
-->
+ <!ENTITY le CDATA "≤" -- less-than or equal to, U+2264
ISOtech -->
+ <!ENTITY ge CDATA "≥" -- greater-than or equal to,
+ U+2265 ISOtech -->
+ <!ENTITY sub CDATA "⊂" -- subset of, U+2282 ISOtech -->
+ <!ENTITY sup CDATA "⊃" -- superset of, U+2283 ISOtech
-->
+ <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
Symbol
+ font encoding and is not included. Should it be, for symmetry?
+ It is in ISOamsn -->
+ <!ENTITY nsub CDATA "⊄" -- not a subset of, U+2284 ISOamsn
-->
+ <!ENTITY sube CDATA "⊆" -- subset of or equal to, U+2286
ISOtech -->
+ <!ENTITY supe CDATA "⊇" -- superset of or equal to,
+ U+2287 ISOtech -->
+ <!ENTITY oplus CDATA "⊕" -- circled plus = direct sum,
+ U+2295 ISOamsb -->
+ <!ENTITY otimes CDATA "⊗" -- circled times = vector product,
+ U+2297 ISOamsb -->
+ <!ENTITY perp CDATA "⊥" -- up tack = orthogonal to =
perpendicular,
+ U+22A5 ISOtech -->
+ <!ENTITY sdot CDATA "⋅" -- dot operator, U+22C5 ISOamsb
-->
+ <!-- dot operator is NOT the same character as U+00B7 middle dot -->
+
+ <!-- Miscellaneous Technical -->
+ <!ENTITY lceil CDATA "⌈" -- left ceiling = apl upstile,
+ U+2308 ISOamsc -->
+ <!ENTITY rceil CDATA "⌉" -- right ceiling, U+2309 ISOamsc
-->
+ <!ENTITY lfloor CDATA "⌊" -- left floor = apl downstile,
+ U+230A ISOamsc -->
+ <!ENTITY rfloor CDATA "⌋" -- right floor, U+230B ISOamsc
-->
+ <!ENTITY lang CDATA "〈" -- left-pointing angle bracket =
bra,
+ U+2329 ISOtech -->
+ <!-- lang is NOT the same character as U+003C 'less than'
+ or U+2039 'single left-pointing angle quotation mark' -->
+ <!ENTITY rang CDATA "〉" -- right-pointing angle bracket =
ket,
+ U+232A ISOtech -->
+ <!-- rang is NOT the same character as U+003E 'greater than'
+ or U+203A 'single right-pointing angle quotation mark' -->
+
+ <!-- Geometric Shapes -->
+ <!ENTITY loz CDATA "◊" -- lozenge, U+25CA ISOpub -->
+
+ <!-- Miscellaneous Symbols -->
+ <!ENTITY spades CDATA "♠" -- black spade suit, U+2660 ISOpub
-->
+ <!-- black here seems to mean filled as opposed to hollow -->
+ <!ENTITY clubs CDATA "♣" -- black club suit = shamrock,
+ U+2663 ISOpub -->
+ <!ENTITY hearts CDATA "♥" -- black heart suit = valentine,
+ U+2665 ISOpub -->
+ <!ENTITY diams CDATA "♦" -- black diamond suit, U+2666 ISOpub
-->
+
+ 24.4 Character entity references for markup-significant and internationalization characters
+
+ The character entity references in this section are for escaping markup-significant characters (these are the same as those in HTML 2.0 and 3.2), for denoting spaces and dashes. Other characters in this section apply to internationalization issues such as the disambiguation of bidirectional text (see the section on bidirectional text for details).
+
+ Entities have also been added for the remaining characters occurring in CP-1252 which do not occur in the HTMLlat1 or HTMLsymbol entity sets. These all occur in the 128 to 159 range within the CP-1252 charset. These entities permit the characters to be denoted in a platform-independent manner.
+
+ To support these entities, user agents may support full [ISO10646] or use other means. Display of glyphs for these characters may be obtained by being able to display the relevant [ISO10646] characters or by other means, such as internally mapping the listed entities, numeric character references, and characters to the appropriate position in some font that contains the requisite glyphs.
+ 24.4.1 The list of characters
+
+ <!-- Special characters for HTML -->
+
+ <!-- Character entity set. Typical invocation:
+ <!ENTITY % HTMLspecial PUBLIC
+ "-//W3C//ENTITIES Special//EN//HTML">
+ %HTMLspecial; -->
+
+ <!-- Portions (c) International Organization for Standardization 1986:
+ Permission to copy in any form is granted for use with
+ conforming SGML systems and applications as defined in
+ ISO 8879, provided this notice is included in all copies.
+ -->
+
+ <!-- Relevant ISO entity set is given unless names are newly introduced.
+ New names (i.e., not in ISO 8879 list) do not clash with any
+ existing ISO 8879 entity names. ISO 10646 character numbers
+ are given for each character, in hex. CDATA values are decimal
+ conversions of the ISO 10646 values and refer to the document
+ character set. Names are ISO 10646 names.
+
+ -->
+
+ <!-- C0 Controls and Basic Latin -->
+ <!ENTITY quot CDATA """ -- quotation mark = APL quote,
+ U+0022 ISOnum -->
+ <!ENTITY amp CDATA "&" -- ampersand, U+0026 ISOnum -->
+ <!ENTITY lt CDATA "<" -- less-than sign, U+003C ISOnum
-->
+ <!ENTITY gt CDATA ">" -- greater-than sign, U+003E ISOnum
-->
+
+ <!-- Latin Extended-A -->
+ <!ENTITY OElig CDATA "Œ" -- latin capital ligature OE,
+ U+0152 ISOlat2 -->
+ <!ENTITY oelig CDATA "œ" -- latin small ligature oe, U+0153
ISOlat2 -->
+ <!-- ligature is a misnomer, this is a separate character in some languages -->
+ <!ENTITY Scaron CDATA "Š" -- latin capital letter S with caron,
+ U+0160 ISOlat2 -->
+ <!ENTITY scaron CDATA "š" -- latin small letter s with caron,
+ U+0161 ISOlat2 -->
+ <!ENTITY Yuml CDATA "Ÿ" -- latin capital letter Y with
diaeresis,
+ U+0178 ISOlat2 -->
+
+ <!-- Spacing Modifier Letters -->
+ <!ENTITY circ CDATA "ˆ" -- modifier letter circumflex accent,
+ U+02C6 ISOpub -->
+ <!ENTITY tilde CDATA "˜" -- small tilde, U+02DC ISOdia -->
+
+ <!-- General Punctuation -->
+ <!ENTITY ensp CDATA " " -- en space, U+2002 ISOpub -->
+ <!ENTITY emsp CDATA " " -- em space, U+2003 ISOpub -->
+ <!ENTITY thinsp CDATA " " -- thin space, U+2009 ISOpub -->
+ <!ENTITY zwnj CDATA "‌" -- zero width non-joiner,
+ U+200C NEW RFC 2070 -->
+ <!ENTITY zwj CDATA "‍" -- zero width joiner, U+200D NEW RFC
2070 -->
+ <!ENTITY lrm CDATA "‎" -- left-to-right mark, U+200E NEW RFC
2070 -->
+ <!ENTITY rlm CDATA "‏" -- right-to-left mark, U+200F NEW RFC
2070 -->
+ <!ENTITY ndash CDATA "–" -- en dash, U+2013 ISOpub -->
+ <!ENTITY mdash CDATA "—" -- em dash, U+2014 ISOpub -->
+ <!ENTITY lsquo CDATA "‘" -- left single quotation mark,
+ U+2018 ISOnum -->
+ <!ENTITY rsquo CDATA "’" -- right single quotation mark,
+ U+2019 ISOnum -->
+ <!ENTITY sbquo CDATA "‚" -- single low-9 quotation mark, U+201A
NEW -->
+ <!ENTITY ldquo CDATA "“" -- left double quotation mark,
+ U+201C ISOnum -->
+ <!ENTITY rdquo CDATA "”" -- right double quotation mark,
+ U+201D ISOnum -->
+ <!ENTITY bdquo CDATA "„" -- double low-9 quotation mark, U+201E
NEW -->
+ <!ENTITY dagger CDATA "†" -- dagger, U+2020 ISOpub -->
+ <!ENTITY Dagger CDATA "‡" -- double dagger, U+2021 ISOpub
-->
+ <!ENTITY permil CDATA "‰" -- per mille sign, U+2030 ISOtech
-->
+ <!ENTITY lsaquo CDATA "‹" -- single left-pointing angle
quotation mark,
+ U+2039 ISO proposed -->
+ <!-- lsaquo is proposed but not yet ISO standardized -->
+ <!ENTITY rsaquo CDATA "›" -- single right-pointing angle
quotation mark,
+ U+203A ISO proposed -->
+ <!-- rsaquo is proposed but not yet ISO standardized -->
+ <!ENTITY euro CDATA "€" -- euro sign, U+20AC NEW -->
+ """
+
+ codes={}
+ for line in text.split('\n'):
+     parts = line.split()
+     if len(parts)<3 or parts[0]!='<!ENTITY' or parts[2]!='CDATA': continue
+     codes[parts[1]] = parts[3].strip('&#";')
+
+ print 'entityName={', ','.join([ '"'+key+'"' for key in codes]), '};'
+ print 'entityVal={', ','.join([ str(codes[key]) for key in codes]), '};'
+
+
+ ********************** end htmlentity.py ********************/
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+package org.apache.solr.analysis;
+
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.CharStream;
+
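+/**
+ * Factory for {@link HTMLStripCharFilter}.
+ * <p>
+ * A minimal schema.xml sketch (illustrative only; the surrounding analyzer
+ * chain is an assumption, not part of this commit):
+ * <pre>
+ *   <charFilter class="solr.HTMLStripCharFilterFactory"/>
+ *   <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * </pre>
+ */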
+public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {
+
+ public HTMLStripCharFilter create(CharStream input) {
+ return new HTMLStripCharFilter( input );
+ }
+
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripReader.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripReader.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripReader.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,59 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharReader;
+
+/**
+ * A Reader that wraps another reader and attempts to strip out HTML constructs.
+ *
+ * @version $Id: HTMLStripReader.java 802263 2009-08-07 23:05:05Z koji $
+ * @deprecated Use {@link HTMLStripCharFilter}
+ */
+@Deprecated
+public class HTMLStripReader extends HTMLStripCharFilter {
+
+ public static void main(String[] args) throws IOException {
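+ // simple manual test: strips HTML read from stdin and echoes the result,
+ // e.g. java org.apache.solr.analysis.HTMLStripReader < page.html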
+ Reader in = new HTMLStripReader(
+ new InputStreamReader( System.in )
+ );
+ int ch;
+ while ( ( ch = in.read() ) != -1 ) {
+ System.out.print( ( char ) ch );
+ }
+ }
+
+ public HTMLStripReader(Reader source) {
+ super( CharReader.get( source.markSupported() ? source : new BufferedReader( source ) ) );
+ }
+
+ public HTMLStripReader(Reader source, Set<String> escapedTags) {
+ super( CharReader.get( source ), escapedTags );
+ }
+
+ public HTMLStripReader(Reader source, Set<String> escapedTags, int readAheadLimit) {
+ super( CharReader.get( source ), escapedTags, readAheadLimit );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HTMLStripWhitespaceTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * @version $Id: HTMLStripWhitespaceTokenizerFactory.java 807338 2009-08-24 18:58:22Z ryan $
+ * @deprecated Use {@link HTMLStripCharFilterFactory} and {@link WhitespaceTokenizerFactory}
+ */
+@Deprecated
+public class HTMLStripWhitespaceTokenizerFactory extends BaseTokenizerFactory {
+ public Tokenizer create(Reader input) {
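+ // wrap the incoming Reader so HTML is stripped before whitespace
+ // tokenization; the reset override below re-wraps the new Reader the same way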
+ return new WhitespaceTokenizer( new HTMLStripReader( input ) ) {
+ @Override
+ public void reset(Reader input) throws IOException {
+ super.reset( new HTMLStripReader( input ) );
+ }
+ };
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,144 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * When the plain text is extracted from documents, we will often have many words hyphenated and broken into two lines.
+ * This is often the case with documents where narrow text columns are used, such as newsletters.
+ * In order to increase search efficiency, this filter puts hyphenated words broken into two lines back together.
+ * This filter should be used at indexing time only.
+ * Example field definition in schema.xml:
+ * <pre>
+ * <fieldtype name="text" class="solr.TextField"
positionIncrementGap="100">
+ * <analyzer type="index">
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.SynonymFilterFactory"
synonyms="index_synonyms.txt" ignoreCase="true"
expand="false"/>
+ * <filter class="solr.StopFilterFactory"
ignoreCase="true"/>
+ * <filter class="solr.HyphenatedWordsFilterFactory"/>
+ * <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1"
catenateWords="1" catenateNumbers="1"
catenateAll="0"/>
+ * <filter class="solr.LowerCaseFilterFactory"/>
+ * <filter
class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ * </analyzer>
+ * <analyzer type="query">
+ * <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ * <filter class="solr.SynonymFilterFactory"
synonyms="synonyms.txt" ignoreCase="true"
expand="true"/>
+ * <filter class="solr.StopFilterFactory"
ignoreCase="true"/>
+ * <filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1"
catenateWords="0" catenateNumbers="0"
catenateAll="0"/>
+ * <filter class="solr.LowerCaseFilterFactory"/>
+ * <filter
class="solr.RemoveDuplicatesTokenFilterFactory"/>
+ * </analyzer>
+ * </fieldtype>
+ * </pre>
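+ * <p>
+ * For example (an illustrative input, not from the original docs), the token
+ * sequence "hyphen-" "ated" is emitted as the single token "hyphenated".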
+ */
+public final class HyphenatedWordsFilter extends TokenFilter {
+
+ private final TermAttribute termAttribute = addAttribute( TermAttribute.class );
+ private final OffsetAttribute offsetAttribute = addAttribute( OffsetAttribute.class );
+
+ private final StringBuilder hyphenated = new StringBuilder();
+ private State savedState;
+
+ /**
+ * Creates a new HyphenatedWordsFilter
+ *
+ * @param in TokenStream that will be filtered
+ */
+ public HyphenatedWordsFilter(TokenStream in) {
+ super( in );
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public boolean incrementToken() throws IOException {
+ while ( input.incrementToken() ) {
+ char[] term = termAttribute.termBuffer();
+ int termLength = termAttribute.termLength();
+
+ if ( termLength > 0 && term[termLength - 1] == '-' ) {
+ // a hyphenated word
+ // capture the state of the first token only
+ if ( savedState == null ) {
+ savedState = captureState();
+ }
+ hyphenated.append( term, 0, termLength - 1 );
+ }
+ else if ( savedState == null ) {
+ // not part of a hyphenated word.
+ return true;
+ }
+ else {
+ // the final portion of a hyphenated word
+ hyphenated.append( term, 0, termLength );
+ unhyphenate();
+ return true;
+ }
+ }
+
+ if ( savedState != null ) {
+ // the final term ends with a hyphen
+ // add back the hyphen, for backwards compatibility.
+ hyphenated.append( '-' );
+ unhyphenate();
+ return true;
+ }
+
+ return false;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ hyphenated.setLength( 0 );
+ savedState = null;
+ }
+
+ // ================================================= Helper Methods ================================================
+
+ /**
+ * Writes the joined unhyphenated term
+ */
+ private void unhyphenate() {
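+ // capture the end offset of the final fragment before restoring the first
+ // fragment's state, so the joined token spans both fragments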
+ int endOffset = offsetAttribute.endOffset();
+
+ restoreState( savedState );
+ savedState = null;
+
+ char term[] = termAttribute.termBuffer();
+ int length = hyphenated.length();
+ if ( length > termAttribute.termLength() ) {
+ term = termAttribute.resizeTermBuffer( length );
+ }
+
+ hyphenated.getChars( 0, length, term, 0 );
+ termAttribute.setTermLength( length );
+ offsetAttribute.setOffset( offsetAttribute.startOffset(), endOffset );
+ hyphenated.setLength( 0 );
+ }
+}
Property changes on: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilter.java
___________________________________________________________________
Name: svn:executable
+ *
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for HyphenatedWordsFilter
+ */
+public class HyphenatedWordsFilterFactory extends BaseTokenFilterFactory {
+ public HyphenatedWordsFilter create(TokenStream input) {
+ return new HyphenatedWordsFilter( input );
+ }
+}
Property changes on: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java
___________________________________________________________________
Name: svn:executable
+ *
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ISOLatin1AccentFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ISOLatin1AccentFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ISOLatin1AccentFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.ISOLatin1AccentFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for ISOLatin1AccentFilter
+ * $Id: ISOLatin1AccentFilterFactory.java 591158 2007-11-01 22:37:42Z hossman $
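+ * <p>
+ * A typical schema.xml usage sketch (illustrative, not part of this commit):
+ * <pre>
+ *   <filter class="solr.ISOLatin1AccentFilterFactory"/>
+ * </pre>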
+ */
+public class ISOLatin1AccentFilterFactory extends BaseTokenFilterFactory {
+ public ISOLatin1AccentFilter create(TokenStream input) {
+ return new ISOLatin1AccentFilter( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeepWordFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeepWordFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeepWordFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,65 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A TokenFilter that only keeps tokens with text contained in the
+ * required words. This filter behaves like the inverse of StopFilter.
+ *
+ * @version $Id: KeepWordFilter.java 940806 2010-05-04 11:18:46Z uschindler $
+ * @since solr 1.3
+ */
+public final class KeepWordFilter extends TokenFilter {
+ private final CharArraySet words;
+ private final TermAttribute termAtt = addAttribute( TermAttribute.class );
+
+ /**
+ * @deprecated Use {@link #KeepWordFilter(TokenStream, Set, boolean)} instead
+ */
+ @Deprecated
+ public KeepWordFilter(TokenStream in, Set<String> words, boolean ignoreCase) {
+ this( in, new CharArraySet( words, ignoreCase ) );
+ }
+
+ /**
+ * The words set passed to this constructor will be directly used by this filter
+	 * and should not be modified.
+ */
+ public KeepWordFilter(TokenStream in, CharArraySet words) {
+ super( in );
+ this.words = words;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ while ( input.incrementToken() ) {
+ if ( words.contains( termAtt.termBuffer(), 0, termAtt.termLength() ) ) {
+ return true;
+ }
+ }
+ return false;
+ }
+}
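
For illustration (not part of this commit): a minimal sketch of driving KeepWordFilter directly, assuming an era-appropriate WhitespaceTokenizer and the cast-free attribute API used above; the demo class name is hypothetical.

	import java.io.StringReader;
	import java.util.Arrays;

	import org.apache.lucene.analysis.CharArraySet;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.WhitespaceTokenizer;
	import org.apache.lucene.analysis.tokenattributes.TermAttribute;
	import org.apache.solr.analysis.KeepWordFilter;

	public class KeepWordFilterDemo {
		public static void main(String[] args) throws Exception {
			// keep only "lucene" and "solr"; ignoreCase=true makes the lookup case-insensitive
			CharArraySet keep = new CharArraySet( Arrays.asList( "lucene", "solr" ), true );
			TokenStream ts = new KeepWordFilter(
					new WhitespaceTokenizer( new StringReader( "Lucene indexes and Solr searches" ) ), keep );
			TermAttribute term = ts.addAttribute( TermAttribute.class );
			while ( ts.incrementToken() ) {
				System.out.println( term.term() ); // prints "Lucene" then "Solr"
			}
		}
	}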
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeepWordFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeepWordFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeepWordFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * @version $Id: KeepWordFilterFactory.java 929782 2010-04-01 02:15:27Z rmuir $
+ * @since solr 1.3
+ */
+public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+
+ private CharArraySet words;
+ private boolean ignoreCase;
+
+ public void inform(ResourceLoader loader) {
+ String wordFiles = args.get( "words" );
+ ignoreCase = getBoolean( "ignoreCase", false );
+ if ( wordFiles != null ) {
+ try {
+ words = getWordSet( loader, wordFiles, ignoreCase );
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ }
+
+ /**
+ * Set the keep word list.
+ * NOTE: if ignoreCase==true, the words are expected to be lowercase
+ */
+ public void setWords(Set<String> words) {
+ this.words = new CharArraySet( words, ignoreCase );
+ }
+
+ public void setIgnoreCase(boolean ignoreCase) {
+ if ( words != null && this.ignoreCase != ignoreCase ) {
+ words = new CharArraySet( words, ignoreCase );
+ }
+ this.ignoreCase = ignoreCase;
+ }
+
+ public KeepWordFilter create(TokenStream input) {
+ return new KeepWordFilter( input, words );
+ }
+
+ public CharArraySet getWords() {
+ return words;
+ }
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+}
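
A sketch of configuring the factory programmatically through the setters above, bypassing the ResourceLoader-driven "words" file; the empty args map and the demo class are assumptions, not part of this commit:

	import java.io.StringReader;
	import java.util.Arrays;
	import java.util.HashMap;
	import java.util.HashSet;

	import org.apache.lucene.analysis.WhitespaceTokenizer;
	import org.apache.solr.analysis.KeepWordFilterFactory;

	public class KeepWordFilterFactoryDemo {
		public static void main(String[] args) {
			KeepWordFilterFactory factory = new KeepWordFilterFactory();
			factory.init( new HashMap<String, String>() ); // no "words" resource configured
			factory.setIgnoreCase( true );
			// per the setWords javadoc: with ignoreCase==true the words should be lowercase
			factory.setWords( new HashSet<String>( Arrays.asList( "lucene", "solr" ) ) );
			factory.create( new WhitespaceTokenizer( new StringReader( "solr in action" ) ) );
		}
	}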
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeywordTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeywordTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/KeywordTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.KeywordTokenizer;
+
+/**
+ * @version $Id: LowerCaseTokenizerFactory.java 382610 2006-03-03 01:43:03Z yonik $
+ */
+public class KeywordTokenizerFactory extends BaseTokenizerFactory {
+ public KeywordTokenizer create(Reader input) {
+ return new KeywordTokenizer( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LengthFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LengthFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LengthFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,43 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.LengthFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * @version $Id: LengthFilterFactory.java 643465 2008-04-01 16:10:19Z gsingers $
+ */
+public class LengthFilterFactory extends BaseTokenFilterFactory {
+ int min, max;
+ public static final String MIN_KEY = "min";
+ public static final String MAX_KEY = "max";
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ min = Integer.parseInt( args.get( MIN_KEY ) );
+ max = Integer.parseInt( args.get( MAX_KEY ) );
+ }
+
+ public LengthFilter create(TokenStream input) {
+ return new LengthFilter( input, min, max );
+ }
+}
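
Since init parses both args unconditionally, "min" and "max" are effectively mandatory here. A minimal configuration sketch (not part of this commit; the demo class name is hypothetical):

	import java.io.StringReader;
	import java.util.HashMap;
	import java.util.Map;

	import org.apache.lucene.analysis.WhitespaceTokenizer;
	import org.apache.solr.analysis.LengthFilterFactory;

	public class LengthFilterFactoryDemo {
		public static void main(String[] args) {
			Map<String, String> conf = new HashMap<String, String>();
			conf.put( LengthFilterFactory.MIN_KEY, "3" ); // keep tokens of 3 to 5 chars
			conf.put( LengthFilterFactory.MAX_KEY, "5" );
			LengthFilterFactory factory = new LengthFilterFactory();
			factory.init( conf );
			// "to" (2 chars) and "tokenizer" (9 chars) are dropped; "token" survives
			factory.create( new WhitespaceTokenizer( new StringReader( "to token tokenizer" ) ) );
		}
	}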
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LetterTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LetterTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LetterTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.LetterTokenizer;
+
+/**
+ * @version $Id: LetterTokenizerFactory.java 591158 2007-11-01 22:37:42Z hossman $
+ */
+public class LetterTokenizerFactory extends BaseTokenizerFactory {
+ public LetterTokenizer create(Reader input) {
+ return new LetterTokenizer( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LowerCaseFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LowerCaseFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LowerCaseFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * @version $Id: LowerCaseFilterFactory.java 591158 2007-11-01 22:37:42Z hossman $
+ */
+public class LowerCaseFilterFactory extends BaseTokenFilterFactory {
+ public LowerCaseFilter create(TokenStream input) {
+ return new LowerCaseFilter( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+
+/**
+ * @version $Id: LowerCaseTokenizerFactory.java 591158 2007-11-01 22:37:42Z hossman $
+ */
+public class LowerCaseTokenizerFactory extends BaseTokenizerFactory {
+ public LowerCaseTokenizer create(Reader input) {
+ return new LowerCaseTokenizer( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/MappingCharFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/MappingCharFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/MappingCharFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MappingCharFilter;
+import org.apache.lucene.analysis.NormalizeCharMap;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * @version $Id: MappingCharFilterFactory.java 794328 2009-07-15 17:21:04Z shalin $
+ * @since Solr 1.4
+ */
+public class MappingCharFilterFactory extends BaseCharFilterFactory implements
+ ResourceLoaderAware {
+
+ protected NormalizeCharMap normMap;
+ private String mapping;
+
+ public void inform(ResourceLoader loader) {
+ mapping = args.get( "mapping" );
+
+ if ( mapping != null ) {
+ List<String> wlist = null;
+ try {
+ File mappingFile = new File( mapping );
+ if ( mappingFile.exists() ) {
+ wlist = loader.getLines( mapping );
+ }
+ else {
+ List<String> files = StrUtils.splitFileNames( mapping );
+ wlist = new ArrayList<String>();
+ for ( String file : files ) {
+ List<String> lines = loader.getLines( file.trim() );
+ wlist.addAll( lines );
+ }
+ }
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ normMap = new NormalizeCharMap();
+ parseRules( wlist, normMap );
+ }
+ }
+
+ public CharStream create(CharStream input) {
+ return new MappingCharFilter( normMap, input );
+ }
+
+ // "source" => "target"
+	static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
+
+ protected void parseRules(List<String> rules, NormalizeCharMap normMap) {
+ for ( String rule : rules ) {
+ Matcher m = p.matcher( rule );
+ if ( !m.find() ) {
+				throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "], file = " + mapping );
+ }
+ normMap.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) );
+ }
+ }
+
+ char[] out = new char[256];
+
+ protected String parseString(String s) {
+ int readPos = 0;
+ int len = s.length();
+ int writePos = 0;
+ while ( readPos < len ) {
+ char c = s.charAt( readPos++ );
+ if ( c == '\\' ) {
+ if ( readPos >= len ) {
+					throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
+ }
+ c = s.charAt( readPos++ );
+ switch ( c ) {
+ case '\\':
+ c = '\\';
+ break;
+ case '"':
+ c = '"';
+ break;
+ case 'n':
+ c = '\n';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 'b':
+ c = '\b';
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 'u':
+ if ( readPos + 3 >= len ) {
+							throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
+ }
+ c = ( char ) Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
+ readPos += 4;
+ break;
+ }
+ }
+ out[writePos++] = c;
+ }
+ return new String( out, 0, writePos );
+ }
+}
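
Each line of the "mapping" resource must match the pattern above, e.g. "é" => "e". For illustration (not part of this commit), a sketch that skips the ResourceLoader and feeds a NormalizeCharMap straight to MappingCharFilter; CharReader.get is assumed as the era-appropriate way to wrap a Reader as a CharStream, and the demo class name is hypothetical:

	import java.io.StringReader;

	import org.apache.lucene.analysis.CharReader;
	import org.apache.lucene.analysis.CharStream;
	import org.apache.lucene.analysis.MappingCharFilter;
	import org.apache.lucene.analysis.NormalizeCharMap;

	public class MappingCharFilterDemo {
		public static void main(String[] args) throws Exception {
			// equivalent to a mapping file containing:  "é" => "e"
			NormalizeCharMap map = new NormalizeCharMap();
			map.add( "é", "e" );
			CharStream cs = new MappingCharFilter( map, CharReader.get( new StringReader( "café" ) ) );
			int c;
			while ( ( c = cs.read() ) != -1 ) {
				System.out.print( ( char ) c ); // prints "cafe"
			}
		}
	}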
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,51 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+
+/**
+ * Creates new instances of {@link NGramTokenFilter}.
+ */
+public class NGramFilterFactory extends BaseTokenFilterFactory {
+ private int maxGramSize = 0;
+
+ private int minGramSize = 0;
+
+ /**
+	 * Initialize the n-gram min and max sizes and the side from which one should start tokenizing.
+ */
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ String maxArg = args.get( "maxGramSize" );
+ maxGramSize = ( maxArg != null ? Integer.parseInt( maxArg )
+ : NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE );
+
+ String minArg = args.get( "minGramSize" );
+ minGramSize = ( minArg != null ? Integer.parseInt( minArg )
+ : NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE );
+ }
+
+ public NGramTokenFilter create(TokenStream input) {
+ return new NGramTokenFilter( input, minGramSize, maxGramSize );
+ }
+}
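
A configuration sketch for the two recognized args (not part of this commit; the demo class name is hypothetical). Omitting either arg falls back to the NGramTokenFilter defaults, as init shows:

	import java.io.StringReader;
	import java.util.HashMap;
	import java.util.Map;

	import org.apache.lucene.analysis.WhitespaceTokenizer;
	import org.apache.solr.analysis.NGramFilterFactory;

	public class NGramFilterFactoryDemo {
		public static void main(String[] args) {
			Map<String, String> conf = new HashMap<String, String>();
			conf.put( "minGramSize", "2" );
			conf.put( "maxGramSize", "3" );
			NGramFilterFactory factory = new NGramFilterFactory();
			factory.init( conf );
			// the token "solr" yields the grams: so, ol, lr, sol, olr
			factory.create( new WhitespaceTokenizer( new StringReader( "solr" ) ) );
		}
	}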
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,52 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.Reader;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.ngram.NGramTokenizer;
+
+/**
+ * Creates new instances of {@link NGramTokenizer}.
+ */
+public class NGramTokenizerFactory extends BaseTokenizerFactory {
+ private int maxGramSize = 0;
+ private int minGramSize = 0;
+
+ /**
+	 * Initializes the n-gram min and max sizes and the side from which one should start tokenizing.
+ */
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ String maxArg = args.get( "maxGramSize" );
+		maxGramSize = ( maxArg != null ? Integer.parseInt( maxArg ) : NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE );
+
+ String minArg = args.get( "minGramSize" );
+		minGramSize = ( minArg != null ? Integer.parseInt( minArg ) : NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE );
+ }
+
+ /**
+ * Creates the {@link TokenStream} of n-grams from the given {@link Reader}.
+ */
+ public NGramTokenizer create(Reader input) {
+ return new NGramTokenizer( input, minGramSize, maxGramSize );
+ }
+}
Property changes on: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NGramTokenizerFactory.java
___________________________________________________________________
Name: svn:executable
+ *
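
The tokenizer variant is configured the same way as the filter factory above but consumes the Reader itself; a minimal sketch (not part of this commit; the demo class name is hypothetical):

	import java.io.StringReader;
	import java.util.HashMap;
	import java.util.Map;

	import org.apache.solr.analysis.NGramTokenizerFactory;

	public class NGramTokenizerFactoryDemo {
		public static void main(String[] args) {
			Map<String, String> conf = new HashMap<String, String>();
			conf.put( "minGramSize", "1" );
			conf.put( "maxGramSize", "2" );
			NGramTokenizerFactory factory = new NGramTokenizerFactory();
			factory.init( conf );
			factory.create( new StringReader( "ab" ) ); // yields: a, b, ab
		}
	}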
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,40 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
+
+public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory {
+ private float payload;
+ private String typeMatch;
+
+ public void init(Map<String, String> args) {
+ super.init( args );
+ payload = Float.parseFloat( args.get( "payload" ) );
+ typeMatch = args.get( "typeMatch" );
+ }
+
+ public NumericPayloadTokenFilter create(TokenStream input) {
+ return new NumericPayloadTokenFilter( input, payload, typeMatch );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceCharFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceCharFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceCharFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,210 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.BaseCharFilter;
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ * CharFilter that uses a regular expression to select the target of the replacement string.
+ * The pattern match is performed on each "block" of the char stream.
+ * <p/>
+ * <p>
+ * ex1) source="aa bb aa bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"<br/>
+ * output="aa#bb aa#bb"
+ * </p>
+ * <p/>
+ * NOTE: If the replacement produces text whose length differs from the source string,
+ * and the field is used for highlighting a term within that text, the highlight
+ * offsets may be misaligned.
+ * <p/>
+ * <p>
+ * ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1 $2"<br/>
+ * output="aa bb"<br/>
+ * and if you search for bb and highlight it, you will get<br/>
+ * highlight snippet="aa1<em>23bb</em>"
+ * </p>
+ *
+ * @version $Id: PatternReplaceCharFilter.java 897357 2010-01-09 00:45:45Z koji $
+ * @since Solr 1.5
+ */
+public class PatternReplaceCharFilter extends BaseCharFilter {
+
+ private final Pattern pattern;
+ private final String replacement;
+ private final int maxBlockChars;
+ private final String blockDelimiters;
+ public static final int DEFAULT_MAX_BLOCK_CHARS = 10000;
+
+ private LinkedList<Character> buffer;
+ private int nextCharCounter;
+ private char[] blockBuffer;
+ private int blockBufferLength;
+ private String replaceBlockBuffer;
+ private int replaceBlockBufferOffset;
+
+ public PatternReplaceCharFilter(Pattern pattern, String replacement, CharStream in) {
+ this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, null, in );
+ }
+
+ public PatternReplaceCharFilter(Pattern pattern, String replacement,
+ int maxBlockChars, CharStream in) {
+ this( pattern, replacement, maxBlockChars, null, in );
+ }
+
+ public PatternReplaceCharFilter(Pattern pattern, String replacement,
+ String blockDelimiters, CharStream in) {
+ this( pattern, replacement, DEFAULT_MAX_BLOCK_CHARS, blockDelimiters, in );
+ }
+
+ public PatternReplaceCharFilter(Pattern pattern, String replacement,
+ int maxBlockChars, String blockDelimiters, CharStream in) {
+ super( in );
+ this.pattern = pattern;
+ this.replacement = replacement;
+ if ( maxBlockChars < 1 ) {
+			throw new IllegalArgumentException( "maxBlockChars should be greater than 0, but it is " + maxBlockChars );
+ }
+ this.maxBlockChars = maxBlockChars;
+ this.blockDelimiters = blockDelimiters;
+ blockBuffer = new char[maxBlockChars];
+ }
+
+ private boolean prepareReplaceBlock() throws IOException {
+ while ( true ) {
+			if ( replaceBlockBuffer != null && replaceBlockBuffer.length() > replaceBlockBufferOffset ) {
+ return true;
+ }
+ // prepare block buffer
+ blockBufferLength = 0;
+ while ( true ) {
+ int c = nextChar();
+ if ( c == -1 ) {
+ break;
+ }
+ blockBuffer[blockBufferLength++] = ( char ) c;
+ // end of block?
+ boolean foundDelimiter =
+ ( blockDelimiters != null ) &&
+ ( blockDelimiters.length() > 0 ) &&
+ blockDelimiters.indexOf( c ) >= 0;
+ if ( foundDelimiter ||
+ blockBufferLength >= maxBlockChars ) {
+ break;
+ }
+ }
+ // block buffer available?
+ if ( blockBufferLength == 0 ) {
+ return false;
+ }
+ replaceBlockBuffer = getReplaceBlock( blockBuffer, 0, blockBufferLength );
+ replaceBlockBufferOffset = 0;
+ }
+ }
+
+ public int read() throws IOException {
+ while ( prepareReplaceBlock() ) {
+ return replaceBlockBuffer.charAt( replaceBlockBufferOffset++ );
+ }
+ return -1;
+ }
+
+ public int read(char[] cbuf, int off, int len) throws IOException {
+ char[] tmp = new char[len];
+ int l = input.read( tmp, 0, len );
+ if ( l != -1 ) {
+ for ( int i = 0; i < l; i++ ) {
+ pushLastChar( tmp[i] );
+ }
+ }
+ l = 0;
+ for ( int i = off; i < off + len; i++ ) {
+ int c = read();
+ if ( c == -1 ) {
+ break;
+ }
+ cbuf[i] = ( char ) c;
+ l++;
+ }
+ return l == 0 ? -1 : l;
+ }
+
+ private int nextChar() throws IOException {
+ if ( buffer != null && !buffer.isEmpty() ) {
+ nextCharCounter++;
+ return buffer.removeFirst().charValue();
+ }
+ int c = input.read();
+ if ( c != -1 ) {
+ nextCharCounter++;
+ }
+ return c;
+ }
+
+ private void pushLastChar(int c) {
+ if ( buffer == null ) {
+ buffer = new LinkedList<Character>();
+ }
+ buffer.addLast( new Character( ( char ) c ) );
+ }
+
+ String getReplaceBlock(String block) {
+ char[] blockChars = block.toCharArray();
+ return getReplaceBlock( blockChars, 0, blockChars.length );
+ }
+
+ String getReplaceBlock(char block[], int offset, int length) {
+ StringBuffer replaceBlock = new StringBuffer();
+ String sourceBlock = new String( block, offset, length );
+ Matcher m = pattern.matcher( sourceBlock );
+ int lastMatchOffset = 0, lastDiff = 0;
+ while ( m.find() ) {
+ m.appendReplacement( replaceBlock, replacement );
+ // record cumulative diff for the offset correction
+			int diff = replaceBlock.length() - lastMatchOffset - lastDiff - ( m.end( 0 ) - lastMatchOffset );
+ if ( diff != 0 ) {
+ int prevCumulativeDiff = getLastCumulativeDiff();
+ if ( diff > 0 ) {
+ for ( int i = 0; i < diff; i++ ) {
+ addOffCorrectMap(
+ nextCharCounter - length + m.end( 0 ) + i - prevCumulativeDiff,
+ prevCumulativeDiff - 1 - i
+ );
+ }
+ }
+ else {
+ addOffCorrectMap(
+ nextCharCounter - length + m.end( 0 ) + diff - prevCumulativeDiff,
+ prevCumulativeDiff - diff
+ );
+ }
+ }
+ // save last offsets
+ lastMatchOffset = m.end( 0 );
+ lastDiff = diff;
+ }
+		// copy the remaining part of the source block
+ m.appendTail( replaceBlock );
+ return replaceBlock.toString();
+ }
+}
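
For illustration (not part of this commit), ex1 from the javadoc above as a runnable sketch; the demo class name is hypothetical and CharReader.get is assumed from the Lucene version in use:

	import java.io.StringReader;
	import java.util.regex.Pattern;

	import org.apache.lucene.analysis.CharReader;
	import org.apache.solr.analysis.PatternReplaceCharFilter;

	public class PatternReplaceCharFilterDemo {
		public static void main(String[] args) throws Exception {
			PatternReplaceCharFilter cs = new PatternReplaceCharFilter(
					Pattern.compile( "(aa)\\s+(bb)" ), "$1#$2",
					CharReader.get( new StringReader( "aa bb aa bb" ) ) );
			int c;
			while ( ( c = cs.read() ) != -1 ) {
				System.out.print( ( char ) c ); // prints "aa#bb aa#bb"
			}
		}
	}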
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,60 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.lucene.analysis.CharStream;
+
+/**
+ * @version $Id: PatternReplaceCharFilterFactory.java 897357 2010-01-09 00:45:45Z koji $
+ * @since Solr 1.5
+ */
+public class PatternReplaceCharFilterFactory extends BaseCharFilterFactory {
+
+ private Pattern p;
+ private String replacement;
+ private int maxBlockChars;
+ private String blockDelimiters;
+
+ public void init(Map<String, String> args) {
+ super.init( args );
+ try {
+ p = Pattern.compile( args.get( "pattern" ) );
+ }
+ catch ( PatternSyntaxException e ) {
+ throw new RuntimeException
+ (
+ "Configuration Error: 'pattern' can not be parsed in " +
+ this.getClass().getName(), e
+ );
+ }
+ replacement = args.get( "replacement" );
+ if ( replacement == null ) {
+ replacement = "";
+ }
+		maxBlockChars = getInt( "maxBlockChars", PatternReplaceCharFilter.DEFAULT_MAX_BLOCK_CHARS );
+ blockDelimiters = args.get( "blockDelimiters" );
+ }
+
+ public CharStream create(CharStream input) {
+		return new PatternReplaceCharFilter( p, replacement, maxBlockChars, blockDelimiters, input );
+ }
+}
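
The factory form of the same ex1 configuration; "maxBlockChars" and "blockDelimiters" are optional, as init above shows (sketch not part of this commit; the demo class name is hypothetical):

	import java.io.StringReader;
	import java.util.HashMap;
	import java.util.Map;

	import org.apache.lucene.analysis.CharReader;
	import org.apache.solr.analysis.PatternReplaceCharFilterFactory;

	public class PatternReplaceCharFilterFactoryDemo {
		public static void main(String[] args) {
			Map<String, String> conf = new HashMap<String, String>();
			conf.put( "pattern", "(aa)\\s+(bb)" );
			conf.put( "replacement", "$1#$2" );
			PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
			factory.init( conf );
			factory.create( CharReader.get( new StringReader( "aa bb" ) ) ); // reads back "aa#bb"
		}
	}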
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.nio.CharBuffer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * A TokenFilter which applies a Pattern to each token in the stream,
+ * replacing match occurrences with the specified replacement string.
+ * <p/>
+ * <p>
+ * <b>Note:</b> Depending on the pattern used and the input TokenStream, this
+ * TokenFilter may produce Tokens whose text is the empty string.
+ * </p>
+ *
+ * @version $Id:$
+ * @see Pattern
+ */
+public final class PatternReplaceFilter extends TokenFilter {
+ private final Pattern p;
+ private final String replacement;
+ private final boolean all;
+ private final TermAttribute termAtt;
+
+ /**
+	 * Constructs an instance to replace either the first or all occurrences.
+ *
+ * @param in the TokenStream to process
+	 * @param p the pattern to apply to each Token
+ * @param replacement the "replacement string" to substitute, if null a
+ * blank string will be used. Note that this is not the literal
+ * string that will be used, '$' and '\' have special meaning.
+ * @param all if true, all matches will be replaced otherwise just the first match.
+ *
+ * @see Matcher#quoteReplacement
+ */
+ public PatternReplaceFilter(TokenStream in,
+ Pattern p,
+ String replacement,
+ boolean all) {
+ super( in );
+ this.p = p;
+ this.replacement = ( null == replacement ) ? "" : replacement;
+ this.all = all;
+ this.termAtt = ( TermAttribute ) addAttribute( TermAttribute.class );
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ CharSequence text = CharBuffer.wrap( termAtt.termBuffer(), 0, termAtt.termLength() );
+ Matcher m = p.matcher( text );
+
+ if ( all ) {
+ termAtt.setTermBuffer( m.replaceAll( replacement ) );
+ }
+ else {
+ termAtt.setTermBuffer( m.replaceFirst( replacement ) );
+ }
+
+ return true;
+ }
+
+}
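
A minimal sketch (not part of this commit; the demo class name is hypothetical) showing the all=true case, which rewrites every match within each token:

	import java.io.StringReader;
	import java.util.regex.Pattern;

	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.WhitespaceTokenizer;
	import org.apache.solr.analysis.PatternReplaceFilter;

	public class PatternReplaceFilterDemo {
		public static void main(String[] args) throws Exception {
			// the token "cat-cat" becomes "dog-dog"; with all=false only the first match changes
			TokenStream ts = new PatternReplaceFilter(
					new WhitespaceTokenizer( new StringReader( "cat-cat" ) ),
					Pattern.compile( "cat" ), "dog", true );
			while ( ts.incrementToken() ) {
				// consume the rewritten tokens
			}
		}
	}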
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * @version $Id:$
+ * @see PatternReplaceFilter
+ */
+public class PatternReplaceFilterFactory extends BaseTokenFilterFactory {
+ Pattern p;
+ String replacement;
+ boolean all = true;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ try {
+ p = Pattern.compile( args.get( "pattern" ) );
+ }
+ catch ( PatternSyntaxException e ) {
+ throw new RuntimeException
+ (
+ "Configuration Error: 'pattern' can not be parsed in " +
+ this.getClass().getName(), e
+ );
+ }
+
+ replacement = args.get( "replacement" );
+
+ String r = args.get( "replace" );
+ if ( null != r ) {
+ if ( r.equals( "all" ) ) {
+ all = true;
+ }
+ else {
+ if ( r.equals( "first" ) ) {
+ all = false;
+ }
+ else {
+ throw new RuntimeException
+ (
+									"Configuration Error: 'replace' must be 'first' or 'all' in "
+ + this.getClass().getName()
+ );
+ }
+ }
+ }
+
+ }
+
+ public PatternReplaceFilter create(TokenStream input) {
+ return new PatternReplaceFilter( input, p, replacement, all );
+ }
+}
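
The corresponding factory configuration; "replace" accepts only "first" or "all" (the default), per the init logic above (sketch not part of this commit; the demo class name is hypothetical):

	import java.io.StringReader;
	import java.util.HashMap;
	import java.util.Map;

	import org.apache.lucene.analysis.WhitespaceTokenizer;
	import org.apache.solr.analysis.PatternReplaceFilterFactory;

	public class PatternReplaceFilterFactoryDemo {
		public static void main(String[] args) {
			Map<String, String> conf = new HashMap<String, String>();
			conf.put( "pattern", "cat" );
			conf.put( "replacement", "dog" );
			conf.put( "replace", "first" ); // the token "cat-cat" becomes "dog-cat"
			PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory();
			factory.init( conf );
			factory.create( new WhitespaceTokenizer( new StringReader( "cat-cat" ) ) );
		}
	}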
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternTokenizer.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternTokenizer.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternTokenizer.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,147 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * This tokenizer uses regex pattern matching to construct distinct tokens
+ * for the input stream. It takes two arguments: "pattern" and "group".
+ * <p/>
+ * <ul>
+ * <li>"pattern" is the regular expression.</li>
+ * <li>"group" says which group to extract into tokens.</li>
+ * </ul>
+ * <p>
+ * group=-1 (the default) is equivalent to "split". In this case, the tokens will
+ * be equivalent to the output from (without empty tokens):
+ * {@link String#split(java.lang.String)}
+ * </p>
+ * <p>
+ * Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
+ * <pre>
+ * pattern = \'([^\']+)\'
+ * group = 0
+ * input = aaa 'bbb' 'ccc'
+ * </pre>
+ * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
+ * but using group=1, the output would be: bbb and ccc (no ' marks)
+ * </p>
+ * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
+ *
+ * @version $Id: PatternTokenizer.java 892217 2009-12-18 10:43:40Z shalin $
+ * @see Pattern
+ */
+public final class PatternTokenizer extends Tokenizer {
+
+	private final TermAttribute termAtt = ( TermAttribute ) addAttribute( TermAttribute.class );
+	private final OffsetAttribute offsetAtt = ( OffsetAttribute ) addAttribute( OffsetAttribute.class );
+
+ private String str;
+ private int index;
+
+ private final Pattern pattern;
+ private final int group;
+ private final Matcher matcher;
+
+ /**
+	 * creates a new PatternTokenizer returning tokens from group (-1 for split functionality)
+ */
+ public PatternTokenizer(Reader input, Pattern pattern, int group) throws IOException {
+ super( input );
+ this.pattern = pattern;
+ this.group = group;
+ str = IOUtils.toString( input );
+ matcher = pattern.matcher( str );
+ index = 0;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( index >= str.length() ) {
+ return false;
+ }
+ clearAttributes();
+ if ( group >= 0 ) {
+
+ // match a specific group
+ while ( matcher.find() ) {
+ final String match = matcher.group( group );
+ if ( match.length() == 0 ) {
+ continue;
+ }
+ termAtt.setTermBuffer( match );
+ index = matcher.start( group );
+				offsetAtt.setOffset( correctOffset( index ), correctOffset( matcher.end( group ) ) );
+ return true;
+ }
+
+ index = Integer.MAX_VALUE; // mark exhausted
+ return false;
+
+ }
+ else {
+
+ // String.split() functionality
+ while ( matcher.find() ) {
+ if ( matcher.start() - index > 0 ) {
+ // found a non-zero-length token
+ termAtt.setTermBuffer( str, index, matcher.start() - index );
+ offsetAtt.setOffset( correctOffset( index ), correctOffset( matcher.start() ) );
+ index = matcher.end();
+ return true;
+ }
+
+ index = matcher.end();
+ }
+
+ if ( str.length() - index == 0 ) {
+ index = Integer.MAX_VALUE; // mark exhausted
+ return false;
+ }
+
+ termAtt.setTermBuffer( str, index, str.length() - index );
+ offsetAtt.setOffset( correctOffset( index ), correctOffset( str.length() ) );
+ index = Integer.MAX_VALUE; // mark exhausted
+ return true;
+ }
+ }
+
+ @Override
+ public void end() throws IOException {
+ final int ofs = correctOffset( str.length() );
+ offsetAtt.setOffset( ofs, ofs );
+ }
+
+ @Override
+ public void reset(Reader input) throws IOException {
+ super.reset( input );
+ str = IOUtils.toString( input );
+ matcher.reset( str );
+ index = 0;
+ }
+
+}
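
The group=1 example from the javadoc above as a runnable sketch (not part of this commit; the demo class name is hypothetical):

	import java.io.StringReader;
	import java.util.regex.Pattern;

	import org.apache.lucene.analysis.tokenattributes.TermAttribute;
	import org.apache.solr.analysis.PatternTokenizer;

	public class PatternTokenizerDemo {
		public static void main(String[] args) throws Exception {
			// group=1 extracts the quoted text without the ' marks, as described above
			PatternTokenizer t = new PatternTokenizer(
					new StringReader( "aaa 'bbb' 'ccc'" ), Pattern.compile( "'([^']+)'" ), 1 );
			TermAttribute term = t.addAttribute( TermAttribute.class );
			while ( t.incrementToken() ) {
				System.out.println( term.term() ); // prints "bbb" then "ccc"
			}
		}
	}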
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PatternTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,168 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.solr.common.SolrException;
+
+
+/**
+ * This tokenizer uses regex pattern matching to construct distinct tokens
+ * for the input stream. It takes two arguments: "pattern" and "group".
+ * <p/>
+ * <ul>
+ * <li>"pattern" is the regular expression.</li>
+ * <li>"group" says which group to extract into tokens.</li>
+ * </ul>
+ * <p>
+ * group=-1 (the default) is equivalent to "split". In this case, the tokens will
+ * be equivalent to the output from (without empty tokens):
+ * {@link String#split(java.lang.String)}
+ * </p>
+ * <p>
+ * Using group >= 0 selects the matching group as the token. For example, if you have:<br/>
+ * <pre>
+ * pattern = \'([^\']+)\'
+ * group = 0
+ * input = aaa 'bbb' 'ccc'
+ * </pre>
+ * the output will be two tokens: 'bbb' and 'ccc' (including the ' marks). With the same input
+ * but using group=1, the output would be: bbb and ccc (no ' marks)
+ * </p>
+ * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
+ *
+ * @version $Id:$
+ * @see PatternTokenizer
+ * @since solr1.2
+ */
+public class PatternTokenizerFactory extends BaseTokenizerFactory {
+ public static final String PATTERN = "pattern";
+ public static final String GROUP = "group";
+
+ protected Map<String, String> args;
+ protected Pattern pattern;
+ protected int group;
+
+ /**
+ * Require a configured pattern
+ */
+ @Override
+ public void init(Map<String, String> args) {
+ this.args = args;
+ String regex = args.get( PATTERN );
+ if ( regex == null ) {
+			throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "missing required argument: " + PATTERN );
+ }
+ int flags = 0; // TODO? -- read flags from config CASE_INSENSITIVE, etc
+ pattern = Pattern.compile( regex, flags );
+
+ group = -1; // use 'split'
+ String g = args.get( GROUP );
+ if ( g != null ) {
+ try {
+ group = Integer.parseInt( g );
+ }
+ catch ( Exception ex ) {
+				throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, "invalid group argument: " + g );
+ }
+ }
+ }
+
+ /**
+ * Split the input using configured pattern
+ */
+ public Tokenizer create(final Reader in) {
+ try {
+ return new PatternTokenizer( in, pattern, group );
+ }
+ catch ( IOException ex ) {
+ throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, ex );
+ }
+ }
+
+ /**
+ * This behaves just like String.split( ), but returns a list of Tokens
+	 * rather than an array of strings.
+ * NOTE: This method is not used in 1.4.
+ *
+ * @deprecated
+ */
+ @Deprecated
+ public static List<Token> split(Matcher matcher, String input) {
+ int index = 0;
+ int lastNonEmptySize = Integer.MAX_VALUE;
+ ArrayList<Token> matchList = new ArrayList<Token>();
+
+ // Add segments before each match found
+ while ( matcher.find() ) {
+ String match = input.subSequence( index, matcher.start() ).toString();
+ matchList.add( new Token( match, index, matcher.start() ) );
+ index = matcher.end();
+ if ( match.length() > 0 ) {
+ lastNonEmptySize = matchList.size();
+ }
+ }
+
+ // If no match is found, return the full string
+ if ( index == 0 ) {
+ matchList.add( new Token( input, 0, input.length() ) );
+ }
+ else {
+ String match = input.subSequence( index, input.length() ).toString();
+ matchList.add( new Token( match, index, input.length() ) );
+ if ( match.length() > 0 ) {
+ lastNonEmptySize = matchList.size();
+ }
+ }
+
+ // Don't use trailing empty strings. This behavior matches String.split();
+ if ( lastNonEmptySize < matchList.size() ) {
+ return matchList.subList( 0, lastNonEmptySize );
+ }
+ return matchList;
+ }
+
+ /**
+ * Create tokens from the matches in a matcher
+ * NOTE: This method is not used in 1.4.
+ *
+ * @deprecated
+ */
+ @Deprecated
+ public static List<Token> group(Matcher matcher, String input, int group) {
+ ArrayList<Token> matchList = new ArrayList<Token>();
+ while ( matcher.find() ) {
+ Token t = new Token(
+ matcher.group( group ),
+ matcher.start( group ),
+ matcher.end( group )
+ );
+ matchList.add( t );
+ }
+ return matchList;
+ }
+}
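
A configuration sketch using the default group=-1 split semantics (not part of this commit; the demo class name is hypothetical):

	import java.io.StringReader;
	import java.util.HashMap;
	import java.util.Map;

	import org.apache.solr.analysis.PatternTokenizerFactory;

	public class PatternTokenizerFactoryDemo {
		public static void main(String[] args) {
			Map<String, String> conf = new HashMap<String, String>();
			conf.put( PatternTokenizerFactory.PATTERN, "," );
			// PatternTokenizerFactory.GROUP omitted: defaults to -1, i.e. String.split() behavior
			PatternTokenizerFactory factory = new PatternTokenizerFactory();
			factory.init( conf );
			factory.create( new StringReader( "a,b,c" ) ); // tokens: a, b, c
		}
	}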
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PersianNormalizationFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.fa.PersianNormalizationFilter;
+
+public class PersianNormalizationFilterFactory extends BaseTokenFilterFactory {
+ public PersianNormalizationFilter create(TokenStream input) {
+ return new PersianNormalizationFilter( input );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PhoneticFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PhoneticFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PhoneticFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.commons.codec.Encoder;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Create tokens for phonetic matches. See:
+ *
http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/co...
+ *
+ * @version $Id: PhoneticFilter.java 804726 2009-08-16 17:28:58Z yonik $
+ */
+public class PhoneticFilter extends TokenFilter {
+ protected boolean inject = true;
+ protected Encoder encoder = null;
+ protected String name = null;
+
+ protected State save = null;
+ private final TermAttribute termAtt;
+ private final PositionIncrementAttribute posAtt;
+
+ public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
+ super( in );
+ this.encoder = encoder;
+ this.name = name;
+ this.inject = inject;
+ this.termAtt = ( TermAttribute ) addAttribute( TermAttribute.class );
+		this.posAtt = ( PositionIncrementAttribute ) addAttribute( PositionIncrementAttribute.class );
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( save != null ) {
+ // clearAttributes(); // not currently necessary
+ restoreState( save );
+ save = null;
+ return true;
+ }
+
+ if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ // pass through zero-length terms
+ if ( termAtt.termLength() == 0 ) {
+ return true;
+ }
+
+ String value = termAtt.term();
+ String phonetic = null;
+ try {
+ String v = encoder.encode( value ).toString();
+ if ( v.length() > 0 && !value.equals( v ) ) {
+ phonetic = v;
+ }
+ }
+ catch ( Exception ignored ) {
+ } // just use the direct text
+
+ if ( phonetic == null ) {
+ return true;
+ }
+
+ if ( !inject ) {
+ // just modify this token
+ termAtt.setTermBuffer( phonetic );
+ return true;
+ }
+
+ // We need to return both the original and the phonetic tokens.
+		// to avoid an orig=captureState() change_to_phonetic() saved=captureState() restoreState(orig) sequence,
+ // we return the phonetic alternative first
+
+ int origOffset = posAtt.getPositionIncrement();
+ posAtt.setPositionIncrement( 0 );
+ save = captureState();
+
+ posAtt.setPositionIncrement( origOffset );
+ termAtt.setTermBuffer( phonetic );
+ return true;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ save = null;
+ }
+}
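
A direct-construction sketch (not part of this commit; the demo class name is hypothetical, commons-codec Metaphone assumed on the classpath). With inject=true the phonetic form is emitted first and the original token follows at position increment 0, matching the state-capture logic above:

	import java.io.StringReader;

	import org.apache.commons.codec.language.Metaphone;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.analysis.WhitespaceTokenizer;
	import org.apache.solr.analysis.PhoneticFilter;

	public class PhoneticFilterDemo {
		public static void main(String[] args) throws Exception {
			TokenStream ts = new PhoneticFilter(
					new WhitespaceTokenizer( new StringReader( "Smith" ) ),
					new Metaphone(), "phonetic", true );
			while ( ts.incrementToken() ) {
				// first the encoded form, then "Smith" at the same position
			}
		}
	}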
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PhoneticFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PhoneticFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PhoneticFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,103 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.lang.reflect.Method;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.commons.codec.Encoder;
+import org.apache.commons.codec.language.DoubleMetaphone;
+import org.apache.commons.codec.language.Metaphone;
+import org.apache.commons.codec.language.RefinedSoundex;
+import org.apache.commons.codec.language.Soundex;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.SolrException;
+
+/**
+ * Create tokens based on phonetic encoders
+ * <p/>
+ * http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/co...
+ * <p/>
+ * This takes two arguments:
+ * "encoder" required, one of "DoubleMetaphone",
"Metaphone", "Soundex", "RefinedSoundex"
+ * <p/>
+ * "inject" (default=true) add tokens to the stream with the offset=0
+ *
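+ * As an illustration (example values, not part of the original documentation), a
+ * filter entry in a Solr schema.xml analyzer chain might look like:
+ * <pre>
+ * &lt;filter class="solr.PhoneticFilterFactory" encoder="DoubleMetaphone" inject="true"/&gt;
+ * </pre>
+ *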
+ * @version $Id: PhoneticFilterFactory.java 764276 2009-04-12 02:24:01Z yonik $
+ * @see PhoneticFilter
+ */
+public class PhoneticFilterFactory extends BaseTokenFilterFactory {
+ public static final String ENCODER = "encoder";
+ public static final String INJECT = "inject"; // boolean
+
+ private static final Map<String, Class<? extends Encoder>> registry;
+
+ static {
+ registry = new HashMap<String, Class<? extends Encoder>>();
+ registry.put( "DoubleMetaphone".toUpperCase(), DoubleMetaphone.class );
+ registry.put( "Metaphone".toUpperCase(), Metaphone.class );
+ registry.put( "Soundex".toUpperCase(), Soundex.class );
+ registry.put( "RefinedSoundex".toUpperCase(), RefinedSoundex.class );
+ }
+
+ protected boolean inject = true;
+ protected String name = null;
+ protected Encoder encoder = null;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+
+ inject = getBoolean( INJECT, true );
+
+ name = args.get( ENCODER ); // assign the field directly; a local "String name" here would shadow it and leave create() passing null
+ if ( name == null ) {
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR, "Missing required parameter: " +
ENCODER
+ + " [" + registry.keySet() + "]"
+ );
+ }
+ Class<? extends Encoder> clazz = registry.get( name.toUpperCase() );
+ if ( clazz == null ) {
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR, "Unknown encoder: " + name + "
[" + registry.keySet() + "]"
+ );
+ }
+
+ try {
+ encoder = clazz.newInstance();
+
+ // Try to set the maxCodeLength
+ String v = args.get( "maxCodeLength" );
+ if ( v != null ) {
+ Method setter = encoder.getClass().getMethod( "setMaxCodeLen", int.class );
+ setter.invoke( encoder, Integer.parseInt( v ) );
+ }
+ }
+ catch ( Exception e ) {
+ throw new SolrException(
+ SolrException.ErrorCode.SERVER_ERROR, "Error initializing: " + name +
"/" + clazz, e, false
+ );
+ }
+ }
+
+ public PhoneticFilter create(TokenStream input) {
+ return new PhoneticFilter( input, encoder, name, inject );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PorterStemFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PorterStemFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PorterStemFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * @version $Id: PorterStemFilterFactory.java 591158 2007-11-01 22:37:42Z hossman $
+ */
+public class PorterStemFilterFactory extends BaseTokenFilterFactory {
+ public PorterStemFilter create(TokenStream input) {
+ return new PorterStemFilter( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PositionFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PositionFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/PositionFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.position.PositionFilter;
+
+/**
+ * Set the positionIncrement of all tokens to the "positionIncrement", except the first return token which retains its
+ * original positionIncrement value. The default positionIncrement value is zero.
+ *
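+ * For example (illustrative, with the default positionIncrement=0), input tokens
+ * "quick" "brown" "fox" with position increments 1/1/1 are emitted with
+ * increments 1/0/0, i.e. all at the same position.
+ *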
+ * @version $Id: PositionFilterFactory.java 739247 2009-01-30 11:38:23Z shalin $
+ * @see org.apache.lucene.analysis.position.PositionFilter
+ * @since solr 1.4
+ */
+public class PositionFilterFactory extends BaseTokenFilterFactory {
+ private int positionIncrement;
+
+ public void init(Map<String, String> args) {
+ super.init( args );
+ positionIncrement = getInt( "positionIncrement", 0 );
+ }
+
+ public PositionFilter create(TokenStream input) {
+ return new PositionFilter( input, positionIncrement );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.util.ArraysUtils;
+
+/**
+ * A TokenFilter which filters out Tokens at the same position and Term
+ * text as the previous token in the stream.
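+ * <p/>
+ * For example (illustrative), if an earlier synonym filter emits "fast" at
+ * position increment 1 followed by another "fast" at increment 0, the second
+ * "fast" is dropped as a duplicate at the same position.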
+ */
+public class RemoveDuplicatesTokenFilter extends BufferedTokenStream {
+ public RemoveDuplicatesTokenFilter(TokenStream input) {
+ super( input );
+ }
+
+ protected Token process(Token t) throws IOException {
+ Token tok = read();
+ while ( tok != null && tok.getPositionIncrement() == 0 ) {
+ if ( null != t ) {
+ write( t );
+ t = null;
+ }
+ boolean dup = false;
+ for ( Token outTok : output() ) {
+ int tokLen = tok.termLength();
+ if ( outTok.termLength() == tokLen && ArraysUtils.equals(
+ outTok.termBuffer(), 0, tok.termBuffer(), 0, tokLen
+ ) ) {
+ dup = true;
+ break; // duplicate found; no need to check the remaining output tokens
+ }
+ }
+ if ( !dup ) {
+ write( tok );
+ }
+ tok = read();
+ }
+ if ( tok != null ) {
+ pushBack( tok );
+ }
+ return t;
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RemoveDuplicatesTokenFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * @version $Id:$
+ */
+public class RemoveDuplicatesTokenFilterFactory extends BaseTokenFilterFactory {
+ public RemoveDuplicatesTokenFilter create(TokenStream input) {
+ return new RemoveDuplicatesTokenFilter( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReverseStringFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReverseStringFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReverseStringFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,34 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+
+/**
+ * A FilterFactory which reverses the input.
+ *
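+ * For example (illustrative), the token "apache" is emitted as "ehcapa".
+ *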
+ * @version $Id: ReverseStringFilterFactory.java 764291 2009-04-12 11:03:09Z shalin $
+ * @since solr 1.4
+ */
+public class ReverseStringFilterFactory extends BaseTokenFilterFactory {
+ public ReverseStringFilter create(TokenStream in) {
+ return new ReverseStringFilter( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReversedWildcardFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReversedWildcardFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReversedWildcardFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,159 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * This class produces a special form of reversed tokens, suitable for
+ * better handling of leading wildcards. Tokens from the input TokenStream
+ * are reversed and prepended with a special "reversed" marker character.
+ * If the <code>withOriginal</code> argument is <code>true</code>, the reversed
+ * token is returned first, followed by the original token at the same position
+ * (with <code>positionIncrement == 0</code>). Otherwise only reversed tokens
+ * are returned.
+ * <p>Note: this filter doubles the number of tokens in the input stream when
+ * <code>withOriginal == true</code>, which proportionally increases the size
+ * of postings and term dictionary in the index.
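+ * <p>For example (illustrative, with <code>withOriginal == true</code>), the token
+ * "solr" is emitted as its reversed form (the marker character followed by "rlos")
+ * first, with the original "solr" following at the same position.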
+ */
+public class ReversedWildcardFilter extends TokenFilter {
+
+ private boolean withOriginal;
+ private char markerChar;
+ private State save;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posAtt;
+
+ protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) {
+ super( input );
+ this.termAtt = ( TermAttribute ) addAttribute( TermAttribute.class );
+ this.posAtt = ( PositionIncrementAttribute ) addAttribute( PositionIncrementAttribute.class );
+ this.withOriginal = withOriginal;
+ this.markerChar = markerChar;
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( save != null ) {
+ // clearAttributes(); // not currently necessary
+ restoreState( save );
+ save = null;
+ return true;
+ }
+
+ if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ // pass through zero-length terms
+ int oldLen = termAtt.termLength();
+ if ( oldLen == 0 ) {
+ return true;
+ }
+ int origOffset = posAtt.getPositionIncrement();
+ if ( withOriginal ) {
+ posAtt.setPositionIncrement( 0 );
+ save = captureState();
+ }
+ char[] buffer = termAtt.resizeTermBuffer( oldLen + 1 );
+ buffer[oldLen] = markerChar;
+ reverse( buffer, 0, oldLen + 1 );
+
+ posAtt.setPositionIncrement( origOffset );
+ termAtt.setTermBuffer( buffer, 0, oldLen + 1 );
+ return true;
+ }
+
+
+ /**
+ * Partially reverses the given input buffer in-place from the given offset
+ * up to the given length, keeping surrogate pairs in the correct (non-reversed) order.
+ *
+ * @param buffer the input char array to reverse
+ * @param start the offset from where to reverse the buffer
+ * @param len the length in the buffer up to where the
+ * buffer should be reversed
+ */
+ public static void reverse(final char[] buffer, final int start, final int len) {
+ /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
+ if ( len < 2 ) {
+ return;
+ }
+ int end = ( start + len ) - 1;
+ char frontHigh = buffer[start];
+ char endLow = buffer[end];
+ boolean allowFrontSur = true, allowEndSur = true;
+ final int mid = start + ( len >> 1 );
+ for ( int i = start; i < mid; ++i, --end ) {
+ final char frontLow = buffer[i + 1];
+ final char endHigh = buffer[end - 1];
+ final boolean surAtFront = allowFrontSur
+ && Character.isSurrogatePair( frontHigh, frontLow );
+ if ( surAtFront && ( len < 3 ) ) {
+ // nothing to do since surAtFront is allowed and 1 char left
+ return;
+ }
+ final boolean surAtEnd = allowEndSur
+ && Character.isSurrogatePair( endHigh, endLow );
+ allowFrontSur = allowEndSur = true;
+ if ( surAtFront == surAtEnd ) {
+ if ( surAtFront ) {
+ // both surrogates
+ buffer[end] = frontLow;
+ buffer[--end] = frontHigh;
+ buffer[i] = endHigh;
+ buffer[++i] = endLow;
+ frontHigh = buffer[i + 1];
+ endLow = buffer[end - 1];
+ }
+ else {
+ // neither surrogates
+ buffer[end] = frontHigh;
+ buffer[i] = endLow;
+ frontHigh = frontLow;
+ endLow = endHigh;
+ }
+ }
+ else {
+ if ( surAtFront ) {
+ // surrogate only at the front
+ buffer[end] = frontLow;
+ buffer[i] = endLow;
+ endLow = endHigh;
+ allowFrontSur = false;
+ }
+ else {
+ // surrogate only at the end
+ buffer[end] = frontHigh;
+ buffer[i] = endHigh;
+ frontHigh = frontLow;
+ allowEndSur = false;
+ }
+ }
+ }
+ if ( ( len & 0x01 ) == 1 && !( allowFrontSur && allowEndSur ) ) {
+ // only if odd length
+ buffer[end] = allowFrontSur ? endLow : frontHigh;
+ }
+ }
+
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,137 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+
+/**
+ * Factory for {@link ReversedWildcardFilter}-s. When this factory is
+ * added to an analysis chain, it will be used both for filtering the
+ * tokens during indexing, and to determine the query processing of
+ * this field during search.
+ * <p>This class supports the following init arguments:
+ * <ul>
+ * <li><code>withOriginal</code> - if true, then produce both original and reversed tokens at
+ * the same positions. If false, then produce only reversed tokens.</li>
+ * <li><code>maxPosAsterisk</code> - maximum position (1-based) of the asterisk wildcard
+ * ('*') that triggers the reversal of the query term. An asterisk at a position
+ * higher than this value will not cause the reversal of the query term.
+ * Defaults to 2, meaning that asterisks on positions 1 and 2 will cause
+ * a reversal.</li>
+ * <li><code>maxPosQuestion</code> - maximum position (1-based) of the question
+ * mark wildcard ('?') that triggers the reversal of query term. Defaults to 1.
+ * Set this to 0, and <code>maxPosAsterisk</code> to 1 to reverse only
+ * pure suffix queries (i.e. ones with a single leading asterisk).</li>
+ * <li><code>maxFractionAsterisk</code> - additional parameter that
+ * triggers the reversal if asterisk ('*') position is less than this
+ * fraction of the query token length. Defaults to 0.0f (disabled).</li>
+ * <li><code>minTrailing</code> - minimum number of trailing characters in query
+ * token after the last wildcard character. For good performance this should be
+ * set to a value larger than 1. Defaults to 2.
+ * </ul>
+ * Note 1: This filter always reverses input tokens during indexing.
+ * Note 2: Query tokens without wildcard characters will never be reversed.
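+ * <p>An illustrative schema.xml entry (example values, not from the original
+ * documentation) might look like:
+ * <pre>
+ * &lt;filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+ *         maxPosAsterisk="2" maxPosQuestion="1" minTrailing="2" maxFractionAsterisk="0"/&gt;
+ * </pre>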
+ */
+public class ReversedWildcardFilterFactory extends BaseTokenFilterFactory {
+
+ private char markerChar = ReverseStringFilter.START_OF_HEADING_MARKER;
+ private boolean withOriginal;
+ private int maxPosAsterisk;
+ private int maxPosQuestion;
+ private int minTrailing;
+ private float maxFractionAsterisk;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ withOriginal = getBoolean( "withOriginal", true );
+ maxPosAsterisk = getInt( "maxPosAsterisk", 2 );
+ maxPosQuestion = getInt( "maxPosQuestion", 1 );
+ minTrailing = getInt( "minTrailing", 2 );
+ maxFractionAsterisk = getFloat( "maxFractionAsterisk", 0.0f );
+ }
+
+
+ public TokenStream create(TokenStream input) {
+ return new ReversedWildcardFilter( input, withOriginal, markerChar );
+ }
+
+ /**
+ * This method encapsulates the logic that determines whether
+ * a query token should be reversed in order to use the
+ * reversed terms in the index.
+ *
+ * @param token input token.
+ *
+ * @return true if input token should be reversed, false otherwise.
+ */
+ public boolean shouldReverse(String token) {
+ int posQ = token.indexOf( '?' );
+ int posA = token.indexOf( '*' );
+ if ( posQ == -1 && posA == -1 ) { // not a wildcard query
+ return false;
+ }
+ int pos;
+ int lastPos;
+ int len = token.length();
+ lastPos = token.lastIndexOf( '?' );
+ pos = token.lastIndexOf( '*' );
+ if ( pos > lastPos ) {
+ lastPos = pos;
+ }
+ if ( posQ != -1 ) {
+ pos = posQ;
+ if ( posA != -1 ) {
+ pos = Math.min( posQ, posA );
+ }
+ }
+ else {
+ pos = posA;
+ }
+ if ( len - lastPos < minTrailing ) { // too few trailing chars
+ return false;
+ }
+ if ( posQ != -1 && posQ < maxPosQuestion ) { // leading '?'
+ return true;
+ }
+ if ( posA != -1 && posA < maxPosAsterisk ) { // leading '*'
+ return true;
+ }
+ // '*' in the leading part
+ if ( maxFractionAsterisk > 0.0f && pos < ( float ) token.length() * maxFractionAsterisk ) {
+ return true;
+ }
+ return false;
+ }
+
+ public char getMarkerChar() {
+ return markerChar;
+ }
+
+ protected float getFloat(String name, float defValue) {
+ String val = args.get( name );
+ if ( val == null ) {
+ return defValue;
+ }
+ else {
+ return Float.parseFloat( val );
+ }
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianLetterTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+import java.util.Map;
+
+import org.apache.lucene.analysis.ru.RussianLetterTokenizer;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
+/**
+ * @deprecated Use {@link StandardTokenizerFactory} instead.
+ * This tokenizer has no Russian-specific functionality.
+ */
+@Deprecated
+public class RussianLetterTokenizerFactory extends BaseTokenizerFactory {
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ if ( args.containsKey( "charset" ) ) {
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR,
+ "The charset parameter is no longer supported. "
+ + "Please process your documents as Unicode instead."
+ );
+ }
+ }
+
+ public RussianLetterTokenizer create(Reader in) {
+ assureMatchVersion();
+ return new RussianLetterTokenizer( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianLowerCaseFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,52 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.LowerCaseFilter;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrException.ErrorCode;
+
+/**
+ * @deprecated Use {@link LowerCaseFilterFactory} instead which has the
+ * same functionality.
+ */
+@Deprecated
+public class RussianLowerCaseFilterFactory extends BaseTokenFilterFactory {
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ if ( args.containsKey( "charset" ) ) {
+ throw new SolrException(
+ ErrorCode.SERVER_ERROR,
+ "The charset parameter is no longer supported. "
+ + "Please process your documents as Unicode instead."
+ );
+ }
+ }
+
+ public TokenFilter create(TokenStream in) {
+ // hardcode the version to give exactly the old behavior
+ return new LowerCaseFilter( in );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianStemFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianStemFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/RussianStemFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.snowball.SnowballFilter;
+
+/**
+ * @deprecated Use {@link SnowballPorterFilterFactory} with "Russian" instead,
+ * which has the same functionality.
+ */
+@Deprecated
+public class RussianStemFilterFactory extends BaseTokenFilterFactory {
+
+ public TokenFilter create(TokenStream in) {
+ return new SnowballFilter( in, new org.tartarus.snowball.ext.RussianStemmer() );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ShingleFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ShingleFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ShingleFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.shingle.ShingleFilter;
+
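+/**
+ * Factory for {@link ShingleFilter}, which builds token n-grams ("shingles") from
+ * the input stream. For example (illustrative), with maxShingleSize=2 and
+ * outputUnigrams=true the input "please divide this" yields the tokens
+ * "please", "please divide", "divide", "divide this" and "this".
+ */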
+public class ShingleFilterFactory extends BaseTokenFilterFactory {
+ private int maxShingleSize;
+ private boolean outputUnigrams;
+
+ public void init(Map<String, String> args) {
+ super.init( args );
+ maxShingleSize = getInt(
+ "maxShingleSize",
+ ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE
+ );
+ outputUnigrams = getBoolean( "outputUnigrams", true );
+ }
+
+ public ShingleFilter create(TokenStream input) {
+ ShingleFilter r = new ShingleFilter( input, maxShingleSize );
+ r.setOutputUnigrams( outputUnigrams );
+ return r;
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,144 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.analysis;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+import org.tartarus.snowball.SnowballProgram;
+
+/**
+ * Factory for SnowballFilters, with configurable language
+ * <p/>
+ * Browsing the code, SnowballFilter uses reflection to adapt to Lucene... don't
+ * use this if you are concerned about speed. Use EnglishPorterFilterFactory.
+ *
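+ * An illustrative configuration (example values, not from the original
+ * documentation) might look like:
+ * <pre>
+ * &lt;filter class="solr.SnowballPorterFilterFactory" language="German" protected="protwords.txt"/&gt;
+ * </pre>
+ *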
+ * @version $Id: SnowballPorterFilterFactory.java 804726 2009-08-16 17:28:58Z yonik $
+ */
+public class SnowballPorterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ public static final String PROTECTED_TOKENS = "protected";
+
+ private String language = "English";
+ private Class stemClass;
+
+
+ public void inform(ResourceLoader loader) {
+ String wordFiles = args.get( PROTECTED_TOKENS );
+ if ( wordFiles != null ) {
+ try {
+ File protectedWordFiles = new File( wordFiles );
+ if ( protectedWordFiles.exists() ) {
+ List<String> wlist = loader.getLines( wordFiles );
+ //This cast is safe in Lucene
+ protectedWords = new CharArraySet(
+ wlist, false
+ ); // No need to go through StopFilter as before, since it just uses a List internally
+ }
+ else {
+ List<String> files = StrUtils.splitFileNames( wordFiles );
+ for ( String file : files ) {
+ List<String> wlist = loader.getLines( file.trim() );
+ if ( protectedWords == null ) {
+ protectedWords = new CharArraySet( wlist, false );
+ }
+ else {
+ protectedWords.addAll( wlist );
+ }
+ }
+ }
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ }
+
+ private CharArraySet protectedWords = null;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ final String cfgLanguage = args.get( "language" );
+ if ( cfgLanguage != null ) {
+ language = cfgLanguage;
+ }
+
+ try {
+ stemClass = Class.forName( "org.tartarus.snowball.ext." + language + "Stemmer" );
+ }
+ catch ( ClassNotFoundException e ) {
+ throw new RuntimeException( "Can't find class for stemmer language " +
language, e );
+ }
+ }
+
+ public SnowballPorterFilter create(TokenStream input) {
+ SnowballProgram program;
+ try {
+ program = ( SnowballProgram ) stemClass.newInstance();
+ }
+ catch ( Exception e ) {
+ throw new RuntimeException(
+ "Error instantiating stemmer for language " + language + "from class
" + stemClass, e
+ );
+ }
+ return new SnowballPorterFilter( input, program, protectedWords );
+ }
+}
+
+
+class SnowballPorterFilter extends TokenFilter {
+ private final CharArraySet protWords;
+ private final SnowballProgram stemmer;
+ private final TermAttribute termAtt;
+
+ public SnowballPorterFilter(TokenStream source, SnowballProgram stemmer, CharArraySet protWords) {
+ super( source );
+ this.protWords = protWords;
+ this.stemmer = stemmer;
+ this.termAtt = ( TermAttribute ) addAttribute( TermAttribute.class );
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ char[] termBuffer = termAtt.termBuffer();
+ int len = termAtt.termLength();
+ // if protected, don't stem. use this to avoid stemming collisions.
+ if ( protWords != null && protWords.contains( termBuffer, 0, len ) ) {
+ return true;
+ }
+
+ stemmer.setCurrent( new String( termBuffer, 0, len ) ); // ugh, wish the Stemmer took a char array
+ stemmer.stem();
+ String newstr = stemmer.getCurrent();
+ termAtt.setTermBuffer( newstr.toCharArray(), 0, newstr.length() );
+
+ return true;
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SolrAnalyzer.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SolrAnalyzer.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SolrAnalyzer.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,91 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * @version $Id: SolrAnalyzer.java 804726 2009-08-16 17:28:58Z yonik $
+ */
+public abstract class SolrAnalyzer extends Analyzer {
+ int posIncGap = 0;
+
+ public void setPositionIncrementGap(int gap) {
+ posIncGap = gap;
+ }
+
+ public int getPositionIncrementGap(String fieldName) {
+ return posIncGap;
+ }
+
+ /**
+ * wrap the reader in a CharStream, if appropriate
+ */
+ public Reader charStream(Reader reader) {
+ return reader;
+ }
+
+ @Override
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ return getStream( fieldName, reader ).getTokenStream();
+ }
+
+ public static class TokenStreamInfo {
+ private final Tokenizer tokenizer;
+ private final TokenStream tokenStream;
+
+ public TokenStreamInfo(Tokenizer tokenizer, TokenStream tokenStream) {
+ this.tokenizer = tokenizer;
+ this.tokenStream = tokenStream;
+ }
+
+ public Tokenizer getTokenizer() {
+ return tokenizer;
+ }
+
+ public TokenStream getTokenStream() {
+ return tokenStream;
+ }
+ }
+
+
+ public abstract TokenStreamInfo getStream(String fieldName, Reader reader);
+
+ @Override
+ public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
+ // if (true) return tokenStream(fieldName, reader);
+ TokenStreamInfo tsi = ( TokenStreamInfo ) getPreviousTokenStream();
+ if ( tsi != null ) {
+ tsi.getTokenizer().reset( charStream( reader ) );
+ // the consumer will currently call reset() on the TokenStream to hit all the filters.
+ // this isn't necessarily guaranteed by the APIs... but is currently done
+ // by lucene indexing in DocInverterPerField, and in the QueryParser
+ return tsi.getTokenStream();
+ }
+ else {
+ tsi = getStream( fieldName, reader );
+ setPreviousTokenStream( tsi );
+ return tsi.getTokenStream();
+ }
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StandardFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StandardFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StandardFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardFilter;
+
+/**
+ * @version $Id: StandardFilterFactory.java 591158 2007-11-01 22:37:42Z hossman $
+ */
+public class StandardFilterFactory extends BaseTokenFilterFactory {
+ public StandardFilter create(TokenStream input) {
+ return new StandardFilter( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StandardTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StandardTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StandardTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,39 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+import java.util.Map;
+
+import org.apache.lucene.analysis.standard.StandardTokenizer;
+
+/**
+ * @version $Id: StandardTokenizerFactory.java 929782 2010-04-01 02:15:27Z rmuir $
+ */
+
+public class StandardTokenizerFactory extends BaseTokenizerFactory {
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ assureMatchVersion();
+ }
+
+ public StandardTokenizer create(Reader input) {
+ return new StandardTokenizer( luceneMatchVersion, input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StopFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StopFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/StopFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,80 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
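+ * Factory for {@link StopFilter}. An illustrative configuration (example values,
+ * not from the original documentation) might look like:
+ * <pre>
+ * &lt;filter class="solr.StopFilterFactory" words="stopwords.txt" ignoreCase="true"
+ *         enablePositionIncrements="true"/&gt;
+ * </pre>
+ *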
+ * @version $Id: StopFilterFactory.java 929782 2010-04-01 02:15:27Z rmuir $
+ */
+public class StopFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ assureMatchVersion();
+ }
+
+ public void inform(ResourceLoader loader) {
+ String stopWordFiles = args.get( "words" );
+ ignoreCase = getBoolean( "ignoreCase", false );
+ enablePositionIncrements = getBoolean( "enablePositionIncrements", false );
+
+ if ( stopWordFiles != null ) {
+ try {
+ stopWords = getWordSet( loader, stopWordFiles, ignoreCase );
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ else {
+ stopWords = new CharArraySet( StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase );
+ }
+ }
+
+ private CharArraySet stopWords;
+ private boolean ignoreCase;
+ private boolean enablePositionIncrements;
+
+ public boolean isEnablePositionIncrements() {
+ return enablePositionIncrements;
+ }
+
+ public boolean isIgnoreCase() {
+ return ignoreCase;
+ }
+
+ public Set<?> getStopWords() {
+ return stopWords;
+ }
+
+ public StopFilter create(TokenStream input) {
+ StopFilter stopFilter = new StopFilter( enablePositionIncrements, input, stopWords, ignoreCase );
+ return stopFilter;
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,279 @@
+/*
+ * Hibernate, Relational Persistence for Idiomatic Java
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors
as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * SynonymFilter handles multi-token synonyms with variable position increment offsets.
+ * <p/>
+ * The matched tokens from the input stream may be optionally passed through (includeOrig=true)
+ * or discarded. If the original tokens are included, the position increments may be modified
+ * to retain absolute positions after merging with the synonym tokenstream.
+ * <p/>
+ * Generated synonyms will start at the same position as the first matched source token.
+ *
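+ * For example (illustrative), with a rule mapping "dns" to "domain name service"
+ * and includeOrig=false, the single input token "dns" is replaced by the three
+ * tokens "domain", "name" and "service", the first of which keeps the position
+ * increment of the matched token.
+ *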
+ * @version $Id: SynonymFilter.java 991055 2010-08-31 01:40:19Z rmuir $
+ */
+public final class SynonymFilter extends TokenFilter {
+
+ private final SynonymMap map; // Map<String, SynonymMap>
+ private Iterator<AttributeSource> replacement; // iterator over generated tokens
+
+ public SynonymFilter(TokenStream in, SynonymMap map) {
+ super( in );
+ this.map = map;
+ // just ensuring these attributes exist...
+ addAttribute( TermAttribute.class );
+ addAttribute( PositionIncrementAttribute.class );
+ addAttribute( OffsetAttribute.class );
+ addAttribute( TypeAttribute.class );
+ }
+
+
+ /*
+ * Need to worry about multiple scenarios:
+ * - need to go for the longest match
+ * a b => foo #shouldn't match if "a b" is followed by "c d"
+ * a b c d => bar
+ * - need to backtrack - retry matches for tokens already read
+ * a b c d => foo
+ * b c => bar
+ * If the input stream is "a b c x", one will consume "a b c d"
+ * trying to match the first rule... all but "a" should be
+ * pushed back so a match may be made on "b c".
+ * - don't try and match generated tokens (thus need separate queue)
+ * matching is not recursive.
+ * - handle optional generation of original tokens in all these cases,
+ * merging token streams to preserve token positions.
+ * - preserve original positionIncrement of first matched token
+ */
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ while ( true ) {
+ // if there are any generated tokens, return them... don't try any
+ // matches against them, as we specifically don't want recursion.
+ if ( replacement != null && replacement.hasNext() ) {
+ copy( this, replacement.next() );
+ return true;
+ }
+
+ // common case fast-path of first token not matching anything
+ AttributeSource firstTok = nextTok();
+ if ( firstTok == null ) {
+ return false;
+ }
+ TermAttribute termAtt = firstTok.addAttribute( TermAttribute.class );
+ SynonymMap result = map.submap != null ? map.submap
+ .get( termAtt.termBuffer(), 0, termAtt.termLength() ) : null;
+ if ( result == null ) {
+ copy( this, firstTok );
+ return true;
+ }
+
+ // fast-path failed, clone ourselves if needed
+ if ( firstTok == this ) {
+ firstTok = cloneAttributes();
+ }
+ // OK, we matched a token, so find the longest match.
+
+ matched = new LinkedList<AttributeSource>();
+
+ result = match( result );
+
+ if ( result == null ) {
+ // no match, simply return the first token read.
+ copy( this, firstTok );
+ return true;
+ }
+
+ // reuse, or create new one each time?
+ ArrayList<AttributeSource> generated = new ArrayList<AttributeSource>( result.synonyms.length + matched.size() + 1 );
+
+ //
+ // there was a match... let's generate the new tokens, merging
+ // in the matched tokens (position increments need adjusting)
+ //
+ AttributeSource lastTok = matched.isEmpty() ? firstTok : matched.getLast();
+ boolean includeOrig = result.includeOrig();
+
+ AttributeSource origTok = includeOrig ? firstTok : null;
+ PositionIncrementAttribute firstPosIncAtt = firstTok.addAttribute( PositionIncrementAttribute.class );
+ int origPos = firstPosIncAtt.getPositionIncrement(); // position of origTok in the original stream
+ int repPos = 0; // curr position in replacement token stream
+ int pos = 0; // current position in merged token stream
+
+ for ( int i = 0; i < result.synonyms.length; i++ ) {
+ Token repTok = result.synonyms[i];
+ AttributeSource newTok = firstTok.cloneAttributes();
+ TermAttribute newTermAtt = newTok.addAttribute( TermAttribute.class );
+ OffsetAttribute newOffsetAtt = newTok.addAttribute( OffsetAttribute.class );
+ PositionIncrementAttribute newPosIncAtt = newTok.addAttribute( PositionIncrementAttribute.class );
+
+ OffsetAttribute lastOffsetAtt = lastTok.addAttribute( OffsetAttribute.class );
+
+ newOffsetAtt.setOffset( newOffsetAtt.startOffset(), lastOffsetAtt.endOffset() );
+ newTermAtt.setTermBuffer( repTok.termBuffer(), 0, repTok.termLength() );
+ repPos += repTok.getPositionIncrement();
+ if ( i == 0 ) {
+ repPos = origPos;
+ } // make position of first token equal to original
+
+ // if necessary, insert original tokens and adjust position increment
+ while ( origTok != null && origPos <= repPos ) {
+ PositionIncrementAttribute origPosInc = origTok.addAttribute( PositionIncrementAttribute.class );
+ origPosInc.setPositionIncrement( origPos - pos );
+ generated.add( origTok );
+ pos += origPosInc.getPositionIncrement();
+ origTok = matched.isEmpty() ? null : matched.removeFirst();
+ if ( origTok != null ) {
+ origPosInc = origTok.addAttribute( PositionIncrementAttribute.class );
+ origPos += origPosInc.getPositionIncrement();
+ }
+ }
+
+ newPosIncAtt.setPositionIncrement( repPos - pos );
+ generated.add( newTok );
+ pos += newPosIncAtt.getPositionIncrement();
+ }
+
+ // finish up any leftover original tokens
+ while ( origTok != null ) {
+ PositionIncrementAttribute origPosInc = origTok.addAttribute( PositionIncrementAttribute.class );
+ origPosInc.setPositionIncrement( origPos - pos );
+ generated.add( origTok );
+ pos += origPosInc.getPositionIncrement();
+ origTok = matched.isEmpty() ? null : matched.removeFirst();
+ if ( origTok != null ) {
+ origPosInc = origTok.addAttribute( PositionIncrementAttribute.class );
+ origPos += origPosInc.getPositionIncrement();
+ }
+ }
+
+ // what if we replaced a longer sequence with a shorter one?
+ // a/0 b/5 => foo/0
+ // should I re-create the gap on the next buffered token?
+
+ replacement = generated.iterator();
+ // Now return to the top of the loop to read and return the first
+ // generated token. The reason this is done is that we may have generated
+ // nothing at all, and may need to continue with more matching logic.
+ }
+ }
+
+
+ //
+ // Defer creation of the buffer until the first time it is used to
+ // optimize short fields with no matches.
+ //
+ private LinkedList<AttributeSource> buffer;
+ private LinkedList<AttributeSource> matched;
+
+ private AttributeSource nextTok() throws IOException {
+ if ( buffer != null && !buffer.isEmpty() ) {
+ return buffer.removeFirst();
+ }
+ else {
+ if ( input.incrementToken() ) {
+ return this;
+ }
+ else {
+ return null;
+ }
+ }
+ }
+
+ private void pushTok(AttributeSource t) {
+ if ( buffer == null ) {
+ buffer = new LinkedList<AttributeSource>();
+ }
+ buffer.addFirst( t );
+ }
+
+ private SynonymMap match(SynonymMap map) throws IOException {
+ SynonymMap result = null;
+
+ if ( map.submap != null ) {
+ AttributeSource tok = nextTok();
+ if ( tok != null ) {
+ // clone ourselves.
+ if ( tok == this ) {
+ tok = cloneAttributes();
+ }
+ // check for positionIncrement!=1? if>1, should not match, if==0, check multiple at this level?
+ TermAttribute termAtt = tok.getAttribute( TermAttribute.class );
+ SynonymMap subMap = map.submap.get( termAtt.termBuffer(), 0, termAtt.termLength() );
+
+ if ( subMap != null ) {
+ // recurse
+ result = match( subMap );
+ }
+
+ if ( result != null ) {
+ matched.addFirst( tok );
+ }
+ else {
+ // push back unmatched token
+ pushTok( tok );
+ }
+ }
+ }
+
+ // no longer sequence matched, so if this node has synonyms, it's the match.
+ if ( result == null && map.synonyms != null ) {
+ result = map;
+ }
+
+ return result;
+ }
+
+ private void copy(AttributeSource target, AttributeSource source) {
+ if ( target != source ) {
+ if ( source.hasAttributes() ) {
+ State sourceState = source.captureState();
+ target.restoreState( sourceState );
+ }
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ input.reset();
+ replacement = null;
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+/**
+ * @version $Id: SynonymFilterFactory.java 940806 2010-05-04 11:18:46Z uschindler $
+ */
+public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+
+ public void inform(ResourceLoader loader) {
+ String synonyms = args.get( "synonyms" );
+
+ boolean ignoreCase = getBoolean( "ignoreCase", false );
+ boolean expand = getBoolean( "expand", true );
+
+ String tf = args.get( "tokenizerFactory" );
+ TokenizerFactory tokFactory = null;
+ if ( tf != null ) {
+ tokFactory = loadTokenizerFactory( loader, tf, args );
+ }
+
+ if ( synonyms != null ) {
+ List<String> wlist = null;
+ try {
+ File synonymFile = new File( synonyms );
+ if ( synonymFile.exists() ) {
+ wlist = loader.getLines( synonyms );
+ }
+ else {
+ List<String> files = StrUtils.splitFileNames( synonyms );
+ wlist = new ArrayList<String>();
+ for ( String file : files ) {
+ List<String> lines = loader.getLines( file.trim() );
+ wlist.addAll( lines );
+ }
+ }
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ synMap = new SynonymMap( ignoreCase );
+ parseRules( wlist, synMap, "=>", ",", expand, tokFactory );
+ }
+ }
+
+ private SynonymMap synMap;
+
+ static void parseRules(List<String> rules, SynonymMap map, String mappingSep,
+ String synSep, boolean expansion, TokenizerFactory tokFactory) {
+ int count = 0;
+ for ( String rule : rules ) {
+ // To use regexes, we need an expression that specifies an odd number of chars.
+ // This can't really be done with string.split(), and since we need to
+ // do unescaping at some point anyway, we wouldn't be saving any effort
+ // by using regexes.
+
+ List<String> mapping = StrUtils.splitSmart( rule, mappingSep, false );
+
+ List<List<String>> source;
+ List<List<String>> target;
+
+ if ( mapping.size() > 2 ) {
+ throw new RuntimeException( "Invalid Synonym Rule:" + rule );
+ }
+ else if ( mapping.size() == 2 ) {
+ source = getSynList( mapping.get( 0 ), synSep, tokFactory );
+ target = getSynList( mapping.get( 1 ), synSep, tokFactory );
+ }
+ else {
+ source = getSynList( mapping.get( 0 ), synSep, tokFactory );
+ if ( expansion ) {
+ // expand to all arguments
+ target = source;
+ }
+ else {
+ // reduce to first argument
+ target = new ArrayList<List<String>>( 1 );
+ target.add( source.get( 0 ) );
+ }
+ }
+
+ boolean includeOrig = false;
+ for ( List<String> fromToks : source ) {
+ count++;
+ for ( List<String> toToks : target ) {
+ map.add(
+ fromToks,
+ SynonymMap.makeTokens( toToks ),
+ includeOrig,
+ true
+ );
+ }
+ }
+ }
+ }
+
+ // a , b c , d e f => [[a],[b,c],[d,e,f]]
+
+ private static List<List<String>> getSynList(String str, String separator, TokenizerFactory tokFactory) {
+ List<String> strList = StrUtils.splitSmart( str, separator, false );
+ // now split on whitespace to get a list of token strings
+ List<List<String>> synList = new ArrayList<List<String>>();
+ for ( String toks : strList ) {
+ List<String> tokList = tokFactory == null ?
+ StrUtils.splitWS( toks, true ) : splitByTokenizer( toks, tokFactory );
+ synList.add( tokList );
+ }
+ return synList;
+ }
+
+ private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory) {
+ StringReader reader = new StringReader( source );
+ TokenStream ts = loadTokenizer( tokFactory, reader );
+ List<String> tokList = new ArrayList<String>();
+ try {
+ TermAttribute termAtt = ts.addAttribute( TermAttribute.class );
+ while ( ts.incrementToken() ) {
+ if ( termAtt.termLength() > 0 ) {
+ tokList.add( termAtt.toString() );
+ }
+ }
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ finally {
+ reader.close();
+ }
+ return tokList;
+ }
+
+ private static TokenizerFactory loadTokenizerFactory(ResourceLoader loader, String cname, Map<String, String> args) {
+ TokenizerFactory tokFactory = ( TokenizerFactory ) loader.newInstance( cname );
+ tokFactory.init( args );
+ return tokFactory;
+ }
+
+ private static TokenStream loadTokenizer(TokenizerFactory tokFactory, Reader reader) {
+ return tokFactory.create( reader );
+ }
+
+ public SynonymMap getSynonymMap() {
+ return synMap;
+ }
+
+ public SynonymFilter create(TokenStream input) {
+ return new SynonymFilter( input, synMap );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymMap.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymMap.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/SynonymMap.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,179 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.solr.util.CharArrayMap;
+
+/**
+ * Mapping rules for use with {@link org.apache.solr.analysis.SynonymFilter}
+ *
+ * @version $Id: SynonymMap.java 991055 2010-08-31 01:40:19Z rmuir $
+ */
+public class SynonymMap {
+ CharArrayMap<SynonymMap> submap; // recursive: Map<String, SynonymMap>
+ Token[] synonyms;
+ int flags;
+
+ static final int INCLUDE_ORIG = 0x01;
+ static final int IGNORE_CASE = 0x02;
+
+ public SynonymMap() {
+ }
+
+ public SynonymMap(boolean ignoreCase) {
+ if ( ignoreCase ) {
+ flags |= IGNORE_CASE;
+ }
+ }
+
+ public boolean includeOrig() {
+ return ( flags & INCLUDE_ORIG ) != 0;
+ }
+
+ public boolean ignoreCase() {
+ return ( flags & IGNORE_CASE ) != 0;
+ }
+
+ /**
+ * @param singleMatch List<String>, the sequence of strings to match
+ * @param replacement List<Token> the list of tokens to use on a match
+ * @param includeOrig sets a flag on this mapping signaling the generation of matched tokens in addition to the replacement tokens
+ * @param mergeExisting merge the replacement tokens with any other mappings that exist
+ */
+ public void add(List<String> singleMatch, List<Token> replacement, boolean includeOrig, boolean mergeExisting) {
+ SynonymMap currMap = this;
+ for ( String str : singleMatch ) {
+ if ( currMap.submap == null ) {
+ // for now hardcode at 2.9, as it's what the old code did.
+ // would be nice to fix, but shouldn't store a version in each submap!!!
+ currMap.submap = new CharArrayMap<SynonymMap>( 1, ignoreCase() );
+ }
+
+ SynonymMap map = currMap.submap.get( str );
+ if ( map == null ) {
+ map = new SynonymMap();
+ map.flags |= flags & IGNORE_CASE;
+ currMap.submap.put( str, map );
+ }
+
+ currMap = map;
+ }
+
+ if ( currMap.synonyms != null && !mergeExisting ) {
+ throw new RuntimeException( "SynonymFilter: there is already a mapping for " + singleMatch );
+ }
+ List<Token> superset = currMap.synonyms == null ? replacement :
+ mergeTokens( Arrays.asList( currMap.synonyms ), replacement );
+ currMap.synonyms = ( Token[] ) superset.toArray( new Token[superset.size()] );
+ if ( includeOrig ) {
+ currMap.flags |= INCLUDE_ORIG;
+ }
+ }
+
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder( "<" );
+ if ( synonyms != null ) {
+ sb.append( "[" );
+ for ( int i = 0; i < synonyms.length; i++ ) {
+ if ( i != 0 ) {
+ sb.append( ',' );
+ }
+ sb.append( synonyms[i] );
+ }
+ if ( ( flags & INCLUDE_ORIG ) != 0 ) {
+ sb.append( ",ORIG" );
+ }
+ sb.append( "]," );
+ }
+ sb.append( submap );
+ sb.append( ">" );
+ return sb.toString();
+ }
+
+
+ /**
+ * Produces a List<Token> from a List<String>
+ */
+ public static List<Token> makeTokens(List<String> strings) {
+ List<Token> ret = new ArrayList<Token>( strings.size() );
+ for ( String str : strings ) {
+ //Token newTok = new Token(str,0,0,"SYNONYM");
+ Token newTok = new Token( str, 0, 0, "SYNONYM" );
+ ret.add( newTok );
+ }
+ return ret;
+ }
+
+
+ /**
+ * Merge two lists of tokens, producing a single list with manipulated positionIncrements so that
+ * the tokens end up at the same position.
+ * <p/>
+ * Example: [a b] merged with [c d] produces [a/b c/d] ('/' denotes tokens in the same position)
+ * Example: [a,5 b,2] merged with [c d,4 e,4] produces [c a,5/d b,2 e,2] (a,n means a has posInc=n)
+ */
+ public static List<Token> mergeTokens(List<Token> lst1, List<Token> lst2) {
+ ArrayList<Token> result = new ArrayList<Token>();
+ if ( lst1 == null || lst2 == null ) {
+ if ( lst2 != null ) {
+ result.addAll( lst2 );
+ }
+ if ( lst1 != null ) {
+ result.addAll( lst1 );
+ }
+ return result;
+ }
+
+ int pos = 0;
+ Iterator<Token> iter1 = lst1.iterator();
+ Iterator<Token> iter2 = lst2.iterator();
+ Token tok1 = iter1.hasNext() ? iter1.next() : null;
+ Token tok2 = iter2.hasNext() ? iter2.next() : null;
+ int pos1 = tok1 != null ? tok1.getPositionIncrement() : 0;
+ int pos2 = tok2 != null ? tok2.getPositionIncrement() : 0;
+ while ( tok1 != null || tok2 != null ) {
+ while ( tok1 != null && ( pos1 <= pos2 || tok2 == null ) ) {
+ Token tok = new Token( tok1.startOffset(), tok1.endOffset(), tok1.type() );
+ tok.setTermBuffer( tok1.termBuffer(), 0, tok1.termLength() );
+ tok.setPositionIncrement( pos1 - pos );
+ result.add( tok );
+ pos = pos1;
+ tok1 = iter1.hasNext() ? iter1.next() : null;
+ pos1 += tok1 != null ? tok1.getPositionIncrement() : 0;
+ }
+ while ( tok2 != null && ( pos2 <= pos1 || tok1 == null ) ) {
+ Token tok = new Token( tok2.startOffset(), tok2.endOffset(), tok2.type() );
+ tok.setTermBuffer( tok2.termBuffer(), 0, tok2.termLength() );
+ tok.setPositionIncrement( pos2 - pos );
+ result.add( tok );
+ pos = pos2;
+ tok2 = iter2.hasNext() ? iter2.next() : null;
+ pos2 += tok2 != null ? tok2.getPositionIncrement() : 0;
+ }
+ }
+ return result;
+ }
+
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ThaiWordFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ThaiWordFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/ThaiWordFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.th.ThaiWordFilter;
+
+public class ThaiWordFilterFactory extends BaseTokenFilterFactory {
+ public ThaiWordFilter create(TokenStream input) {
+ return new ThaiWordFilter( input );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * A <code>TokenFilterFactory</code> creates a
+ * <code>TokenFilter</code> to transform one <code>TokenStream</code>
+ * into another.
+ * <p/>
+ * <p>
+ * TokenFilterFactories are registered for <code>FieldType</code>s with the
+ * <code>IndexSchema</code> through the <code>schema.xml</code> file.
+ * </p>
+ * <p>
+ * Example <code>schema.xml</code> entry to register a TokenFilterFactory
+ * implementation to transform tokens in a field of type "cool"
+ * </p>
+ * <pre>
+ * <fieldtype name="cool" class="solr.TextField">
+ * <analyzer>
+ * ...
+ * <filter class="foo.MyTokenFilterFactory"/>
+ * ...
+ * </pre>
+ * <p>
+ * A single instance of any registered TokenFilterFactory is created
+ * via the default constructor and is reused for each FieldType.
+ * </p>
+ *
+ * @version $Id: TokenFilterFactory.java 711737 2008-11-05 23:06:08Z ehatcher $
+ */
+
+public interface TokenFilterFactory {
+ /**
+ * <code>init</code> will be called just once, immediately after creation.
+ * <p>The args are user-level initialization parameters that
+ * may be specified when declaring the factory in the
+ * schema.xml
+ */
+ public void init(Map<String, String> args);
+
+ /**
+ * Accessor method for reporting the args used to initialize this factory.
+ * <p>
+ * Implementations are <strong>strongly</strong> encouraged to return
+ * the contents of the Map passed to the init method
+ * </p>
+ */
+ public Map<String, String> getArgs();
+
+ /**
+ * Transform the specified input TokenStream
+ */
+ public TokenStream create(TokenStream input);
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter;
+
+public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory {
+ public TokenOffsetPayloadTokenFilter create(TokenStream input) {
+ return new TokenOffsetPayloadTokenFilter( input );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenizerChain.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenizerChain.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenizerChain.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * @version $Id: TokenizerChain.java 805263 2009-08-18 02:50:49Z yonik $
+ */
+
+//
+// An analyzer that uses a tokenizer and a list of token filters to
+// create a TokenStream.
+//
+public class TokenizerChain extends SolrAnalyzer {
+ final private CharFilterFactory[] charFilters;
+ final private TokenizerFactory tokenizer;
+ final private TokenFilterFactory[] filters;
+
+ public TokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
+ this( null, tokenizer, filters );
+ }
+
+ public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
+ this.charFilters = charFilters;
+ this.tokenizer = tokenizer;
+ this.filters = filters;
+ }
+
+ public CharFilterFactory[] getCharFilterFactories() {
+ return charFilters;
+ }
+
+ public TokenizerFactory getTokenizerFactory() {
+ return tokenizer;
+ }
+
+ public TokenFilterFactory[] getTokenFilterFactories() {
+ return filters;
+ }
+
+ @Override
+ public Reader charStream(Reader reader) {
+ if ( charFilters != null && charFilters.length > 0 ) {
+ CharStream cs = CharReader.get( reader );
+ for ( int i = 0; i < charFilters.length; i++ ) {
+ cs = charFilters[i].create( cs );
+ }
+ reader = cs;
+ }
+ return reader;
+ }
+
+ @Override
+ public TokenStreamInfo getStream(String fieldName, Reader reader) {
+ Tokenizer tk = ( Tokenizer ) tokenizer.create( charStream( reader ) );
+ TokenStream ts = tk;
+ for ( int i = 0; i < filters.length; i++ ) {
+ ts = filters[i].create( ts );
+ }
+ return new TokenStreamInfo( tk, ts );
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder( "TokenizerChain(" );
+ for ( CharFilterFactory filter : charFilters ) {
+ sb.append( filter );
+ sb.append( ", " );
+ }
+ sb.append( tokenizer );
+ for ( TokenFilterFactory filter : filters ) {
+ sb.append( ", " );
+ sb.append( filter );
+ }
+ sb.append( ')' );
+ return sb.toString();
+ }
+
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,74 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Tokenizer;
+
+
+/**
+ * A <code>TokenizerFactory</code> breaks up a stream of characters
+ * into tokens.
+ * <p/>
+ * <p>
+ * TokenizerFactories are registered for <code>FieldType</code>s with the
+ * <code>IndexSchema</code> through the <code>schema.xml</code> file.
+ * </p>
+ * <p>
+ * Example <code>schema.xml</code> entry to register a TokenizerFactory
+ * implementation to tokenize fields of type "cool"
+ * </p>
+ * <pre>
+ * <fieldtype name="cool" class="solr.TextField">
+ * <analyzer>
+ * <tokenizer class="solr.StandardTokenizerFactory"/>
+ * ...
+ * </pre>
+ * <p>
+ * A single instance of any registered TokenizerFactory is created
+ * via the default constructor and is reused for each FieldType.
+ * </p>
+ *
+ * @version $Id: TokenizerFactory.java 807338 2009-08-24 18:58:22Z ryan $
+ */
+public interface TokenizerFactory {
+ /**
+ * <code>init</code> will be called just once, immediately after creation.
+ * <p>The args are user-level initialization parameters that
+ * may be specified when declaring the factory in the
+ * schema.xml
+ */
+ public void init(Map<String, String> args);
+
+ /**
+ * Accessor method for reporting the args used to initialize this factory.
+ * <p>
+ * Implementations are <strong>strongly</strong> encouraged to return
+ * the contents of the Map passed to the init method
+ * </p>
+ */
+ public Map<String, String> getArgs();
+
+ /**
+ * Creates a TokenStream of the specified input
+ */
+ public Tokenizer create(Reader input);
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TrimFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TrimFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TrimFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,88 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * Trims leading and trailing whitespace from Tokens in the stream.
+ *
+ * @version $Id:$
+ */
+public final class TrimFilter extends TokenFilter {
+
+ final boolean updateOffsets;
+ private final TermAttribute termAtt;
+ private final OffsetAttribute offsetAtt;
+
+
+ public TrimFilter(TokenStream in, boolean updateOffsets) {
+ super( in );
+ this.updateOffsets = updateOffsets;
+
+ this.termAtt = ( TermAttribute ) addAttribute( TermAttribute.class );
+ this.offsetAtt = ( OffsetAttribute ) addAttribute( OffsetAttribute.class );
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ char[] termBuffer = termAtt.termBuffer();
+ int len = termAtt.termLength();
+ //TODO: Is this the right behavior or should we return false? Currently, " ", returns true, so I think this should
+ //also return true
+ if ( len == 0 ) {
+ return true;
+ }
+ int start = 0;
+ int end = 0;
+ int endOff = 0;
+
+ // eat the first characters
+ //QUESTION: Should we use Character.isWhitespace() instead?
+ for ( start = 0; start < len && termBuffer[start] <= ' '; start++ ) {
+ }
+ // eat the end characters
+ for ( end = len; end >= start && termBuffer[end - 1] <= ' '; end-- ) {
+ endOff++;
+ }
+ if ( start > 0 || end < len ) {
+ if ( start < end ) {
+ termAtt.setTermBuffer( termBuffer, start, ( end - start ) );
+ }
+ else {
+ termAtt.setTermLength( 0 );
+ }
+ if ( updateOffsets ) {
+ int newStart = offsetAtt.startOffset() + start;
+ int newEnd = offsetAtt.endOffset() - ( start < end ? endOff : 0 );
+ offsetAtt.setOffset( newStart, newEnd );
+ }
+ }
+
+ return true;
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TrimFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TrimFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TrimFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,55 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.SolrException;
+
+/**
+ * @version $Id:$
+ * @see TrimFilter
+ */
+public class TrimFilterFactory extends BaseTokenFilterFactory {
+
+ protected boolean updateOffsets = false;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+
+ String v = args.get( "updateOffsets" );
+ if ( v != null ) {
+ try {
+ updateOffsets = Boolean.valueOf( v );
+ }
+ catch ( Exception ex ) {
+ throw new SolrException(
+ SolrException.ErrorCode.BAD_REQUEST,
+ "Error reading updateOffsets value. Must be true or false.",
+ ex
+ );
+ }
+ }
+ }
+
+ public TrimFilter create(TokenStream input) {
+ return new TrimFilter( input, updateOffsets );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
+
+public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory {
+ public TypeAsPayloadTokenFilter create(TokenStream input) {
+ return new TypeAsPayloadTokenFilter( input );
+ }
+}
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,31 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * @version $Id: WhitespaceTokenizerFactory.java 591158 2007-11-01 22:37:42Z hossman $
+ */
+public class WhitespaceTokenizerFactory extends BaseTokenizerFactory {
+ public WhitespaceTokenizer create(Reader input) {
+ return new WhitespaceTokenizer( input );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterFilter.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterFilter.java	(rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterFilter.java	2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,894 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Splits words into subwords and performs optional transformations on subword groups.
+ * Words are split into subwords with the following rules:
+ * - split on intra-word delimiters (by default, all non alpha-numeric characters).
+ * - "Wi-Fi" -> "Wi", "Fi"
+ * - split on case transitions
+ * - "PowerShot" -> "Power", "Shot"
+ * - split on letter-number transitions
+ * - "SD500" -> "SD", "500"
+ * - leading and trailing intra-word delimiters on each subword are ignored
+ * - "//hello---there, 'dude'" -> "hello", "there", "dude"
+ * - trailing "'s" are removed for each subword
+ * - "O'Neil's" -> "O", "Neil"
+ * - Note: this step isn't performed in a separate filter because of possible subword combinations.
+ * <p/>
+ * The <b>combinations</b> parameter affects how subwords are combined:
+ * - combinations="0" causes no subword combinations.
+ * - "PowerShot" -> 0:"Power", 1:"Shot" (0 and 1 are the token positions)
+ * - combinations="1" means that in addition to the subwords, maximum runs of non-numeric subwords are catenated and produced at the same position of the last subword in the run.
+ * - "PowerShot" -> 0:"Power", 1:"Shot" 1:"PowerShot"
+ * - "A's+B's&C's" -> 0:"A", 1:"B", 2:"C", 2:"ABC"
+ * - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder"
+ * <p/>
+ * One use for WordDelimiterFilter is to help match words with different subword delimiters.
+ * For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match.
+ * One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default)
+ * in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word
+ * delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer).
+ *
+ * @version $Id: WordDelimiterFilter.java 940806 2010-05-04 11:18:46Z uschindler $
+ */
+
+final class WordDelimiterFilter extends TokenFilter {
+ public static final String OS_ARCH = System.getProperty("os.arch");
+
+ // NOTE: this logic may not be correct; if you know of a
+ // more reliable approach please raise it on java-dev!
+ public static final boolean JRE_IS_64BIT;
+
+ static {
+ String x = System.getProperty( "sun.arch.data.model" );
+ if ( x != null ) {
+ JRE_IS_64BIT = x.indexOf( "64" ) != -1;
+ }
+ else {
+ if ( OS_ARCH != null && OS_ARCH.indexOf( "64" ) != -1 ) {
+ JRE_IS_64BIT = true;
+ }
+ else {
+ JRE_IS_64BIT = false;
+ }
+ }
+ }
+
+ public static final int LOWER = 0x01;
+ public static final int UPPER = 0x02;
+ public static final int DIGIT = 0x04;
+ public static final int SUBWORD_DELIM = 0x08;
+
+ // combinations: for testing, not for setting bits
+ public static final int ALPHA = 0x03;
+ public static final int ALPHANUM = 0x07;
+
+ /**
+ * If true, causes parts of words to be generated:
+ * <p/>
+ * "PowerShot" => "Power" "Shot"
+ */
+ final boolean generateWordParts;
+
+ /**
+ * If true, causes number subwords to be generated:
+ * <p/>
+ * "500-42" => "500" "42"
+ */
+ final boolean generateNumberParts;
+
+ /**
+ * If true, causes maximum runs of word parts to be catenated:
+ * <p/>
+ * "wi-fi" => "wifi"
+ */
+ final boolean catenateWords;
+
+ /**
+ * If true, causes maximum runs of number parts to be catenated:
+ * <p/>
+ * "500-42" => "50042"
+ */
+ final boolean catenateNumbers;
+
+ /**
+ * If true, causes all subword parts to be catenated:
+ * <p/>
+ * "wi-fi-4000" => "wifi4000"
+ */
+ final boolean catenateAll;
+
+ /**
+ * If true, original words are preserved and added to the subword list (Defaults to false)
+ * <p/>
+ * "500-42" => "500" "42" "500-42"
+ */
+ final boolean preserveOriginal;
+
+ /**
+ * If not null is the set of tokens to protect from being delimited
+ */
+ final CharArraySet protWords;
+
+ private final TermAttribute termAttribute = addAttribute( TermAttribute.class );
+ private final OffsetAttribute offsetAttribute = addAttribute( OffsetAttribute.class );
+ private final PositionIncrementAttribute posIncAttribute = addAttribute( PositionIncrementAttribute.class );
+ private final TypeAttribute typeAttribute = addAttribute( TypeAttribute.class );
+
+ // used for iterating word delimiter breaks
+ private final WordDelimiterIterator iterator;
+
+ // used for concatenating runs of similar typed subwords (word,number)
+ private final WordDelimiterConcatenation concat = new WordDelimiterConcatenation();
+ // number of subwords last output by concat.
+ private int lastConcatCount = 0;
+
+ // used for catenate all
+ private final WordDelimiterConcatenation concatAll = new WordDelimiterConcatenation();
+
+ // used for accumulating position increment gaps
+ private int accumPosInc = 0;
+
+ private char savedBuffer[] = new char[1024];
+ private int savedStartOffset;
+ private int savedEndOffset;
+ private String savedType;
+ private boolean hasSavedState = false;
+ // if length by start + end offsets doesn't match the term text then assume
+ // this is a synonym and don't adjust the offsets.
+ private boolean hasIllegalOffsets = false;
+
+ // for a run of the same subword type within a word, have we output anything?
+ private boolean hasOutputToken = false;
+ // when preserve original is on, have we output any token following it?
+ // this token must have posInc=0!
+ private boolean hasOutputFollowingOriginal = false;
+
+ /**
+ * @param in Token stream to be filtered.
+ * @param charTypeTable
+ * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot" => "Power" "Shot"
+ * @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
+ * @param catenateWords If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
+ * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
+ * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
+ * @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+ * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
+ * @param splitOnNumerics If 1, causes "j2se" to be three tokens: "j" "2" "se"
+ * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+ * @param protWords If not null is the set of tokens to protect from being delimited
+ */
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ int stemEnglishPossessive,
+ CharArraySet protWords) {
+ super( in );
+ this.generateWordParts = generateWordParts != 0;
+ this.generateNumberParts = generateNumberParts != 0;
+ this.catenateWords = catenateWords != 0;
+ this.catenateNumbers = catenateNumbers != 0;
+ this.catenateAll = catenateAll != 0;
+ this.preserveOriginal = preserveOriginal != 0;
+ this.protWords = protWords;
+ this.iterator = new WordDelimiterIterator(
+ charTypeTable, splitOnCaseChange != 0, splitOnNumerics != 0, stemEnglishPossessive != 0
+ );
+ }
+
+ /**
+ * Compatibility constructor
+ *
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ CharArraySet protWords) {
+ this(
+ in,
+ charTypeTable,
+ generateWordParts,
+ generateNumberParts,
+ catenateWords,
+ catenateNumbers,
+ catenateAll,
+ splitOnCaseChange,
+ preserveOriginal,
+ 1,
+ 1,
+ null
+ );
+ }
+
+ /**
+ * Compatibility constructor
+ *
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, byte[], int, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal) {
+ this(
+ in,
+ charTypeTable,
+ generateWordParts,
+ generateNumberParts,
+ catenateWords,
+ catenateNumbers,
+ catenateAll,
+ splitOnCaseChange,
+ preserveOriginal,
+ 1,
+ null
+ );
+ }
+
+ /**
+ * @param in Token stream to be filtered.
+ * @param generateWordParts If 1, causes parts of words to be generated: "PowerShot", "Power-Shot" => "Power" "Shot"
+ * @param generateNumberParts If 1, causes number subwords to be generated: "500-42" => "500" "42"
+ * @param catenateWords If 1, causes maximum runs of word parts to be catenated: "wi-fi" => "wifi"
+ * @param catenateNumbers If 1, causes maximum runs of number parts to be catenated: "500-42" => "50042"
+ * @param catenateAll If 1, causes all subword parts to be catenated: "wi-fi-4000" => "wifi4000"
+ * @param splitOnCaseChange If 1, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+ * @param preserveOriginal If 1, includes original words in subwords: "500-42" => "500" "42" "500-42"
+ * @param splitOnNumerics If 1, causes "j2se" to be three tokens: "j" "2" "se"
+ * @param stemEnglishPossessive If 1, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+ * @param protWords If not null is the set of tokens to protect from being delimited
+ */
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ int stemEnglishPossessive,
+ CharArraySet protWords) {
+ this(
+ in,
+ WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
+ generateWordParts,
+ generateNumberParts,
+ catenateWords,
+ catenateNumbers,
+ catenateAll,
+ splitOnCaseChange,
+ preserveOriginal,
+ splitOnNumerics,
+ stemEnglishPossessive,
+ protWords
+ );
+ }
+
+ /**
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal,
+ int splitOnNumerics,
+ CharArraySet protWords) {
+ this(
+ in,
+ WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
+ generateWordParts,
+ generateNumberParts,
+ catenateWords,
+ catenateNumbers,
+ catenateAll,
+ splitOnCaseChange,
+ preserveOriginal,
+ splitOnNumerics,
+ 1,
+ protWords
+ );
+ }
+
+ /**
+ * Compatibility constructor
+ *
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll,
+ int splitOnCaseChange,
+ int preserveOriginal) {
+ this(
+ in,
+ WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
+ generateWordParts,
+ generateNumberParts,
+ catenateWords,
+ catenateNumbers,
+ catenateAll,
+ splitOnCaseChange,
+ preserveOriginal
+ );
+ }
+
+ /**
+ * Compatibility constructor
+ *
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
+ public WordDelimiterFilter(TokenStream in,
+ byte[] charTypeTable,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll) {
+ this(
+ in,
+ charTypeTable,
+ generateWordParts,
+ generateNumberParts,
+ catenateWords,
+ catenateNumbers,
+ catenateAll,
+ 1,
+ 0,
+ 1,
+ null
+ );
+ }
+
+ /**
+ * Compatibility constructor
+ *
+ * @deprecated Use
+ * {@link #WordDelimiterFilter(TokenStream, int, int, int, int, int, int, int, int, int, CharArraySet)}
+ * instead.
+ */
+ @Deprecated
+ public WordDelimiterFilter(TokenStream in,
+ int generateWordParts,
+ int generateNumberParts,
+ int catenateWords,
+ int catenateNumbers,
+ int catenateAll) {
+ this(
+ in,
+ WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE,
+ generateWordParts,
+ generateNumberParts,
+ catenateWords,
+ catenateNumbers,
+ catenateAll,
+ 1,
+ 0,
+ 1,
+ null
+ );
+ }
+
+ public boolean incrementToken() throws IOException {
+ while ( true ) {
+ if ( !hasSavedState ) {
+ // process a new input word
+ if ( !input.incrementToken() ) {
+ return false;
+ }
+
+ int termLength = termAttribute.termLength();
+ char[] termBuffer = termAttribute.termBuffer();
+
+ accumPosInc += posIncAttribute.getPositionIncrement();
+
+ iterator.setText( termBuffer, termLength );
+ iterator.next();
+
+ // word of no delimiters, or protected word: just return it
+ if ( ( iterator.current == 0 && iterator.end == termLength ) ||
+ ( protWords != null && protWords.contains( termBuffer, 0, termLength ) ) ) {
+ posIncAttribute.setPositionIncrement( accumPosInc );
+ accumPosInc = 0;
+ return true;
+ }
+
+ // word consisting only of delimiters
+ if ( iterator.end == WordDelimiterIterator.DONE && !preserveOriginal ) {
+ // if the posInc is 1, simply ignore it in the accumulation
+ if ( posIncAttribute.getPositionIncrement() == 1 ) {
+ accumPosInc--;
+ }
+ continue;
+ }
+
+ saveState();
+
+ hasOutputToken = false;
+ hasOutputFollowingOriginal = !preserveOriginal;
+ lastConcatCount = 0;
+
+ if ( preserveOriginal ) {
+ posIncAttribute.setPositionIncrement( accumPosInc );
+ accumPosInc = 0;
+ return true;
+ }
+ }
+
+ // at the end of the string, output any concatenations
+ if ( iterator.end == WordDelimiterIterator.DONE ) {
+ if ( !concat.isEmpty() ) {
+ if ( flushConcatenation( concat ) ) {
+ return true;
+ }
+ }
+
+ if ( !concatAll.isEmpty() ) {
+ // only if we haven't output this same combo above!
+ if ( concatAll.subwordCount > lastConcatCount ) {
+ concatAll.writeAndClear();
+ return true;
+ }
+ concatAll.clear();
+ }
+
+ // no saved concatenations, on to the next input word
+ hasSavedState = false;
+ continue;
+ }
+
+ // word surrounded by delimiters: always output
+ if ( iterator.isSingleWord() ) {
+ generatePart( true );
+ iterator.next();
+ return true;
+ }
+
+ int wordType = iterator.type();
+
+ // do we already have queued up incompatible concatenations?
+ if ( !concat.isEmpty() && ( concat.type & wordType ) == 0 ) {
+ if ( flushConcatenation( concat ) ) {
+ hasOutputToken = false;
+ return true;
+ }
+ hasOutputToken = false;
+ }
+
+ // add subwords depending upon options
+ if ( shouldConcatenate( wordType ) ) {
+ if ( concat.isEmpty() ) {
+ concat.type = wordType;
+ }
+ concatenate( concat );
+ }
+
+ // add all subwords (catenateAll)
+ if ( catenateAll ) {
+ concatenate( concatAll );
+ }
+
+ // if we should output the word or number part
+ if ( shouldGenerateParts( wordType ) ) {
+ generatePart( false );
+ iterator.next();
+ return true;
+ }
+
+ iterator.next();
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ hasSavedState = false;
+ concat.clear();
+ concatAll.clear();
+ accumPosInc = 0;
+ }
+
+ // ================================================= Helper Methods ================================================
+
+ /**
+ * Saves the existing attribute states
+ */
+ private void saveState() {
+ // otherwise, we have delimiters, save state
+ savedStartOffset = offsetAttribute.startOffset();
+ savedEndOffset = offsetAttribute.endOffset();
+ // if length by start + end offsets doesn't match the term text then assume this is a synonym and don't adjust the offsets.
+ hasIllegalOffsets = ( savedEndOffset - savedStartOffset != termAttribute.termLength() );
+ savedType = typeAttribute.type();
+
+ if ( savedBuffer.length < termAttribute.termLength() ) {
+ // TODO - replace with ArrayUtil.oversize when upgrading to Lucene 3.1
+ savedBuffer = new char[oversize( termAttribute.termLength(), 2 )]; //RamUsageEstimator.NUM_BYTES_CHAR
+ }
+
+ System.arraycopy( termAttribute.termBuffer(), 0, savedBuffer, 0, termAttribute.termLength() );
+ iterator.text = savedBuffer;
+
+ hasSavedState = true;
+ }
+
+ /**
+ * Flushes the given WordDelimiterConcatenation by either writing its concat and then clearing, or just clearing.
+ *
+ * @param concatenation WordDelimiterConcatenation that will be flushed
+ *
+ * @return {@code true} if the concatenation was written before it was cleared, {@code false} otherwise
+ */
+ private boolean flushConcatenation(WordDelimiterConcatenation concatenation) {
+ lastConcatCount = concatenation.subwordCount;
+ if ( concatenation.subwordCount != 1 || !shouldGenerateParts( concatenation.type ) ) {
+ concatenation.writeAndClear();
+ return true;
+ }
+ concatenation.clear();
+ return false;
+ }
+
+ /**
+ * Determines whether to concatenate a word or number if the current word is the given type
+ *
+ * @param wordType Type of the current word used to determine if it should be concatenated
+ *
+ * @return {@code true} if concatenation should occur, {@code false} otherwise
+ */
+ private boolean shouldConcatenate(int wordType) {
+ return ( catenateWords && isAlpha( wordType ) ) || ( catenateNumbers && isDigit( wordType ) );
+ }
+
+ /**
+ * Determines whether a word/number part should be generated for a word of the given type
+ *
+ * @param wordType Type of the word used to determine if a word/number part should be generated
+ *
+ * @return {@code true} if a word/number part should be generated, {@code false} otherwise
+ */
+ private boolean shouldGenerateParts(int wordType) {
+ return ( generateWordParts && isAlpha( wordType ) ) || ( generateNumberParts && isDigit( wordType ) );
+ }
+
+ /**
+ * Concatenates the saved buffer to the given WordDelimiterConcatenation
+ *
+ * @param concatenation WordDelimiterConcatenation to concatenate the buffer to
+ */
+ private void concatenate(WordDelimiterConcatenation concatenation) {
+ if ( concatenation.isEmpty() ) {
+ concatenation.startOffset = savedStartOffset + iterator.current;
+ }
+ concatenation.append( savedBuffer, iterator.current, iterator.end - iterator.current );
+ concatenation.endOffset = savedStartOffset + iterator.end;
+ }
+
+ /**
+ * Generates a word/number part, updating the appropriate attributes
+ *
+ * @param isSingleWord {@code true} if the generation is occurring from a single word, {@code false} otherwise
+ */
+ private void generatePart(boolean isSingleWord) {
+ clearAttributes();
+ termAttribute.setTermBuffer( savedBuffer, iterator.current, iterator.end - iterator.current );
+
+ int startOffSet = ( isSingleWord || !hasIllegalOffsets ) ? savedStartOffset + iterator.current : savedStartOffset;
+ int endOffSet = ( hasIllegalOffsets ) ? savedEndOffset : savedStartOffset + iterator.end;
+
+ offsetAttribute.setOffset( startOffSet, endOffSet );
+ posIncAttribute.setPositionIncrement( position( false ) );
+ typeAttribute.setType( savedType );
+ }
+
+ /**
+ * Get the position increment gap for a subword or concatenation
+ *
+ * @param inject true if this token wants to be injected
+ *
+ * @return position increment gap
+ */
+ private int position(boolean inject) {
+ int posInc = accumPosInc;
+
+ if ( hasOutputToken ) {
+ accumPosInc = 0;
+ return inject ? 0 : Math.max( 1, posInc );
+ }
+
+ hasOutputToken = true;
+
+ if ( !hasOutputFollowingOriginal ) {
+ // the first token following the original is 0 regardless
+ hasOutputFollowingOriginal = true;
+ return 0;
+ }
+ // clear the accumulated position increment
+ accumPosInc = 0;
+ return Math.max( 1, posInc );
+ }
+
+ /**
+ * Checks if the given word type includes {@link #ALPHA}
+ *
+ * @param type Word type to check
+ *
+ * @return {@code true} if the type contains ALPHA, {@code false} otherwise
+ */
+ static boolean isAlpha(int type) {
+ return ( type & ALPHA ) != 0;
+ }
+
+ /**
+ * Checks if the given word type includes {@link #DIGIT}
+ *
+ * @param type Word type to check
+ *
+ * @return {@code true} if the type contains DIGIT, {@code false} otherwise
+ */
+ static boolean isDigit(int type) {
+ return ( type & DIGIT ) != 0;
+ }
+
+ /**
+ * Checks if the given word type includes {@link #SUBWORD_DELIM}
+ *
+ * @param type Word type to check
+ *
+ * @return {@code true} if the type contains SUBWORD_DELIM, {@code false} otherwise
+ */
+ static boolean isSubwordDelim(int type) {
+ return ( type & SUBWORD_DELIM ) != 0;
+ }
+
+ /**
+ * Checks if the given word type includes {@link #UPPER}
+ *
+ * @param type Word type to check
+ *
+ * @return {@code true} if the type contains UPPER, {@code false} otherwise
+ */
+ static boolean isUpper(int type) {
+ return ( type & UPPER ) != 0;
+ }
+
+ /**
+ * Returns an array size >= minTargetSize, generally
+ * over-allocating exponentially to achieve amortized
+ * linear-time cost as the array grows.
+ * <p/>
+ * NOTE: this was originally borrowed from Python 2.4.2
+ * listobject.c sources (attribution in LICENSE.txt), but
+ * has now been substantially changed based on
+ * discussions from java-dev thread with subject "Dynamic
+ * array reallocation algorithms", started on Jan 12
+ * 2010.
+ *
+ * @param minTargetSize Minimum required value to be returned.
+ * @param bytesPerElement Bytes used by each element of
+ * the array. See constants in {@link RamUsageEstimator}.
+ * @todo Remove this code as soon as we upgrade to Lucene 3.1 where org.apache.lucene.util.ArrayUtil contains the oversize method
+ *
+ */
+ public static int oversize(int minTargetSize, int bytesPerElement) {
+
+ if ( minTargetSize < 0 ) {
+ // catch usage that accidentally overflows int
+ throw new IllegalArgumentException( "invalid array size " + minTargetSize );
+ }
+
+ if ( minTargetSize == 0 ) {
+ // wait until at least one element is requested
+ return 0;
+ }
+
+ // asymptotic exponential growth by 1/8th, favors
+ // spending a bit more CPU to not tie up too much wasted
+ // RAM:
+ int extra = minTargetSize >> 3;
+
+ if ( extra < 3 ) {
+ // for very small arrays, where constant overhead of
+ // realloc is presumably relatively high, we grow
+ // faster
+ extra = 3;
+ }
+
+ int newSize = minTargetSize + extra;
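+ // e.g. minTargetSize = 100 -> extra = 12 -> newSize = 112, before the alignment rounding below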
+
+ // add 7 to allow for worst case byte alignment addition below:
+ if ( newSize + 7 < 0 ) {
+ // int overflowed -- return max allowed array size
+ return Integer.MAX_VALUE;
+ }
+
+ if ( JRE_IS_64BIT ) {
+ // round up to 8 byte alignment in 64bit env
+ switch ( bytesPerElement ) {
+ case 4:
+ // round up to multiple of 2
+ return ( newSize + 1 ) & 0x7ffffffe;
+ case 2:
+ // round up to multiple of 4
+ return ( newSize + 3 ) & 0x7ffffffc;
+ case 1:
+ // round up to multiple of 8
+ return ( newSize + 7 ) & 0x7ffffff8;
+ case 8:
+ // no rounding
+ default:
+ // odd (invalid?) size
+ return newSize;
+ }
+ }
+ else {
+ // round up to 4 byte alignment in 32bit env
+ switch ( bytesPerElement ) {
+ case 2:
+ // round up to multiple of 2
+ return ( newSize + 1 ) & 0x7ffffffe;
+ case 1:
+ // round up to multiple of 4
+ return ( newSize + 3 ) & 0x7ffffffc;
+ case 4:
+ case 8:
+ // no rounding
+ default:
+ // odd (invalid?) size
+ return newSize;
+ }
+ }
+ }
+
+ // ================================================= Inner Classes =================================================
+
+ /**
+ * A WDF concatenated 'run'
+ */
+ final class WordDelimiterConcatenation {
+ final StringBuilder buffer = new StringBuilder();
+ int startOffset;
+ int endOffset;
+ int type;
+ int subwordCount;
+
+ /**
+ * Appends the given text of the given length to the concatenation at the given offset
+ *
+ * @param text Text to append
+ * @param offset Offset in the concatenation to add the text
+ * @param length Length of the text to append
+ */
+ void append(char text[], int offset, int length) {
+ buffer.append( text, offset, length );
+ subwordCount++;
+ }
+
+ /**
+ * Writes the concatenation to the attributes
+ */
+ void write() {
+ clearAttributes();
+ if ( termAttribute.termLength() < buffer.length() ) {
+ termAttribute.resizeTermBuffer( buffer.length() );
+ }
+ char termbuffer[] = termAttribute.termBuffer();
+
+ buffer.getChars( 0, buffer.length(), termbuffer, 0 );
+ termAttribute.setTermLength( buffer.length() );
+
+ if ( hasIllegalOffsets ) {
+ offsetAttribute.setOffset( savedStartOffset, savedEndOffset );
+ }
+ else {
+ offsetAttribute.setOffset( startOffset, endOffset );
+ }
+ posIncAttribute.setPositionIncrement( position( true ) );
+ typeAttribute.setType( savedType );
+ accumPosInc = 0;
+ }
+
+ /**
+ * Determines if the concatenation is empty
+ *
+ * @return {@code true} if the concatenation is empty, {@code false} otherwise
+ */
+ boolean isEmpty() {
+ return buffer.length() == 0;
+ }
+
+ /**
+ * Clears the concatenation and resets its state
+ */
+ void clear() {
+ buffer.setLength( 0 );
+ startOffset = endOffset = type = subwordCount = 0;
+ }
+
+ /**
+ * Convenience method for the common scenario of having to write the concatenation and then clearing its state
+ */
+ void writeAndClear() {
+ write();
+ clear();
+ }
+ }
+ // questions:
+ // negative numbers? -42 indexed as just 42?
+ // dollar sign? $42
+ // percent sign? 33%
+ // downsides: if source text is "powershot" then a query of "PowerShot" won't match!
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+import java.util.SortedMap;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.util.plugin.ResourceLoaderAware;
+
+
+/**
+ * @version $Id: WordDelimiterFilterFactory.java 990456 2010-08-28 22:01:10Z rmuir $
+ */
+public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {
+ public static final String PROTECTED_TOKENS = "protected";
+ public static final String TYPES = "types";
+
+ public void inform(ResourceLoader loader) {
+ String wordFiles = args.get( PROTECTED_TOKENS );
+ if ( wordFiles != null ) {
+ try {
+ protectedWords = getWordSet( loader, wordFiles, false );
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ String types = args.get( TYPES );
+ if ( types != null ) {
+ try {
+ List<String> files = StrUtils.splitFileNames( types );
+ List<String> wlist = new ArrayList<String>();
+ for ( String file : files ) {
+ List<String> lines = loader.getLines( file.trim() );
+ wlist.addAll( lines );
+ }
+ typeTable = parseTypes( wlist );
+ }
+ catch ( IOException e ) {
+ throw new RuntimeException( e );
+ }
+ }
+ }
+
+ private CharArraySet protectedWords = null;
+
+ int generateWordParts = 0;
+ int generateNumberParts = 0;
+ int catenateWords = 0;
+ int catenateNumbers = 0;
+ int catenateAll = 0;
+ int splitOnCaseChange = 0;
+ int splitOnNumerics = 0;
+ int preserveOriginal = 0;
+ int stemEnglishPossessive = 0;
+ byte[] typeTable = null;
+
+ @Override
+ public void init(Map<String, String> args) {
+ super.init( args );
+ generateWordParts = getInt( "generateWordParts", 1 );
+ generateNumberParts = getInt( "generateNumberParts", 1 );
+ catenateWords = getInt( "catenateWords", 0 );
+ catenateNumbers = getInt( "catenateNumbers", 0 );
+ catenateAll = getInt( "catenateAll", 0 );
+ splitOnCaseChange = getInt( "splitOnCaseChange", 1 );
+ splitOnNumerics = getInt( "splitOnNumerics", 1 );
+ preserveOriginal = getInt( "preserveOriginal", 0 );
+ stemEnglishPossessive = getInt( "stemEnglishPossessive", 1 );
+ }
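+ // Illustrative schema.xml usage (an assumed example, not prescribed by this class):
+ // <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" catenateWords="1" preserveOriginal="1"/>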
+
+ public WordDelimiterFilter create(TokenStream input) {
+ return new WordDelimiterFilter(
+ input, typeTable == null ? WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE : typeTable,
+ generateWordParts, generateNumberParts,
+ catenateWords, catenateNumbers, catenateAll,
+ splitOnCaseChange, preserveOriginal,
+ splitOnNumerics, stemEnglishPossessive, protectedWords
+ );
+ }
+
+ // source => type
+ private static Pattern typePattern = Pattern.compile( "(.*)\\s*=>\\s*(.*)\\s*$" );
+
+ /**
+ * parses a list of MappingCharFilter style rules into a custom byte[] type table
+ */
+ private byte[] parseTypes(List<String> rules) {
+ SortedMap<Character, Byte> typeMap = new TreeMap<Character, Byte>();
+ for ( String rule : rules ) {
+ Matcher m = typePattern.matcher( rule );
+ if ( !m.find() ) {
+ throw new RuntimeException( "Invalid Mapping Rule : [" + rule +
"]" );
+ }
+ String lhs = parseString( m.group( 1 ).trim() );
+ Byte rhs = parseType( m.group( 2 ).trim() );
+ if ( lhs.length() != 1 ) {
+ throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "].
Only a single character is allowed." );
+ }
+ if ( rhs == null ) {
+ throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "].
Illegal type." );
+ }
+ typeMap.put( lhs.charAt( 0 ), rhs );
+ }
+
+ // ensure the table is always at least as big as DEFAULT_WORD_DELIM_TABLE for performance
+ byte types[] = new byte[Math.max(
+ typeMap.lastKey() + 1, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE.length
+ )];
+ for ( int i = 0; i < types.length; i++ ) {
+ types[i] = WordDelimiterIterator.getType( i );
+ }
+ for ( Map.Entry<Character, Byte> mapping : typeMap.entrySet() ) {
+ types[mapping.getKey()] = mapping.getValue();
+ }
+ return types;
+ }
+
+ private Byte parseType(String s) {
+ if ( s.equals( "LOWER" ) ) {
+ return WordDelimiterFilter.LOWER;
+ }
+ else if ( s.equals( "UPPER" ) ) {
+ return WordDelimiterFilter.UPPER;
+ }
+ else if ( s.equals( "ALPHA" ) ) {
+ return WordDelimiterFilter.ALPHA;
+ }
+ else if ( s.equals( "DIGIT" ) ) {
+ return WordDelimiterFilter.DIGIT;
+ }
+ else if ( s.equals( "ALPHANUM" ) ) {
+ return WordDelimiterFilter.ALPHANUM;
+ }
+ else if ( s.equals( "SUBWORD_DELIM" ) ) {
+ return WordDelimiterFilter.SUBWORD_DELIM;
+ }
+ else {
+ return null;
+ }
+ }
+
+ char[] out = new char[256];
+
+ private String parseString(String s) {
+ int readPos = 0;
+ int len = s.length();
+ int writePos = 0;
+ while ( readPos < len ) {
+ char c = s.charAt( readPos++ );
+ if ( c == '\\' ) {
+ if ( readPos >= len ) {
+ throw new RuntimeException( "Invalid escaped char in [" + s +
"]" );
+ }
+ c = s.charAt( readPos++ );
+ switch ( c ) {
+ case '\\':
+ c = '\\';
+ break;
+ case 'n':
+ c = '\n';
+ break;
+ case 't':
+ c = '\t';
+ break;
+ case 'r':
+ c = '\r';
+ break;
+ case 'b':
+ c = '\b';
+ break;
+ case 'f':
+ c = '\f';
+ break;
+ case 'u':
+ if ( readPos + 3 >= len ) {
+ throw new RuntimeException( "Invalid escaped char in [" + s +
"]" );
+ }
+ c = ( char ) Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
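+ // e.g. the escape "\u0041" in a rule decodes to the single character 'A'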
+ readPos += 4;
+ break;
+ }
+ }
+ out[writePos++] = c;
+ }
+ return new String( out, 0, writePos );
+ }
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterIterator.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterIterator.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/analysis/WordDelimiterIterator.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,359 @@
+package org.apache.solr.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import static org.apache.solr.analysis.WordDelimiterFilter.ALPHA;
+import static org.apache.solr.analysis.WordDelimiterFilter.DIGIT;
+import static org.apache.solr.analysis.WordDelimiterFilter.LOWER;
+import static org.apache.solr.analysis.WordDelimiterFilter.SUBWORD_DELIM;
+import static org.apache.solr.analysis.WordDelimiterFilter.UPPER;
+import static org.apache.solr.analysis.WordDelimiterFilter.isAlpha;
+import static org.apache.solr.analysis.WordDelimiterFilter.isDigit;
+import static org.apache.solr.analysis.WordDelimiterFilter.isSubwordDelim;
+import static org.apache.solr.analysis.WordDelimiterFilter.isUpper;
+
+/**
+ * A BreakIterator-like API for iterating over subwords in text, according to WordDelimiterFilter rules.
+ *
+ * @lucene.internal
+ */
+public final class WordDelimiterIterator {
+
+ /**
+ * Indicates the end of iteration
+ */
+ public static final int DONE = -1;
+
+ public static final byte[] DEFAULT_WORD_DELIM_TABLE;
+
+ char text[];
+ int length;
+
+ /**
+ * start position of text, excluding leading delimiters
+ */
+ int startBounds;
+ /**
+ * end position of text, excluding trailing delimiters
+ */
+ int endBounds;
+
+ /**
+ * Beginning of subword
+ */
+ int current;
+ /**
+ * End of subword
+ */
+ int end;
+
+ /* does this string end with a possessive such as 's */
+ private boolean hasFinalPossessive = false;
+
+ /**
+ * If false, causes case changes to be ignored (subwords will only be generated
+ * given SUBWORD_DELIM tokens). (Defaults to true)
+ */
+ final boolean splitOnCaseChange;
+
+ /**
+ * If false, causes numeric changes to be ignored (subwords will only be generated
+ * given SUBWORD_DELIM tokens). (Defaults to true)
+ */
+ final boolean splitOnNumerics;
+
+ /**
+ * If true, causes trailing "'s" to be removed for each subword. (Defaults to true)
+ * <p/>
+ * "O'Neil's" => "O", "Neil"
+ */
+ final boolean stemEnglishPossessive;
+
+ private final byte[] charTypeTable;
+
+ /**
+ * if true, need to skip over a possessive found in the last call to next()
+ */
+ private boolean skipPossessive = false;
+
+ // TODO: should there be a WORD_DELIM category for chars that only separate words (no catenation of subwords will be
+ // done if separated by these chars?) "," would be an obvious candidate...
+
+ static {
+ byte[] tab = new byte[256];
+ for ( int i = 0; i < 256; i++ ) {
+ byte code = 0;
+ if ( Character.isLowerCase( i ) ) {
+ code |= LOWER;
+ }
+ else if ( Character.isUpperCase( i ) ) {
+ code |= UPPER;
+ }
+ else if ( Character.isDigit( i ) ) {
+ code |= DIGIT;
+ }
+ if ( code == 0 ) {
+ code = SUBWORD_DELIM;
+ }
+ tab[i] = code;
+ }
+ DEFAULT_WORD_DELIM_TABLE = tab;
+ }
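+ // Note: the default table only covers code points 0-255; charType() falls back to getType(int) for anything above that range.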
+
+ /**
+ * Create a new WordDelimiterIterator operating with the supplied rules.
+ *
+ * @param charTypeTable table containing character types
+ * @param splitOnCaseChange if true, causes "PowerShot" to be two tokens; ("Power-Shot" remains two parts regardless)
+ * @param splitOnNumerics if true, causes "j2se" to be three tokens; "j" "2" "se"
+ * @param stemEnglishPossessive if true, causes trailing "'s" to be removed for each subword: "O'Neil's" => "O", "Neil"
+ */
+ WordDelimiterIterator(byte[] charTypeTable, boolean splitOnCaseChange, boolean splitOnNumerics, boolean stemEnglishPossessive) {
+ this.charTypeTable = charTypeTable;
+ this.splitOnCaseChange = splitOnCaseChange;
+ this.splitOnNumerics = splitOnNumerics;
+ this.stemEnglishPossessive = stemEnglishPossessive;
+ }
+
+ /**
+ * Advance to the next subword in the string.
+ *
+ * @return index of the next subword, or {@link #DONE} if all subwords have been returned
+ */
+ int next() {
+ current = end;
+ if ( current == DONE ) {
+ return DONE;
+ }
+
+ if ( skipPossessive ) {
+ current += 2;
+ skipPossessive = false;
+ }
+
+ int lastType = 0;
+
+ while ( current < endBounds && ( isSubwordDelim( lastType = charType( text[current] ) ) ) ) {
+ current++;
+ }
+
+ if ( current >= endBounds ) {
+ return end = DONE;
+ }
+
+ for ( end = current + 1; end < endBounds; end++ ) {
+ int type = charType( text[end] );
+ if ( isBreak( lastType, type ) ) {
+ break;
+ }
+ lastType = type;
+ }
+
+ if ( end < endBounds - 1 && endsWithPossessive( end + 2 ) ) {
+ skipPossessive = true;
+ }
+
+ return end;
+ }
+
+
+ /**
+ * Return the type of the current subword.
+ * This currently uses the type of the first character in the subword.
+ *
+ * @return type of the current word
+ */
+ int type() {
+ if ( end == DONE ) {
+ return 0;
+ }
+
+ int type = charType( text[current] );
+ switch ( type ) {
+ // return ALPHA word type for both lower and upper
+ case LOWER:
+ case UPPER:
+ return ALPHA;
+ default:
+ return type;
+ }
+ }
+
+ /**
+ * Reset the text to a new value, and reset all state
+ *
+ * @param text New text
+ * @param length length of the text
+ */
+ void setText(char text[], int length) {
+ this.text = text;
+ this.length = this.endBounds = length;
+ current = startBounds = end = 0;
+ skipPossessive = hasFinalPossessive = false;
+ setBounds();
+ }
+
+ // ================================================= Helper Methods ================================================
+
+ /**
+ * Determines whether the transition from lastType to type indicates a break
+ *
+ * @param lastType Last subword type
+ * @param type Current subword type
+ *
+ * @return {@code true} if the transition indicates a break, {@code false} otherwise
+ */
+ private boolean isBreak(int lastType, int type) {
+ if ( ( type & lastType ) != 0 ) {
+ return false;
+ }
+
+ if ( !splitOnCaseChange && isAlpha( lastType ) && isAlpha( type ) ) {
+ // ALPHA->ALPHA: always ignore if case isn't considered.
+ return false;
+ }
+ else if ( isUpper( lastType ) && isAlpha( type ) ) {
+ // UPPER->letter: Don't split
+ return false;
+ }
+ else if ( !splitOnNumerics && ( ( isAlpha( lastType ) && isDigit( type ) ) || ( isDigit( lastType ) && isAlpha( type ) ) ) ) {
+ // ALPHA->NUMERIC, NUMERIC->ALPHA :Don't split
+ return false;
+ }
+
+ return true;
+ }
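+ // e.g. "PowerShot" breaks at the 'r'->'S' (LOWER->UPPER) transition when splitOnCaseChange is on, and "SD500" breaks at 'D'->'5' (ALPHA->DIGIT) when splitOnNumerics is on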
+
+ /**
+ * Determines if the current word contains only one subword. Note, it could be potentially surrounded by delimiters
+ *
+ * @return {@code true} if the current word contains only one subword, {@code false} otherwise
+ */
+ boolean isSingleWord() {
+ if ( hasFinalPossessive ) {
+ return current == startBounds && end == endBounds - 2;
+ }
+ else {
+ return current == startBounds && end == endBounds;
+ }
+ }
+
+ /**
+ * Set the internal word bounds (remove leading and trailing delimiters). Note, if a possessive is found, don't remove
+ * it yet, simply note it.
+ */
+ private void setBounds() {
+ while ( startBounds < length && ( isSubwordDelim( charType( text[startBounds] ) ) ) ) {
+ startBounds++;
+ }
+
+ while ( endBounds > startBounds && ( isSubwordDelim( charType( text[endBounds - 1] ) ) ) ) {
+ endBounds--;
+ }
+ if ( endsWithPossessive( endBounds ) ) {
+ hasFinalPossessive = true;
+ }
+ current = startBounds;
+ }
+
+ /**
+ * Determines if the text at the given position indicates an English possessive which should be removed
+ *
+ * @param pos Position in the text to check if it indicates an English possessive
+ *
+ * @return {@code true} if the text at the position indicates an English possessive, {@code false} otherwise
+ */
+ private boolean endsWithPossessive(int pos) {
+ return ( stemEnglishPossessive &&
+ pos > 2 &&
+ text[pos - 2] == '\'' &&
+ ( text[pos - 1] == 's' || text[pos - 1] == 'S' ) &&
+ isAlpha( charType( text[pos - 3] ) ) &&
+ ( pos == endBounds || isSubwordDelim( charType( text[pos] ) ) ) );
+ }
+
+ /**
+ * Determines the type of the given character
+ *
+ * @param ch Character whose type is to be determined
+ *
+ * @return Type of the character
+ */
+ private int charType(int ch) {
+ if ( ch < charTypeTable.length ) {
+ return charTypeTable[ch];
+ }
+ return getType( ch );
+ }
+
+ /**
+ * Computes the type of the given character
+ *
+ * @param ch Character whose type is to be determined
+ *
+ * @return Type of the character
+ */
+ public static byte getType(int ch) {
+ switch ( Character.getType( ch ) ) {
+ case Character.UPPERCASE_LETTER:
+ return UPPER;
+ case Character.LOWERCASE_LETTER:
+ return LOWER;
+
+ case Character.TITLECASE_LETTER:
+ case Character.MODIFIER_LETTER:
+ case Character.OTHER_LETTER:
+ case Character.NON_SPACING_MARK:
+ case Character.ENCLOSING_MARK: // depends what it encloses?
+ case Character.COMBINING_SPACING_MARK:
+ return ALPHA;
+
+ case Character.DECIMAL_DIGIT_NUMBER:
+ case Character.LETTER_NUMBER:
+ case Character.OTHER_NUMBER:
+ return DIGIT;
+
+ // case Character.SPACE_SEPARATOR:
+ // case Character.LINE_SEPARATOR:
+ // case Character.PARAGRAPH_SEPARATOR:
+ // case Character.CONTROL:
+ // case Character.FORMAT:
+ // case Character.PRIVATE_USE:
+
+ case Character.SURROGATE: // prevent splitting
+ return ALPHA | DIGIT;
+
+ // case Character.DASH_PUNCTUATION:
+ // case Character.START_PUNCTUATION:
+ // case Character.END_PUNCTUATION:
+ // case Character.CONNECTOR_PUNCTUATION:
+ // case Character.OTHER_PUNCTUATION:
+ // case Character.MATH_SYMBOL:
+ // case Character.CURRENCY_SYMBOL:
+ // case Character.MODIFIER_SYMBOL:
+ // case Character.OTHER_SYMBOL:
+ // case Character.INITIAL_QUOTE_PUNCTUATION:
+ // case Character.FINAL_QUOTE_PUNCTUATION:
+
+ default:
+ return SUBWORD_DELIM;
+ }
+ }
+}
\ No newline at end of file
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/ResourceLoader.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/ResourceLoader.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/ResourceLoader.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.common;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+/**
+ * @since solr 1.3
+ */
+public interface ResourceLoader
+{
+ public InputStream openResource(String resource) throws IOException;
+
+ /**
+ * Accesses a resource by name and returns the (non comment) lines
+ * containing data.
+ *
+ * <p>
+ * A comment line is any line that starts with the character "#"
+ * </p>
+ *
+ * @param resource the name of the resource to read
+ * @return a list of non-blank non-comment lines with whitespace trimmed
+ * from front and back.
+ * @throws IOException if the resource cannot be read
+ */
+ public List<String> getLines(String resource) throws IOException;
+
+ public Object newInstance(String cname, String ... subpackages);
+}
\ No newline at end of file
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/SolrException.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/SolrException.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/SolrException.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,216 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.common;
+
+import org.slf4j.Logger;
+
+import java.io.CharArrayWriter;
+import java.io.PrintWriter;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * @version $Id: SolrException.java 926654 2010-03-23 16:41:25Z yonik $
+ */
+public class SolrException extends RuntimeException {
+
+ /**
+ * @since solr 1.2
+ */
+ public enum ErrorCode {
+ BAD_REQUEST( 400 ),
+ UNAUTHORIZED( 401 ),
+ FORBIDDEN( 403 ),
+ NOT_FOUND( 404 ),
+ SERVER_ERROR( 500 ),
+ SERVICE_UNAVAILABLE( 503 ),
+ UNKNOWN(0);
+ public final int code;
+
+ private ErrorCode( int c )
+ {
+ code = c;
+ }
+ public static ErrorCode getErrorCode(int c){
+ for (ErrorCode err : values()) {
+ if(err.code == c) return err;
+ }
+ return UNKNOWN;
+ }
+ };
+
+ public boolean logged=false;
+
+ public SolrException(ErrorCode code, String msg) {
+ super(msg);
+ this.code=code.code;
+ }
+
+ public SolrException(ErrorCode code, String msg, boolean alreadyLogged) {
+ super(msg);
+ this.code=code.code;
+ this.logged=alreadyLogged;
+ }
+
+ public SolrException(ErrorCode code, String msg, Throwable th, boolean alreadyLogged) {
+ super(msg,th);
+ this.code=code.code;
+ logged=alreadyLogged;
+ }
+
+ public SolrException(ErrorCode code, String msg, Throwable th) {
+ this(code,msg,th,true);
+ }
+
+ public SolrException(ErrorCode code, Throwable th) {
+ super(th);
+ this.code=code.code;
+ logged=true;
+ }
+
+ /**
+ * @deprecated Use {@link #SolrException(ErrorCode,String)}.
+ */
+ @Deprecated
+ public SolrException(int code, String msg) {
+ super(msg);
+ this.code=code;
+ }
+
+ /**
+ * @deprecated Use {@link #SolrException(ErrorCode,String,boolean)}.
+ */
+ @Deprecated
+ public SolrException(int code, String msg, boolean alreadyLogged) {
+ super(msg);
+ this.code=code;
+ this.logged=alreadyLogged;
+ }
+
+ /**
+ * @deprecated Use {@link #SolrException(ErrorCode,String,Throwable,boolean)}.
+ */
+ @Deprecated
+ public SolrException(int code, String msg, Throwable th, boolean alreadyLogged) {
+ super(msg,th);
+ this.code=code;
+ logged=alreadyLogged;
+ }
+
+ /**
+ * @deprecated Use {@link #SolrException(ErrorCode,String,Throwable)}.
+ */
+ @Deprecated
+ public SolrException(int code, String msg, Throwable th) {
+ this(code,msg,th,true);
+ }
+
+ /**
+ * @deprecated Use {@link #SolrException(ErrorCode,Throwable)}.
+ */
+ @Deprecated
+ public SolrException(int code, Throwable th) {
+ super(th);
+ this.code=code;
+ logged=true;
+ }
+
+ int code=0;
+ public int code() { return code; }
+
+
+
+
+ public void log(Logger log) { log(log,this); }
+ public static void log(Logger log, Throwable e) {
+ if (e instanceof SolrException) {
+ ((SolrException)e).logged = true;
+ }
+ String stackTrace = toStr(e);
+ String ignore = doIgnore(stackTrace);
+ if (ignore != null) {
+ log.info(ignore);
+ return;
+ }
+ log.error(stackTrace);
+
+ }
+
+ public static void log(Logger log, String msg, Throwable e) {
+ if (e instanceof SolrException) {
+ ((SolrException)e).logged = true;
+ }
+ String stackTrace = msg + ':' + toStr(e);
+ String ignore = doIgnore(stackTrace);
+ if (ignore != null) {
+ log.info(ignore);
+ return;
+ }
+ log.error(stackTrace);
+ }
+
+ public static void logOnce(Logger log, String msg, Throwable e) {
+ if (e instanceof SolrException) {
+ if(((SolrException)e).logged) return;
+ }
+ if (msg!=null) log(log,msg,e);
+ else log(log,e);
+ }
+
+
+ // public String toString() { return toStr(this); } // oops, inf loop
+ @Override
+ public String toString() { return super.toString(); }
+
+ public static String toStr(Throwable e) {
+ CharArrayWriter cw = new CharArrayWriter();
+ PrintWriter pw = new PrintWriter(cw);
+ e.printStackTrace(pw);
+ pw.flush();
+ return cw.toString();
+
+/** This doesn't work for some reason!!!!!
+ StringWriter sw = new StringWriter();
+ PrintWriter pw = new PrintWriter(sw);
+ e.printStackTrace(pw);
+ pw.flush();
+ System.out.println("The STRING:" + sw.toString());
+ return sw.toString();
+**/
+ }
+
+
+ /** For test code - do not log exceptions that match any of the regular expressions in ignorePatterns */
+ public static Set<String> ignorePatterns;
+
+ /** Returns null if this exception does not match any ignore patterns, or a message string to use if it does. */
+ public static String doIgnore(String m) {
+ if (ignorePatterns == null || m == null) return null;
+
+ for (String regex : ignorePatterns) {
+ Pattern pattern = Pattern.compile(regex);
+ Matcher matcher = pattern.matcher(m);
+ if (matcher.find()) return "Ignoring exception matching " + regex;
+ }
+
+ return null;
+ }
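+ // e.g. a test might set ignorePatterns = Collections.singleton("ignore_exception") so that matching stack traces are logged at INFO instead of ERROR (illustrative pattern string)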
+
+
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/util/StrUtils.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/util/StrUtils.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/common/util/StrUtils.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,276 @@
+/*
+ * Hibernate, Relational Persistence for Idiomatic Java
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
+ */
+
+package org.apache.solr.common.util;
+
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Locale;
+import java.io.IOException;
+
+import org.apache.solr.common.SolrException;
+
+/**
+ * @version $Id: StrUtils.java 945270 2010-05-17 17:45:18Z rmuir $
+ */
+public class StrUtils {
+ public static final char[] HEX_DIGITS = { '0', '1', '2', '3', '4', '5', '6',
+ '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };
+
+ /**
+ * Split a string based on a separator, but don't split if it's inside
+ * a string. Assume '\' escapes the next char both inside and
+ * outside strings.
+ */
+ public static List<String> splitSmart(String s, char separator) {
+ ArrayList<String> lst = new ArrayList<String>(4);
+ int pos=0, start=0, end=s.length();
+ char inString=0;
+ char ch=0;
+ while (pos < end) {
+ char prevChar=ch;
+ ch = s.charAt(pos++);
+ if (ch=='\\') { // skip escaped chars
+ pos++;
+ } else if (inString != 0 && ch==inString) {
+ inString=0;
+ } else if (ch=='\'' || ch=='"') {
+ // If char is directly preceded by a number or letter
+ // then don't treat it as the start of a string.
+ // Examples: 50" TV, or can't
+ if (!Character.isLetterOrDigit(prevChar)) {
+ inString=ch;
+ }
+ } else if (ch==separator && inString==0) {
+ lst.add(s.substring(start,pos-1));
+ start=pos;
+ }
+ }
+ if (start < end) {
+ lst.add(s.substring(start,end));
+ }
+
+ /***
+ if (SolrCore.log.isLoggable(Level.FINEST)) {
+ SolrCore.log.trace("splitCommand=" + lst);
+ }
+ ***/
+
+ return lst;
+ }
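+ // e.g. splitSmart("a,'b,c',d", ',') returns ["a", "'b,c'", "d"] -- the quoted comma does not split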
+
+ /** Splits a backslash escaped string on the separator.
+ * <p>
+ * Current backslash escaping supported:
+ * <br> \n \t \r \b \f are escaped the same as a Java String
+ * <br> Other characters following a backslash are produced verbatim (\c => c)
+ *
+ * @param s the string to split
+ * @param separator the separator to split on
+ * @param decode decode backslash escaping
+ */
+ public static List<String> splitSmart(String s, String separator, boolean decode) {
+ ArrayList<String> lst = new ArrayList<String>(2);
+ StringBuilder sb = new StringBuilder();
+ int pos=0, end=s.length();
+ while (pos < end) {
+ if (s.startsWith(separator,pos)) {
+ if (sb.length() > 0) {
+ lst.add(sb.toString());
+ sb=new StringBuilder();
+ }
+ pos+=separator.length();
+ continue;
+ }
+
+ char ch = s.charAt(pos++);
+ if (ch=='\\') {
+ if (!decode) sb.append(ch);
+ if (pos>=end) break; // ERROR, or let it go?
+ ch = s.charAt(pos++);
+ if (decode) {
+ switch(ch) {
+ case 'n' : ch='\n'; break;
+ case 't' : ch='\t'; break;
+ case 'r' : ch='\r'; break;
+ case 'b' : ch='\b'; break;
+ case 'f' : ch='\f'; break;
+ }
+ }
+ }
+
+ sb.append(ch);
+ }
+
+ if (sb.length() > 0) {
+ lst.add(sb.toString());
+ }
+
+ return lst;
+ }
+
+ /**
+ * Splits file names separated by comma character.
+ * File names can contain comma characters escaped by backslash '\'
+ *
+ * @param fileNames the string containing file names
+ * @return a list of file names with the escaping backslashes removed
+ */
+ public static List<String> splitFileNames(String fileNames) {
+ if (fileNames == null)
+ return Collections.<String>emptyList();
+
+ List<String> result = new ArrayList<String>();
+ for (String file : fileNames.split("(?<!\\\\),")) {
+ result.add(file.replaceAll("\\\\(?=,)", ""));
+ }
+
+ return result;
+ }
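+ // e.g. splitFileNames("stop.txt,syn\\,1.txt") returns ["stop.txt", "syn,1.txt"]; the backslash escapes the comma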
+
+ /** Creates a backslash escaped string, joining all the items. */
+ public static String join(List<String> items, char separator) {
+ StringBuilder sb = new StringBuilder(items.size() << 3);
+ boolean first=true;
+ for (String item : items) {
+ if (first) {
+ first = false;
+ } else {
+ sb.append(separator);
+ }
+ for (int i=0; i<item.length(); i++) {
+ char ch = item.charAt(i);
+ if (ch=='\\' || ch == separator) {
+ sb.append('\\');
+ }
+ sb.append(ch);
+ }
+ }
+ return sb.toString();
+ }
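+ // e.g. join(Arrays.asList("a", "b,c"), ',') returns "a,b\,c" -- separators inside items get backslash-escaped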
+
+
+
+ public static List<String> splitWS(String s, boolean decode) {
+ ArrayList<String> lst = new ArrayList<String>(2);
+ StringBuilder sb = new StringBuilder();
+ int pos=0, end=s.length();
+ while (pos < end) {
+ char ch = s.charAt(pos++);
+ if (Character.isWhitespace(ch)) {
+ if (sb.length() > 0) {
+ lst.add(sb.toString());
+ sb=new StringBuilder();
+ }
+ continue;
+ }
+
+ if (ch=='\\') {
+ if (!decode) sb.append(ch);
+ if (pos>=end) break; // ERROR, or let it go?
+ ch = s.charAt(pos++);
+ if (decode) {
+ switch(ch) {
+ case 'n' : ch='\n'; break;
+ case 't' : ch='\t'; break;
+ case 'r' : ch='\r'; break;
+ case 'b' : ch='\b'; break;
+ case 'f' : ch='\f'; break;
+ }
+ }
+ }
+
+ sb.append(ch);
+ }
+
+ if (sb.length() > 0) {
+ lst.add(sb.toString());
+ }
+
+ return lst;
+ }
+
+ public static List<String> toLower(List<String> strings) {
+ ArrayList<String> ret = new ArrayList<String>(strings.size());
+ for (String str : strings) {
+ ret.add(str.toLowerCase(Locale.ENGLISH));
+ }
+ return ret;
+ }
+
+
+
+ /** Return true if a string starts with '1', 't', or 'T'
+ * and false otherwise.
+ */
+ public static boolean parseBoolean(String s) {
+ char ch = s.length()>0 ? s.charAt(0) : 0;
+ return (ch=='1' || ch=='t' || ch=='T');
+ }
+
+ /** how to transform a String into a boolean... more flexible than
+ * Boolean.parseBoolean() to enable easier integration with html forms.
+ */
+ public static boolean parseBool(String s) {
+ if( s != null ) {
+ if( s.startsWith("true") || s.startsWith("on") ||
s.startsWith("yes") ) {
+ return true;
+ }
+ if( s.startsWith("false") || s.startsWith("off") ||
s.equals("no") ) {
+ return false;
+ }
+ }
+ throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "invalid boolean value: "+s );
+ }
+
+ /**
+ * URLEncodes a value, replacing only enough chars so that
+ * the URL may be unambiguously pasted back into a browser.
+ * <p>
+ * Characters with a numeric value less than 32 are encoded.
+ * &,=,%,+,space are encoded.
+ * <p>
+ */
+ public static void partialURLEncodeVal(Appendable dest, String val) throws IOException {
+ for (int i=0; i<val.length(); i++) {
+ char ch = val.charAt(i);
+ if (ch < 32) {
+ dest.append('%');
+ if (ch < 0x10) dest.append('0');
+ dest.append(Integer.toHexString(ch));
+ } else {
+ switch (ch) {
+ case ' ': dest.append('+'); break;
+ case '&': dest.append("%26"); break;
+ case '%': dest.append("%25"); break;
+ case '=': dest.append("%3D"); break;
+ case '+': dest.append("%2B"); break;
+ default : dest.append(ch); break;
+ }
+ }
+ }
+ }
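+ // e.g. partialURLEncodeVal(sb, "a b&c") appends "a+b%26c"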
+
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/ArraysUtils.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/ArraysUtils.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/ArraysUtils.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,51 @@
+package org.apache.solr.util;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/**
+ *
+ *
+ **/
+//Since Arrays.equals doesn't implement offsets for equals
+public class ArraysUtils {
+
+ /**
+ * See if two array slices are the same.
+ *
+ * @param left The left array to compare
+ * @param offsetLeft The offset into the array. Must be positive
+ * @param right The right array to compare
+ * @param offsetRight the offset into the right array. Must be positive
+ * @param length The length of the section of the array to compare
+ * @return true if the two arrays, starting at their respective offsets, are equal
+ *
+ * @see java.util.Arrays#equals(char[], char[])
+ */
+ public static boolean equals(char[] left, int offsetLeft, char[] right, int offsetRight, int length) {
+ if ((offsetLeft + length <= left.length) && (offsetRight + length <= right.length)) {
+ for (int i = 0; i < length; i++) {
+ if (left[offsetLeft + i] != right[offsetRight + i]) {
+ return false;
+ }
+
+ }
+ return true;
+ }
+ return false;
+ }
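+ // e.g. equals("abcd".toCharArray(), 1, "xbcz".toCharArray(), 1, 2) is true: both slices are "bc"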
+}
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/CharArrayMap.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/CharArrayMap.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/CharArrayMap.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,418 @@
+/*
+ * Hibernate, Relational Persistence for Idiomatic Java
+ *
+ * Copyright (c) 2010, Red Hat, Inc. and/or its affiliates or third-party contributors as
+ * indicated by the @author tags or express copyright attribution
+ * statements applied by the authors. All third-party contributions are
+ * distributed under license by Red Hat, Inc.
+ *
+ * This copyrighted material is made available to anyone wishing to use, modify,
+ * copy, or redistribute it subject to the terms and conditions of the GNU
+ * Lesser General Public License, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this distribution; if not, write to:
+ * Free Software Foundation, Inc.
+ * 51 Franklin Street, Fifth Floor
+ * Boston, MA 02110-1301 USA
+ */
+
+package org.apache.solr.util;
+
+import java.util.*;
+import java.io.Serializable;
+
+/**
+ * A simple class that stores key Strings as char[]'s in a
+ * hash table. Note that this is not a general purpose
+ * class. For example, it cannot remove items from the
+ * map, nor does it resize its hash table to be smaller,
+ * etc. It is designed to be quick to retrieve items
+ * by char[] keys without the necessity of converting
+ * to a String first.
+ */
+
+public class CharArrayMap<V> extends AbstractMap<String, V>
+ implements Map<String, V>, Cloneable, Serializable
+{
+ private final static int INIT_SIZE = 2;
+ private char[][] keys;
+ private Object[] values;
+ private int count;
+ private final boolean ignoreCase;
+
+ /** Create map with enough capacity to hold initialCapacity
+ * terms */
+ public CharArrayMap(int initialCapacity, boolean ignoreCase) {
+ this.ignoreCase = ignoreCase;
+ int size = INIT_SIZE;
+ // load factor of .75, inverse is 1.25, or x+x/4
+ initialCapacity = initialCapacity + (initialCapacity >>2);
+ while(size <= initialCapacity)
+ size <<= 1;
+ keys = new char[size][];
+ values = new Object[size];
+ }
+
+ public boolean ignoreCase() {
+ return ignoreCase;
+ }
+
+ public V get(char[] key) {
+ return get(key, 0, key.length);
+ }
+
+ public V get(char[] key, int off, int len) {
+ return (V)values[getSlot(key, off, len)];
+ }
+
+ public V get(CharSequence key) {
+ return (V)values[getSlot(key)];
+ }
+
+ @Override
+ public V get(Object key) {
+ return (V)values[getSlot(key)];
+ }
+
+ @Override
+ public boolean containsKey(Object s) {
+ return keys[getSlot(s)] != null;
+ }
+
+ @Override
+ public boolean containsValue(Object value) {
+ if (value == null) {
+ // search for key with a null value
+ for (int i=0; i<keys.length; i++) {
+ if (keys[i] != null && values[i] == null) return true;
+ }
+ return false;
+ }
+
+ for (int i=0; i<values.length; i++) {
+ Object val = values[i];
+ if (val != null && value.equals(val)) return true;
+ }
+ return false;
+ }
+
+
+ private int getSlot(Object key) {
+ if (key instanceof char[]) {
+ char[] keyc = (char[])key;
+ return getSlot(keyc, 0, keyc.length);
+ }
+ return getSlot((CharSequence)key);
+ }
+
+ private int getSlot(char[] key, int off, int len) {
+ int code = getHashCode(key, len);
+ int pos = code & (keys.length-1);
+ char[] key2 = keys[pos];
+ if (key2 != null && !equals(key, off, len, key2)) {
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ pos = code & (keys.length-1);
+ key2 = keys[pos];
+ } while (key2 != null && !equals(key, off, len, key2));
+ }
+ return pos;
+ }
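+ // Open addressing: keys.length is always a power of two, so 'code & (keys.length-1)' picks the bucket and the odd probe increment eventually visits every slot.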
+
+ /** Returns the hash slot for the given CharSequence key */
+ private int getSlot(CharSequence key) {
+ int code = getHashCode(key);
+ int pos = code & (keys.length-1);
+ char[] key2 = keys[pos];
+ if (key2 != null && !equals(key, key2)) {
+ final int inc = ((code>>8)+code)|1;
+ do {
+ code += inc;
+ pos = code & (keys.length-1);
+ key2 = keys[pos];
+ } while (key2 != null && !equals(key, key2));
+ }
+ return pos;
+ }
+
+ public V put(CharSequence key, V val) {
+ return put(key.toString(), val); // could be more efficient
+ }
+
+ @Override
+ public V put(String key, V val) {
+ return put(key.toCharArray(), val);
+ }
+
+ /** Add this key,val pair to the map.
+ * The char[] key is directly used, no copy is made.
+ * If ignoreCase is true for this Map, the key array will be directly modified.
+ * The user should never modify the key after calling this method.
+ */
+ public V put(char[] key, Object val) {
+ if (ignoreCase)
+ for(int i=0;i< key.length;i++)
+ key[i] = Character.toLowerCase(key[i]);
+ int slot = getSlot(key, 0, key.length);
+ if (keys[slot] == null) count++;
+ Object prev = values[slot];
+ keys[slot] = key;
+ values[slot] = val;
+
+ if (count + (count>>2) >= keys.length) {
+ rehash();
+ }
+
+ return (V)prev;
+ }
+
+
+ private boolean equals(char[] text1, int off, int len, char[] text2) {
+ if (len != text2.length)
+ return false;
+ if (ignoreCase) {
+ for(int i=0;i<len;i++) {
+ if (Character.toLowerCase(text1[off+i]) != text2[i])
+ return false;
+ }
+ } else {
+ for(int i=0;i<len;i++) {
+ if (text1[off+i] != text2[i])
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private boolean equals(CharSequence text1, char[] text2) {
+ int len = text1.length();
+ if (len != text2.length)
+ return false;
+ if (ignoreCase) {
+ for(int i=0;i<len;i++) {
+ if (Character.toLowerCase(text1.charAt(i)) != text2[i])
+ return false;
+ }
+ } else {
+ for(int i=0;i<len;i++) {
+ if (text1.charAt(i) != text2[i])
+ return false;
+ }
+ }
+ return true;
+ }
+
+ private void rehash() {
+ final int newSize = 2* keys.length;
+ char[][] oldEntries = keys;
+ Object[] oldValues = values;
+ keys = new char[newSize][];
+ values = new Object[newSize];
+
+ for(int i=0;i<oldEntries.length;i++) {
+ char[] key = oldEntries[i];
+ if (key != null) {
+ // todo: could be faster... no need to compare keys on collision
+ // since they are unique
+ int newSlot = getSlot(key,0,key.length);
+ keys[newSlot] = key;
+ values[newSlot] = oldValues[i];
+ }
+ }
+ }
+
+ private int getHashCode(char[] text, int len) {
+ int code = 0;
+ if (ignoreCase) {
+ for (int i=0; i<len; i++) {
+ code = code*31 + Character.toLowerCase(text[i]);
+ }
+ } else {
+ for (int i=0; i<len; i++) {
+ code = code*31 + text[i];
+ }
+ }
+ return code;
+ }
+
+ private int getHashCode(CharSequence text) {
+ int code;
+ if (ignoreCase) {
+ code = 0;
+ int len = text.length();
+ for (int i=0; i<len; i++) {
+ code = code*31 + Character.toLowerCase(text.charAt(i));
+ }
+ } else {
+ if (false && text instanceof String) {
+ code = text.hashCode();
+ } else {
+ code = 0;
+ int len = text.length();
+ for (int i=0; i<len; i++) {
+ code = code*31 + text.charAt(i);
+ }
+ }
+ }
+ return code;
+ }
+
+ @Override
+ public int size() {
+ return count;
+ }
+
+ @Override
+ public boolean isEmpty() {
+ return count==0;
+ }
+
+ @Override
+ public void clear() {
+ count = 0;
+ Arrays.fill(keys,null);
+ Arrays.fill(values,null);
+ }
+
+ @Override
+ public Set<Entry<String, V>> entrySet() {
+ return new EntrySet();
+ }
+
+ /** Returns an EntryIterator over this Map. */
+ public EntryIterator iterator() {
+ return new EntryIterator();
+ }
+
+ /** public iterator class so efficient methods are exposed to users */
+ public class EntryIterator implements Iterator<Map.Entry<String,V>> {
+ int pos=-1;
+ int lastPos;
+
+ EntryIterator() {
+ goNext();
+ }
+
+ private void goNext() {
+ lastPos = pos;
+ pos++;
+ while (pos < keys.length && keys[pos] == null) pos++;
+ }
+
+ public boolean hasNext() {
+ return pos < keys.length;
+ }
+
+ /** gets the next key... do not modify the returned char[] */
+ public char[] nextKey() {
+ goNext();
+ return keys[lastPos];
+ }
+
+ /** gets the next key as a newly created String object */
+ public String nextKeyString() {
+ return new String(nextKey());
+ }
+
+ /** returns the value associated with the last key returned */
+ public V currentValue() {
+ return (V)values[lastPos];
+ }
+
+ /** sets the value associated with the last key returned */
+ public V setValue(V value) {
+ V old = (V)values[lastPos];
+ values[lastPos] = value;
+ return old;
+ }
+
+ /** Returns an Entry<String,V> object created on the fly...
+ * use nextKey() + currentValue() for better efficiency. */
+ public Map.Entry<String,V> next() {
+ goNext();
+ return new MapEntry(lastPos);
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+
+ private class MapEntry implements Map.Entry<String,V> {
+ final int pos;
+
+ MapEntry(int pos) {
+ this.pos = pos;
+ }
+
+ public char[] getCharArr() {
+ return keys[pos];
+ }
+
+ public String getKey() {
+ return new String(getCharArr());
+ }
+
+ public V getValue() {
+ return (V)values[pos];
+ }
+
+ public V setValue(V value) {
+ V old = (V)values[pos];
+ values[pos] = value;
+ return old;
+ }
+
+ public String toString() {
+ return getKey() + '=' + getValue();
+ }
+ }
+
+
+
+ private class EntrySet extends AbstractSet<Map.Entry<String, V>> {
+ public EntryIterator iterator() {
+ return new EntryIterator();
+ }
+ public boolean contains(Object o) {
+ if (!(o instanceof Map.Entry))
+ return false;
+ Map.Entry e = (Map.Entry)o;
+ Object key = e.getKey();
+ if (key==null) return false; // we don't support null keys
+ Object val = e.getValue();
+ Object v = get(key);
+ return v==null ? val==null : v.equals(val);
+ }
+ public boolean remove(Object o) {
+ throw new UnsupportedOperationException();
+ }
+ public int size() {
+ return count;
+ }
+ public void clear() {
+ CharArrayMap.this.clear();
+ }
+ }
+
+ @Override
+ public Object clone() {
+ CharArrayMap<V> map = null;
+ try {
+ map = (CharArrayMap<V>)super.clone();
+ map.keys = keys.clone();
+ map.values = values.clone();
+ } catch (CloneNotSupportedException e) {
+ // impossible
+ }
+ return map;
+ }
+}
Property changes on: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/CharArrayMap.java
___________________________________________________________________
Name: svn:executable
+ *
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/Constants.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/Constants.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/Constants.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,13 @@
+package org.apache.solr.util;
+
+/**
+ * @author Hardy Ferentschik
+ */
+public class Constants {
+ public static final String LUCENE_MATCH_VERSION_PARAM = "luceneMatchVersion";
+
+ private Constants() {
+ }
+}
+
+
Added: search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/plugin/ResourceLoaderAware.java
===================================================================
--- search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/plugin/ResourceLoaderAware.java (rev 0)
+++ search/trunk/hibernate-search-solr-analyzers/src/main/java/org/apache/solr/util/plugin/ResourceLoaderAware.java 2010-09-17 09:15:09 UTC (rev 20659)
@@ -0,0 +1,28 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.util.plugin;
+
+import org.apache.solr.common.ResourceLoader;
+
+/**
+ * @since solr 1.3
+ */
+public interface ResourceLoaderAware
+{
+ void inform( ResourceLoader loader );
+}
Modified: search/trunk/hibernate-search-testing/pom.xml
===================================================================
--- search/trunk/hibernate-search-testing/pom.xml 2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/hibernate-search-testing/pom.xml 2010-09-17 09:15:09 UTC (rev 20659)
@@ -39,7 +39,7 @@
<dependencies>
<dependency>
- <groupId>${groupId}</groupId>
+ <groupId>${project.groupId}</groupId>
<artifactId>hibernate-search</artifactId>
<version>${project.version}</version>
</dependency>
Modified: search/trunk/pom.xml
===================================================================
--- search/trunk/pom.xml 2010-09-17 09:09:44 UTC (rev 20658)
+++ search/trunk/pom.xml 2010-09-17 09:15:09 UTC (rev 20659)
@@ -22,7 +22,9 @@
~ 51 Franklin Street, Fifth Floor
~ Boston, MA 02110-1301 USA
-->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.hibernate</groupId>
@@ -37,6 +39,7 @@
<inceptionYear>2006</inceptionYear>
<modules>
+ <module>hibernate-search-solr-analyzers</module>
<module>hibernate-search</module>
<module>hibernate-search-testing</module>
<module>hibernate-search-archetype</module>
@@ -127,7 +130,7 @@
<properties>
<slf4jVersion>1.6.1</slf4jVersion>
- <luceneVersion>3.1-dev</luceneVersion>
+ <luceneVersion>3.0.2</luceneVersion>
<hibernateVersion>3.6.0.Beta3</hibernateVersion>
<hibernateCommonsAnnotationVersion>3.2.0.Final</hibernateCommonsAnnotationVersion>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -170,6 +173,37 @@
<version>${luceneVersion}</version>
</dependency>
<dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-collation</artifactId>
+ <version>${luceneVersion}</version>
+ <exclusions>
+ <exclusion>
+ <groupId>com.ibm.icu</groupId>
+ <artifactId>icu4j</artifactId>
+ </exclusion>
+ </exclusions>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-spellchecker</artifactId>
+ <version>${luceneVersion}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.hibernate</groupId>
+ <artifactId>hibernate-search-solr-analyzers</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-codec</groupId>
+ <artifactId>commons-codec</artifactId>
+ <version>1.4</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>1.4</version>
+ </dependency>
+ <dependency>
<groupId>javax.transaction</groupId>
<artifactId>jta</artifactId>
<version>1.1</version>
@@ -187,11 +221,6 @@
</dependency>
<dependency>
<groupId>org.hibernate</groupId>
- <artifactId>hibernate-annotations</artifactId>
- <version>${hibernateVersion}</version>
- </dependency>
- <dependency>
- <groupId>org.hibernate</groupId>
<artifactId>hibernate-entitymanager</artifactId>
<version>${hibernateVersion}</version>
</dependency>
@@ -201,81 +230,10 @@
<version>${hibernateVersion}</version>
</dependency>
<dependency>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-core</artifactId>
- <version>3.1-dev</version>
- <exclusions>
- <exclusion>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- </exclusion>
- <exclusion>
- <groupId>woodstox</groupId>
- <artifactId>wstx-asl</artifactId>
- </exclusion>
- <exclusion>
- <groupId>net.java.dev.stax-utils</groupId>
- <artifactId>stax-utils</artifactId>
- </exclusion>
- <exclusion>
- <groupId>commons-logging</groupId>
- <artifactId>commons-logging</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-lucene-core</artifactId>
- </exclusion>
- <exclusion>
- <groupId>commons-fileupload</groupId>
- <artifactId>commons-fileupload</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.solr</groupId>
- <artifactId>solr-commons-csv</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-highlighter</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-queries</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-memory</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-misc</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.codehaus.woodstox</groupId>
- <artifactId>wstx-asl</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.geronimo.specs</groupId>
- <artifactId>geronimo-stax-api_1.0_spec</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-spellchecker</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.apache.lucene</groupId>
- <artifactId>lucene-analyzers</artifactId>
- </exclusion>
- <!--<exclusion>-->
- <!--<groupId>org.apache.lucene</groupId>-->
-          <!--<artifactId>lucene-snowball</artifactId>-->
- <!--</exclusion>-->
- </exclusions>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-snowball</artifactId>
+ <version>${luceneVersion}</version>
</dependency>
- <!--<dependency>-->
- <!--<groupId>org.apache.lucene</groupId>-->
- <!--<artifactId>lucene-snowball</artifactId>-->
- <!--<version>${luceneVersion}</version>-->
- <!--</dependency>-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
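With solr-core dropped, the analyzer factories now come from the new hibernate-search-solr-analyzers artifact and lucene-snowball is declared directly, so the usual @AnalyzerDef wiring continues to resolve. A minimal sketch against a hypothetical Book entity, assuming the standard Hibernate Search annotations and the Solr SnowballPorterFilterFactory (@Entity/@Id omitted for brevity):

import org.apache.solr.analysis.LowerCaseFilterFactory;
import org.apache.solr.analysis.SnowballPorterFilterFactory;
import org.apache.solr.analysis.StandardTokenizerFactory;
import org.hibernate.search.annotations.Analyzer;
import org.hibernate.search.annotations.AnalyzerDef;
import org.hibernate.search.annotations.Field;
import org.hibernate.search.annotations.Indexed;
import org.hibernate.search.annotations.Parameter;
import org.hibernate.search.annotations.TokenFilterDef;
import org.hibernate.search.annotations.TokenizerDef;

// Hypothetical indexed entity used only to illustrate the analyzer wiring.
@Indexed
@AnalyzerDef(name = "en_snowball",
        tokenizer = @TokenizerDef(factory = StandardTokenizerFactory.class),
        filters = {
                @TokenFilterDef(factory = LowerCaseFilterFactory.class),
                // Backed by the lucene-snowball dependency added above.
                @TokenFilterDef(factory = SnowballPorterFilterFactory.class,
                        params = @Parameter(name = "language", value = "English"))
        })
public class Book {
    @Field(analyzer = @Analyzer(definition = "en_snowball"))
    private String title;
}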
@@ -491,8 +449,8 @@
<jdbc.driver>org.hsqldb.jdbcDriver</jdbc.driver>
<jdbc.url>jdbc:hsqldb:.</jdbc.url>
<jdbc.user>sa</jdbc.user>
- <jdbc.pass />
- <jdbc.isolation />
+ <jdbc.pass/>
+ <jdbc.isolation/>
</properties>
</profile>
<!--
@@ -519,7 +477,7 @@
<jdbc.url>jdbc:mysql://vmg08.mw.lab.eng.bos.redhat.com/searctru</jdb...
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -539,7 +497,7 @@
<jdbc.url>jdbc:mysql://vmg02.mw.lab.eng.bos.redhat.com/searctru</jdb...
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -560,7 +518,7 @@
</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -580,7 +538,7 @@
<jdbc.url>jdbc:postgresql://vmg01.mw.lab.eng.bos.redhat.com:5432:searctru</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -600,7 +558,7 @@
<jdbc.url>jdbc:postgresql://vmg03.mw.lab.eng.bos.redhat.com:5432:searctru</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -620,7 +578,7 @@
<jdbc.url>jdbc:postgresql://notinstalled.lab.eng.bos.redhat.com:5432:searctru</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -651,7 +609,7 @@
<jdbc.url>jdbc:db2://dev32.qa.atl.jboss.com:50000/jbossqa</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -676,7 +634,7 @@
<jdbc.url>jdbc:db2://dev67.qa.atl.jboss.com:50000/jbossqa</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -701,7 +659,7 @@
<jdbc.url>jdbc:db2://vmg06.mw.lab.eng.bos.redhat.com:50000/jbossqa</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -722,7 +680,7 @@
<jdbc.url>jdbc:oracle:thin:@dev20.qa.atl.jboss.com:1521:qa</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -743,7 +701,7 @@
<jdbc.url>jdbc:oracle:thin:@vmg05.mw.lab.eng.bos.redhat.com:1521:qaora10</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -763,7 +721,7 @@
<jdbc.url>jdbc:oracle:thin:@dev04.qa.atl2.redhat.com:1521:qaora11</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -785,7 +743,7 @@
</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -805,7 +763,7 @@
<jdbc.url>jdbc:sybase:Tds:vmg07.mw.lab.eng.bos.redhat.com:5000/searctru</jdbc.url>
<jdbc.user>searctru</jdbc.user>
<jdbc.pass>searctru</jdbc.pass>
- <jdbc.isolation />
+ <jdbc.isolation/>
</properties>
</profile>
@@ -817,7 +775,7 @@
<groupId>com.microsoft.sqlserver</groupId>
<artifactId>msjdbc</artifactId>
<version>2.0.1008.2</version>
- <classifier>4</classifier>
+ <classifier>4</classifier>
</dependency>
</dependencies>
<properties>
@@ -838,7 +796,7 @@
<groupId>com.microsoft.sqlserver</groupId>
<artifactId>msjdbc</artifactId>
<version>2.0.1008.2</version>
- <classifier>4</classifier>
+ <classifier>4</classifier>
</dependency>
</dependencies>
<properties>