Author: skabashnyuk
Date: 2009-10-01 05:14:03 -0400 (Thu, 01 Oct 2009)
New Revision: 190
Added:
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/spell/
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/spell/LuceneSpellChecker.java
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/synonym/
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/synonym/WordNetSynonyms.java
Modified:
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/SpellChecker.java
Log:
EXOJCR-161 : Spell checker and synonym provider
Modified:
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/SpellChecker.java
===================================================================
--- jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/SpellChecker.java 2009-10-01 09:01:38 UTC (rev 189)
+++ jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/SpellChecker.java 2009-10-01 09:14:03 UTC (rev 190)
@@ -16,23 +16,26 @@
*/
package org.exoplatform.services.jcr.impl.core.query.lucene;
-import java.io.IOException;
-
import org.exoplatform.services.jcr.impl.core.query.QueryHandler;
import org.exoplatform.services.jcr.impl.core.query.QueryRootNode;
+import java.io.IOException;
+
+import javax.jcr.RepositoryException;
+
/**
- * <code>SpellChecker</code> defines an interface to run a spellchecker over
- * a fulltext query statement.
+ * <code>SpellChecker</code> defines an interface to run a spellchecker over a
+ * fulltext query statement.
*/
public interface SpellChecker {
/**
* Initializes this spell checker with an abstract query tree.
- *
- * @param handler the query handler that created this spell checker.
- * @throws IOException if an error occurs while initializing the spell
- * checker.
+ *
+ * @param handler
+ * the query handler that created this spell checker.
+ * @throws IOException
+ * if an error occurs while initializing the spell checker.
*/
void init(QueryHandler handler) throws IOException;
@@ -42,14 +45,16 @@
* spellchecker thinks the words are misspelled. If the spellchecker
* determines that the words are spelled correctly <code>null</code> is
* returned.
- *
- * @param aqt the abstract query tree, which may contain a relation query
+ *
+ * @param aqt
+ * the abstract query tree, which may contain a relation query
* node with a spellcheck operation.
* @return a suggestion or <code>null</code> if this spell checker
* determines that the fulltext query statement is spelled
* correctly.
+ * @throws RepositoryException
*/
- String check(QueryRootNode aqt) throws IOException;
+ String check(QueryRootNode aqt) throws IOException, RepositoryException;
/**
* Closes this spell checker and allows it to free resources.
Added:
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/spell/LuceneSpellChecker.java
===================================================================
--- jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/spell/LuceneSpellChecker.java (rev 0)
+++ jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/spell/LuceneSpellChecker.java 2009-10-01 09:14:03 UTC (rev 190)
@@ -0,0 +1,436 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.exoplatform.services.jcr.impl.core.query.lucene.spell;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.spell.Dictionary;
+import org.apache.lucene.search.spell.LuceneDictionary;
+import org.apache.lucene.search.spell.SpellChecker;
+import org.apache.lucene.store.AlreadyClosedException;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.NativeFSLockFactory;
+import org.exoplatform.services.jcr.impl.core.query.QueryHandler;
+import org.exoplatform.services.jcr.impl.core.query.QueryRootNode;
+import org.exoplatform.services.jcr.impl.core.query.RelationQueryNode;
+import org.exoplatform.services.jcr.impl.core.query.TraversingQueryNodeVisitor;
+import org.exoplatform.services.jcr.impl.core.query.lucene.FieldNames;
+import org.exoplatform.services.jcr.impl.core.query.lucene.SearchIndex;
+import org.exoplatform.services.log.ExoLogger;
+import org.exoplatform.services.log.Log;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+
+import javax.jcr.RepositoryException;
+
+/**
+ * <code>LuceneSpellChecker</code> implements a spell checker based on the terms
+ * present in a lucene index.
+ */
+public class LuceneSpellChecker implements
+ org.exoplatform.services.jcr.impl.core.query.lucene.SpellChecker {
+
+ /**
+ * Logger instance for this class.
+ */
+ private static final Log log = ExoLogger
+ .getLogger(LuceneSpellChecker.class);
+
+ public static final class FiveSecondsRefreshInterval extends
+ LuceneSpellChecker {
+ public FiveSecondsRefreshInterval() {
+ super(5 * 1000);
+ }
+ }
+
+ public static final class OneMinuteRefreshInterval extends
+ LuceneSpellChecker {
+ public OneMinuteRefreshInterval() {
+ super(60 * 1000);
+ }
+ }
+
+ public static final class FiveMinutesRefreshInterval extends
+ LuceneSpellChecker {
+ public FiveMinutesRefreshInterval() {
+ super(5 * 60 * 1000);
+ }
+ }
+
+ public static final class ThirtyMinutesRefreshInterval extends
+ LuceneSpellChecker {
+ public ThirtyMinutesRefreshInterval() {
+ super(30 * 60 * 1000);
+ }
+ }
+
+ public static final class OneHourRefreshInterval extends LuceneSpellChecker {
+ public OneHourRefreshInterval() {
+ super(60 * 60 * 1000);
+ }
+ }
+
+ public static final class SixHoursRefreshInterval extends
+ LuceneSpellChecker {
+ public SixHoursRefreshInterval() {
+ super(6 * 60 * 60 * 1000);
+ }
+ }
+
+ public static final class TwelveHoursRefreshInterval extends
+ LuceneSpellChecker {
+ public TwelveHoursRefreshInterval() {
+ super(12 * 60 * 60 * 1000);
+ }
+ }
+
+ public static final class OneDayRefreshInterval extends LuceneSpellChecker {
+ public OneDayRefreshInterval() {
+ super(24 * 60 * 60 * 1000);
+ }
+ }
+
+ /**
+ * The internal spell checker.
+ */
+ private InternalSpellChecker spellChecker;
+
+ /**
+ * The refresh interval.
+ */
+ private final long refreshInterval;
+
+ /**
+ * Spell checker with a default refresh interval of one hour.
+ */
+ public LuceneSpellChecker() {
+ this(60 * 60 * 1000); // default refresh interval: one hour
+ }
+
+ protected LuceneSpellChecker(long refreshInterval) {
+ this.refreshInterval = refreshInterval;
+ }
+
+ /**
+ * Initializes this spell checker.
+ *
+ * @param handler
+ * the query handler that created this spell checker.
+ * @throws IOException
+ * if <code>handler</code> is not of type {@link SearchIndex}.
+ */
+ public void init(QueryHandler handler) throws IOException {
+ if (handler instanceof SearchIndex) {
+ this.spellChecker = new InternalSpellChecker((SearchIndex) handler);
+ } else {
+ throw new IOException("LuceneSpellChecker only works with "
+ + SearchIndex.class.getName());
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ *
+ * @throws RepositoryException
+ */
+ public String check(QueryRootNode aqt) throws IOException,
+ RepositoryException {
+ String stmt = getFulltextStatement(aqt);
+ if (stmt == null) {
+ // no spellcheck operation in query
+ return null;
+ }
+ return spellChecker.suggest(stmt);
+ }
+
+ public void close() {
+ spellChecker.close();
+ }
+
+ // ------------------------------< internal
+ // >--------------------------------
+
+ /**
+ * Returns the fulltext statement of a spellcheck relation query node or
+ * <code>null</code> if none exists in the abstract query tree.
+ *
+ * @param aqt
+ * the abstract query tree.
+ * @return the fulltext statement or <code>null</code>.
+ * @throws RepositoryException
+ */
+ private String getFulltextStatement(QueryRootNode aqt)
+ throws RepositoryException {
+ final String[] stmt = new String[1];
+ aqt.accept(new TraversingQueryNodeVisitor() {
+ public Object visit(RelationQueryNode node, Object o)
+ throws RepositoryException {
+ if (stmt[0] == null
+ && node.getOperation() == RelationQueryNode.OPERATION_SPELLCHECK) {
+ stmt[0] = node.getStringValue();
+ }
+ return super.visit(node, o);
+ }
+ }, null);
+ return stmt[0];
+ }
+
+ private final class InternalSpellChecker {
+
+ /**
+ * Timestamp when the last refresh was done.
+ */
+ private long lastRefresh;
+
+ /**
+ * Set to true while a refresh is done in a separate thread.
+ */
+ private boolean refreshing = false;
+
+ /**
+ * The query handler associated with this spell checker.
+ */
+ private final SearchIndex handler;
+
+ /**
+ * The directory where the spell index is stored.
+ */
+ private final Directory spellIndexDirectory;
+
+ /**
+ * The underlying spell checker.
+ */
+ private SpellChecker spellChecker;
+
+ /**
+ * Creates a new internal spell checker.
+ *
+ * @param handler
+ * the associated query handler.
+ */
+ InternalSpellChecker(SearchIndex handler) throws IOException {
+ this.handler = handler;
+ String path = handler.getContext().getIndexDirectory()
+ + File.separatorChar + "spellchecker";
+ this.spellIndexDirectory = FSDirectory.getDirectory(path,
+ new NativeFSLockFactory(path));
+ if (IndexReader.indexExists(spellIndexDirectory)) {
+ this.lastRefresh = System.currentTimeMillis();
+ }
+ this.spellChecker = new SpellChecker(spellIndexDirectory);
+ refreshSpellChecker();
+ }
+
+ /**
+ * Checks a fulltext query statement and suggests a spell checked
+ * version of the statement. If the spell checker thinks the spelling is
+ * correct <code>null</code> is returned.
+ *
+ * @param statement
+ * the fulltext query statement.
+ * @return a suggestion or <code>null</code>.
+ */
+ String suggest(String statement) throws IOException {
+ // tokenize the statement (field name doesn't matter actually...)
+ List<String> words = new ArrayList<String>();
+ List<Token> tokens = new ArrayList<Token>();
+ tokenize(statement, words, tokens);
+
+ String[] suggestions = check(words
+ .toArray(new String[words.size()]));
+ if (suggestions != null) {
+ // replace words in statement in reverse order because length
+ // of statement will change
+ StringBuffer sb = new StringBuffer(statement);
+ for (int i = suggestions.length - 1; i >= 0; i--) {
+ Token t = tokens.get(i);
+ // only replace if word actually changed
+ if (!t.termText().equalsIgnoreCase(suggestions[i])) {
+ sb.replace(t.startOffset(), t.endOffset(),
+ suggestions[i]);
+ }
+ }
+ return sb.toString();
+ } else {
+ return null;
+ }
+ }
+
+ void close() {
+ try {
+ spellIndexDirectory.close();
+ } catch (IOException e) {
+ // ignore
+ }
+ // urgh, the lucene spell checker cannot be closed explicitly.
+ // finalize will close the reader...
+ spellChecker = null;
+ }
+
+ /**
+ * Tokenizes the statement into words and tokens.
+ *
+ * @param statement
+ * the fulltext query statement.
+ * @param words
+ * this list will be filled with the original words extracted
+ * from the statement.
+ * @param tokens
+ * this list will be filled with the tokens parsed from the
+ * statement.
+ * @throws IOException
+ * if an error occurs while parsing the statement.
+ */
+ private void tokenize(String statement, List<String> words,
+ List<Token> tokens) throws IOException {
+ TokenStream ts = handler.getTextAnalyzer().tokenStream(
+ FieldNames.FULLTEXT, new StringReader(statement));
+ try {
+ Token t;
+ while ((t = ts.next()) != null) {
+ String origWord = statement.substring(t.startOffset(), t
+ .endOffset());
+ if (t.getPositionIncrement() > 0) {
+ words.add(t.termText());
+ tokens.add(t);
+ } else {
+ // very simple implementation: use termText with length
+ // closer to original word
+ Token current = tokens.get(tokens.size() - 1);
+ if (Math.abs(origWord.length()
+ - current.termText().length()) > Math
+ .abs(origWord.length() - t.termText().length())) {
+ // replace current token and word
+ words.set(words.size() - 1, t.termText());
+ tokens.set(tokens.size() - 1, t);
+ }
+ }
+ }
+ } finally {
+ ts.close();
+ }
+ }
+
+ /**
+ * Checks the spelling of the passed <code>words</code> and returns a
+ * suggestion.
+ *
+ * @param words
+ * the words to check.
+ * @return a suggestion of correctly spelled <code>words</code> or
+ * <code>null</code> if this spell checker thinks
+ * <code>words</code> are spelled correctly.
+ * @throws IOException
+ * if an error occurs while spell checking.
+ */
+ private String[] check(String words[]) throws IOException {
+ refreshSpellChecker();
+ boolean hasSuggestion = false;
+ IndexReader reader = handler.getIndexReader();
+ try {
+ for (int retries = 0; retries < 100; retries++) {
+ try {
+ String[] suggestion = new String[words.length];
+ for (int i = 0; i < words.length; i++) {
+ String[] similar = spellChecker.suggestSimilar(
+ words[i], 5, reader, FieldNames.FULLTEXT,
+ true);
+ if (similar.length > 0) {
+ suggestion[i] = similar[0];
+ hasSuggestion = true;
+ } else {
+ suggestion[i] = words[i];
+ }
+ }
+ if (hasSuggestion) {
+ log.debug("Successful after "
+ + new Integer(retries) + " retries");
+ return suggestion;
+ } else {
+ return null;
+ }
+ } catch (AlreadyClosedException e) {
+ // it may happen that the index reader inside the
+ // spell checker is closed while searching for
+ // suggestions. this is actually a design flaw in the
+ // lucene spell checker, but for now we simply retry
+ }
+ }
+ // unsuccessful after retries
+ return null;
+ } finally {
+ reader.close();
+ }
+ }
+
+ /**
+ * Refreshes the underlying spell checker in a background thread.
+ * Synchronization is done on this <code>InternalSpellChecker</code>
+ * instance. While the refresh takes place {@link #refreshing} is set to
+ * <code>true</code>.
+ */
+ private void refreshSpellChecker() {
+ if (lastRefresh + refreshInterval < System.currentTimeMillis()) {
+ synchronized (this) {
+ if (refreshing) {
+ return;
+ } else {
+ refreshing = true;
+ Runnable refresh = new Runnable() {
+ public void run() {
+ try {
+ IndexReader reader = handler
+ .getIndexReader();
+ try {
+ long time = System.currentTimeMillis();
+ Dictionary dict = new LuceneDictionary(
+ reader, FieldNames.FULLTEXT);
+ log
+ .debug("Starting spell checker index refresh");
+ spellChecker.indexDictionary(dict);
+ time = System.currentTimeMillis()
+ - time;
+ time = time / 1000;
+ log
+ .info("Spell checker index refreshed in: "
+ + new Long(time)
+ + " s.");
+ } finally {
+ reader.close();
+ synchronized (InternalSpellChecker.this) {
+ refreshing = false;
+ }
+ }
+ } catch (IOException e) {
+ // ignore
+ }
+ }
+ };
+ new Thread(refresh, "SpellChecker Refresh").start();
+ lastRefresh = System.currentTimeMillis();
+ }
+ }
+ }
+ }
+ }
+}
Property changes on:
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/spell/LuceneSpellChecker.java
___________________________________________________________________
Name: svn:mime-type
+ text/plain
Added:
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/synonym/WordNetSynonyms.java
===================================================================
--- jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/synonym/WordNetSynonyms.java (rev 0)
+++ jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/synonym/WordNetSynonyms.java 2009-10-01 09:14:03 UTC (rev 190)
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.exoplatform.services.jcr.impl.core.query.lucene.synonym;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.lucene.index.memory.SynonymMap;
+
+import org.exoplatform.services.jcr.impl.core.query.lucene.SynonymProvider;
+
+/**
+ * <code>WordNetSynonyms</code> implements a {@link SynonymProvider} that is backed by the WordNet
+ * prolog file <a href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl</a>.
+ */
+public class WordNetSynonyms implements SynonymProvider
+{
+
+ /**
+ * The synonym map or <code>null</code> if an error occurred while reading the prolog file.
+ */
+ private SynonymMap SYNONYM_MAP;
+
+ /**
+ * {@inheritDoc}
+ */
+ public void initialize(InputStream configuration) throws IOException
+ {
+
+ SynonymMap sm = null;
+ try
+ {
+ sm = new SynonymMap(configuration);
+ }
+ catch (IOException e)
+ {
+ // ignore
+ }
+ SYNONYM_MAP = sm;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public String[] getSynonyms(String string)
+ {
+ if (SYNONYM_MAP != null)
+ {
+ return SYNONYM_MAP.getSynonyms(string.toLowerCase());
+ }
+ else
+ {
+ return new String[0];
+ }
+ }
+}
Property changes on:
jcr/trunk/component/core/src/main/java/org/exoplatform/services/jcr/impl/core/query/lucene/synonym/WordNetSynonyms.java
___________________________________________________________________
Name: svn:mime-type
+ text/plain