Author: sannegrinovero
Date: 2008-11-09 11:23:24 -0500 (Sun, 09 Nov 2008)
New Revision: 15536
Added:
search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java
Modified:
search/trunk/src/java/org/hibernate/search/engine/FilterDef.java
search/trunk/src/java/org/hibernate/search/filter/AndDocIdSet.java
search/trunk/src/java/org/hibernate/search/filter/ChainedFilter.java
Log:
HSEARCH-289 testcases and some speedup
Modified: search/trunk/src/java/org/hibernate/search/engine/FilterDef.java
===================================================================
--- search/trunk/src/java/org/hibernate/search/engine/FilterDef.java 2008-11-09 06:34:45
UTC (rev 15535)
+++ search/trunk/src/java/org/hibernate/search/engine/FilterDef.java 2008-11-09 16:23:24
UTC (rev 15536)
@@ -11,7 +11,7 @@
import org.hibernate.search.annotations.FullTextFilterDef;
/**
- * A wrapper class which encapsualtes all required information to create a defined
filter.
+ * A wrapper class which encapsulates all required information to create a defined
filter.
*
* @author Emmanuel Bernard
*/
Modified: search/trunk/src/java/org/hibernate/search/filter/AndDocIdSet.java
===================================================================
--- search/trunk/src/java/org/hibernate/search/filter/AndDocIdSet.java 2008-11-09 06:34:45
UTC (rev 15535)
+++ search/trunk/src/java/org/hibernate/search/filter/AndDocIdSet.java 2008-11-09 16:23:24
UTC (rev 15536)
@@ -1,30 +1,32 @@
package org.hibernate.search.filter;
import java.io.IOException;
-import java.util.BitSet;
import java.util.List;
import static java.lang.Math.max;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.DocIdBitSet;
+import org.apache.lucene.util.OpenBitSet;
/**
- * A DocIdSet built as applying "AND" operation to a list of other DocIdSet.
+ * A DocIdSet built as applying "AND" operation to a list of other
DocIdSet(s).
* The DocIdSetIterator returned will return only document ids contained
- * in all DocIdSet handed to the constructor.
+ * in all DocIdSet(s) handed to the constructor.
*
* @author Sanne Grinovero
*/
public class AndDocIdSet extends DocIdSet {
- private DocIdBitSet docIdBitSet;
+ private OpenBitSet docIdBitSet;
private final List<DocIdSet> andedDocIdSets;
- public AndDocIdSet(List<DocIdSet> andedDocIdSets) {
+ private final int maxDocNumber;
+
+ public AndDocIdSet(List<DocIdSet> andedDocIdSets, int maxDocs) {
if ( andedDocIdSets == null || andedDocIdSets.size() < 2 )
- throw new IllegalArgumentException( "To \"and\" some DocIdSet they
should be at least 2" );
+ throw new IllegalArgumentException( "To \"and\" some DocIdSet(s) they
should be at least 2" );
this.andedDocIdSets = andedDocIdSets;
+ this.maxDocNumber = maxDocs;
}
private synchronized void buildBitset() throws IOException {
@@ -33,71 +35,72 @@
//TODO if some andedDocIdSets are DocIdBitSet, merge them first.
int size = andedDocIdSets.size();
DocIdSetIterator[] iterators = new DocIdSetIterator[size];
- int[] positions = new int[size];
boolean valuesExist = true;
- int maxIndex = 0;
for (int i=0; i<size; i++) {
// build all iterators
- DocIdSetIterator iterator = andedDocIdSets.get(i).iterator();
- iterators[i] = iterator;
- // and move to first position
- boolean nextExists = iterator.next();
- if ( ! nextExists ) {
- valuesExist = false;
- break;
- }
- int currentFilterValue = iterator.doc();
- positions[i] = currentFilterValue;
- // find the initial maximum position
- maxIndex = max( maxIndex, currentFilterValue );
+ iterators[i] = andedDocIdSets.get(i).iterator();
}
- BitSet bitSet = new BitSet();
+ OpenBitSet bitSet;
if ( valuesExist ) { // skip further processing if some idSet is empty
- do {
- if ( allSame( positions ) ) {
- // enable a bit if all idSets agree on it:
- bitSet.set( maxIndex );
- maxIndex++;
- }
- maxIndex = advance( iterators, positions, maxIndex );
- } while ( maxIndex != -1 ); // -1 means the end of some bitSet has been reached (end
condition)
+ bitSet = new OpenBitSet( maxDocNumber );
+ markBitSetOnAgree( iterators, bitSet );
}
- docIdBitSet = new DocIdBitSet( bitSet );
+ else {
+ bitSet = new OpenBitSet(); //TODO a less expensive "empty"
+ }
+ docIdBitSet = bitSet;
}
- /**
- * Have all DocIdSetIterator having current doc id minor than currentMaxPosition
- * skip to at least this position.
- * @param iterators
- * @param positions
- * @return maximum position of all DocIdSetIterator after the operation, or -1 when at
least one reached the end.
- * @throws IOException
- */
- private final int advance(final DocIdSetIterator[] iterators, final int[] positions, int
currentMaxPosition) throws IOException {
- for (int i=0; i<positions.length; i++) {
- if ( positions[i] != currentMaxPosition ) {
- boolean validPosition = iterators[i].skipTo( currentMaxPosition );
- if ( ! validPosition )
- return -1;
- positions[i] = iterators[i].doc();
- currentMaxPosition = max( currentMaxPosition, positions[i] );
+ private final void markBitSetOnAgree(final DocIdSetIterator[] iterators, final
OpenBitSet result) throws IOException {
+ final int iteratorSize = iterators.length;
+ int targetPosition = Integer.MIN_VALUE;
+ int votes = 0;
+ // Each iterator can vote "ok" for the current target to
+ // be reached; when all agree the bit is set.
+ // if an iterator disagrees (it jumped longer), it's current position becomes the
new targetPosition
+ // for the others and he is considered "first" in the voting round (every
iterator votes for himself ;-)
+ int i = 0;
+ //iterator initialize, just one "next" for each DocIdSetIterator
+ for ( ;i<iteratorSize; i++ ) {
+ final DocIdSetIterator iterator = iterators[i];
+ if ( ! iterator.next() ) return; //current iterator has no values, so skip all
+ final int position = iterator.doc();
+ if ( targetPosition==position ) {
+ votes++; //stopped as same position of others
}
+ else {
+ targetPosition = max( targetPosition, position );
+ if (targetPosition==position) //means it changed
+ votes=1;
+ }
}
- return currentMaxPosition;
- }
-
- /**
- * see if all DocIdSetIterator stopped at the same position.
- * @param positions the array of current positions.
- * @return true if all DocIdSetIterator agree on the current docId.
- */
- private final boolean allSame(final int[] positions) {
- int base = positions[0];
- for (int i=1; i<positions.length; i++) {
- if ( base != positions[i] )
- return false;
+ // end iterator initialize
+ if (votes==iteratorSize) {
+ result.fastSet( targetPosition );
+ targetPosition++;
}
- return true;
+ i=0;
+ votes=0; //could be smarted but would make the code even more complex for a minor
optimization out of cycle.
+ // enter main loop:
+ while ( true ) {
+ final DocIdSetIterator iterator = iterators[i];
+ final boolean validPosition = iterator.skipTo( targetPosition );
+ if ( ! validPosition )
+ return; //exit condition
+ final int position = iterator.doc();
+ if ( position == targetPosition ) {
+ if ( ++votes == iteratorSize ) {
+ result.fastSet( position );
+ votes = 0;
+ targetPosition++;
+ }
+ }
+ else {
+ votes = 1;
+ targetPosition = position;
+ }
+ i = ++i % iteratorSize;
+ }
}
@Override
Modified: search/trunk/src/java/org/hibernate/search/filter/ChainedFilter.java
===================================================================
--- search/trunk/src/java/org/hibernate/search/filter/ChainedFilter.java 2008-11-09
06:34:45 UTC (rev 15535)
+++ search/trunk/src/java/org/hibernate/search/filter/ChainedFilter.java 2008-11-09
16:23:24 UTC (rev 15536)
@@ -52,7 +52,7 @@
for ( Filter f : chainedFilters ) {
subSets.add( f.getDocIdSet( reader ) );
}
- return new AndDocIdSet( subSets );
+ return new AndDocIdSet( subSets, reader.maxDoc() );
}
}
Added: search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java
===================================================================
--- search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java
(rev 0)
+++
search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java 2008-11-09
16:23:24 UTC (rev 15536)
@@ -0,0 +1,246 @@
+package org.hibernate.search.test.filter;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.DocIdBitSet;
+import org.hibernate.search.filter.AndDocIdSet;
+
+import junit.framework.TestCase;
+
+/**
+ * Functionality testcase for org.hibernate.search.filter.AndDocIdSet.
+ * There is a main method to run some very approximate performance
+ * comparisons with the use of java.util.BitSet ands.
+ * The numbers show the AndDocIdSet should be used only when it's not
+ * possible to rely on a BitSet; in this class however we use BitSet
+ * as it's useful to test the implementation.
+ *
+ * @see AndDocIdSet
+ * @see BitSet
+ * @author Sanne Grinovero
+ */
+public class AndDocIdSetsTest extends TestCase {
+
+ static final List<Integer> testDataFrom0to9 = toImmutableList(
0,1,2,3,4,5,6,7,8,9 );
+ static final List<Integer> testDataFrom1to10 = toImmutableList(
1,2,3,4,5,6,7,8,9,10 );
+ static final List<Integer> testDataFrom1to9 = toImmutableList( 1,2,3,4,5,6,7,8,9
);
+
+ private static final List<Integer> toImmutableList(int... is) {
+ List<Integer> l = new ArrayList<Integer>( is.length );
+ for ( int i=0; i<is.length; i++ ) {
+ l.add( Integer.valueOf( is[i] ) );
+ }
+ return Collections.unmodifiableList( l );
+ }
+
+ @SuppressWarnings("unchecked")
+ List<Integer> andLists(List<Integer>... lists) {
+ if (lists.length==0) {
+ return Collections.EMPTY_LIST;
+ }
+ List<Integer> result = new ArrayList<Integer>( lists[0] );
+ for (int i=1; i<lists.length; i++) {
+ result.retainAll( lists[i] );
+ }
+ return result;
+ }
+
+ // auto-testing of test utility methods for AND operations on test arrays
+ @SuppressWarnings("unchecked")
+ public void testAndingArrays() {
+ List<Integer> andLists = andLists( testDataFrom0to9, testDataFrom1to10 );
+ assertTrue( andLists.containsAll( testDataFrom1to9 ) );
+ assertFalse( andLists.contains( Integer.valueOf( 0 ) ) );
+ assertFalse( andLists.contains( Integer.valueOf( 10 ) ) );
+ assertTrue( andLists.equals( testDataFrom1to9 ) );
+ DocIdSet docIdSet0_9 = arrayToDocIdSet(testDataFrom0to9);
+ DocIdSet docIdSet1_10 = arrayToDocIdSet(testDataFrom1to10);
+ DocIdSet docIdSet1_9 = arrayToDocIdSet(testDataFrom1to9);
+ assertTrue( docIdSetsEqual( docIdSet0_9, docIdSet0_9 ) );
+ assertTrue( docIdSetsEqual( docIdSet1_10, docIdSet1_10 ) );
+ assertFalse( docIdSetsEqual( docIdSet1_10, docIdSet1_9 ) );
+ assertFalse( docIdSetsEqual( docIdSet0_9, docIdSet1_9 ) );
+ }
+
+ // auto-testing of test utility methods for conversion in DocIdSetIterator
+ public void testIteratorMatchesTestArray() throws IOException {
+ DocIdSet docIdSet0_9 = arrayToDocIdSet(testDataFrom0to9);
+ DocIdSetIterator docIdSetIterator = docIdSet0_9.iterator();
+ assertTrue( docIdSetIterator.next() );
+ assertEquals( 0, docIdSetIterator.doc() );
+ assertTrue( docIdSetIterator.skipTo(9) );
+ assertFalse( docIdSetIterator.skipTo(10) );
+ }
+
+ public void testAndDocIdSets() {
+ List<DocIdSet> filters = new ArrayList<DocIdSet>( 2 );
+ filters.add( arrayToDocIdSet( testDataFrom0to9 ) );
+ filters.add( arrayToDocIdSet( testDataFrom1to10 ) );
+ DocIdSet expected = arrayToDocIdSet( testDataFrom1to9 );
+ DocIdSet testedSet = new AndDocIdSet( filters, 10 );
+ assertTrue( docIdSetsEqual( expected, testedSet ) );
+ }
+
+ public void testOnRandomBigArrays(){
+ onRandomBigArraysTest( 13L );
+ onRandomBigArraysTest( 9L );
+ onRandomBigArraysTest( 71L );
+ }
+
+ public void onRandomBigArraysTest(long randomSeed) {
+ List<BitSet> filtersData = makeRandomBitSetList( randomSeed, 4, 1000000, 1500000
);
+ BitSet expectedBitset = applyANDOnBitSets( filtersData );
+ List<DocIdSet> filters = toDocIdSetList( filtersData );
+ DocIdBitSet expectedDocIdSet = new DocIdBitSet( expectedBitset );
+ DocIdSet testedSet = new AndDocIdSet( filters, 1500000 );
+ assertTrue( docIdSetsEqual(expectedDocIdSet, testedSet) );
+ }
+
+ private static List<DocIdSet> toDocIdSetList(List<BitSet> filtersData) {
+ List<DocIdSet> docIdSets = new ArrayList<DocIdSet>( filtersData.size() );
+ for (BitSet bitSet : filtersData) {
+ docIdSets.add ( new DocIdBitSet(bitSet) );
+ }
+ return docIdSets;
+ }
+
+ public static void main(String[] args) throws IOException {
+ compareAndingPerformance( 8, 1000000, 1500000 );
+ compareAndingPerformance( 4, 1000000, 1500000 );
+ compareAndingPerformance( 2, 1000000, 1500000 );
+ compareAndingPerformance( 2, 100000000, 150000000 );
+ compareAndingPerformance( 4, 100000000, 150000000 );
+ compareAndingPerformance( 8, 100000000, 150000000 );
+ }
+
+ private static void compareAndingPerformance(final int listSize,
+ final int minBitsSize, final int maxBitsSize) throws IOException {
+ List<BitSet> filtersData = makeRandomBitSetList( 13L, listSize, minBitsSize,
maxBitsSize );
+ DocIdSet andedByBitsResult = null;
+ DocIdSet andedByIterationResult = null;
+ {
+ long startTime = System.currentTimeMillis();
+ for ( int i=0; i<1000; i++ ) {
+ BitSet expectedBitset = applyANDOnBitSets( filtersData );
+ andedByBitsResult = new DocIdBitSet( expectedBitset );
+ // iteration is needed to have a fair comparison with other impl:
+ iterateOnResults( andedByBitsResult );
+ }
+ long totalTimeMs = System.currentTimeMillis() - startTime;
+ System.out.println( "Time to \"AND " + listSize +
+ " BitSets and iterate on results\" 1000 times: " +
+ totalTimeMs + "ms. (" +
+ minBitsSize +" minimum BitSet size)");
+ }
+ List<DocIdSet> docIdSetList = toDocIdSetList( filtersData );
+ {
+ long startTime = System.currentTimeMillis();
+ for ( int i=0; i<1000; i++ ) {
+ andedByIterationResult = new AndDocIdSet( docIdSetList, maxBitsSize );
+ // iteration is needed because the AND is been done lazily on iterator access:
+ iterateOnResults( andedByIterationResult );
+ }
+ long totalTimeMs = System.currentTimeMillis() - startTime;
+ System.out.println( "Time to \"use AndDocIdSet iterator on " + listSize
+
+ " Filters and iterate on results\" 1000 times: " +
+ totalTimeMs + "ms. (" +
+ minBitsSize +" minimum BitSet size)");
+ }
+ System.out.println(" Results are same: " + docIdSetsEqual( andedByBitsResult,
andedByIterationResult ) );
+ }
+
+ private static int iterateOnResults(DocIdSet docIdBitSet) throws IOException {
+ DocIdSetIterator iterator = docIdBitSet.iterator();
+ int i = 0;
+ while ( iterator.next() ) {
+ i += iterator.doc();
+ }
+ return i;
+ }
+
+ private static final BitSet applyANDOnBitSets(final List<BitSet> filtersData) {
+ BitSet andedBitSet = null;
+ for (BitSet bits : filtersData) {
+ if ( andedBitSet==null ) {
+ andedBitSet = (BitSet) bits.clone();
+ }
+ else {
+ andedBitSet.and( bits );
+ }
+ }
+ return andedBitSet;
+ }
+
+ private static List<BitSet> makeRandomBitSetList(final long randomSeed, final int
listSize,
+ final int minBitsSize, final int maxBitsSize) {
+ Random r = new Random( randomSeed ); //have a fixed Seed for repeatable tests
+ List<BitSet> restulList = new ArrayList<BitSet>( listSize );
+ for (int i=0; i<listSize; i++) {
+ int arraySize = minBitsSize + r.nextInt( maxBitsSize-minBitsSize );
+ restulList.add( makeRandomBitSet( r, arraySize) );
+ }
+ return restulList;
+ }
+
+ private static BitSet makeRandomBitSet(final Random randomSource, final int maxSize){
+ BitSet bitSet = new BitSet();
+ for ( int datai=0; datai<maxSize; datai++ ) {
+ // each bit has 50% change to be set:
+ if ( randomSource.nextBoolean() ) bitSet.set( datai );
+ }
+ return bitSet;
+ }
+
+ /**
+ * converts a list of Integers representing Document ids
+ * into a Lucene DocIdSet
+ * @param docIdList
+ * @return
+ */
+ public DocIdSet arrayToDocIdSet(List<Integer> docIdList) {
+ BitSet bitset = new BitSet();
+ for (int i : docIdList) {
+ bitset.set(i);
+ }
+ return new DocIdBitSet(bitset);
+ }
+
+ /**
+ * @param a
+ * @param b
+ * @return true if the two DocIdSet are equal: contain the same number of ids, same
order and all are equal
+ */
+ public static final boolean docIdSetsEqual(DocIdSet expected, DocIdSet tested) {
+ DocIdSetIterator iterA = expected.iterator();
+ DocIdSetIterator iterB = tested.iterator();
+ boolean nextA = false;
+ boolean nextB = false;
+ try{
+ do {
+ nextA = iterA.next();
+ nextB = iterB.next();
+ if ( nextA!=nextB ) {
+ return false;
+ }
+ else if ( nextA==false) {
+ return true;
+ }
+ else if ( iterA.doc() != iterB.doc() ) {
+ return false;
+ }
+ } while ( nextA && nextB );
+ }
+ catch (IOException ioe) {
+ fail( "these DocIdSetIterator instances should not throw any exceptions" );
+ }
+ return true;
+ }
+
+}
Property changes on:
search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java
___________________________________________________________________
Name: svn:executable
+ *