[hibernate-commits] Hibernate SVN: r15536 - in search/trunk/src: java/org/hibernate/search/filter and 1 other directories.

hibernate-commits at lists.jboss.org hibernate-commits at lists.jboss.org
Sun Nov 9 11:23:25 EST 2008


Author: sannegrinovero
Date: 2008-11-09 11:23:24 -0500 (Sun, 09 Nov 2008)
New Revision: 15536

Added:
   search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java
Modified:
   search/trunk/src/java/org/hibernate/search/engine/FilterDef.java
   search/trunk/src/java/org/hibernate/search/filter/AndDocIdSet.java
   search/trunk/src/java/org/hibernate/search/filter/ChainedFilter.java
Log:
HSEARCH-289 testcases and some speedup

Modified: search/trunk/src/java/org/hibernate/search/engine/FilterDef.java
===================================================================
--- search/trunk/src/java/org/hibernate/search/engine/FilterDef.java	2008-11-09 06:34:45 UTC (rev 15535)
+++ search/trunk/src/java/org/hibernate/search/engine/FilterDef.java	2008-11-09 16:23:24 UTC (rev 15536)
@@ -11,7 +11,7 @@
 import org.hibernate.search.annotations.FullTextFilterDef;
 
 /**
- * A wrapper class which encapsualtes all required information to create a defined filter.
+ * A wrapper class which encapsulates all required information to create a defined filter.
  * 
  * @author Emmanuel Bernard
  */

Modified: search/trunk/src/java/org/hibernate/search/filter/AndDocIdSet.java
===================================================================
--- search/trunk/src/java/org/hibernate/search/filter/AndDocIdSet.java	2008-11-09 06:34:45 UTC (rev 15535)
+++ search/trunk/src/java/org/hibernate/search/filter/AndDocIdSet.java	2008-11-09 16:23:24 UTC (rev 15536)
@@ -1,30 +1,32 @@
 package org.hibernate.search.filter;
 
 import java.io.IOException;
-import java.util.BitSet;
 import java.util.List;
 import static java.lang.Math.max;
 
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.util.DocIdBitSet;
+import org.apache.lucene.util.OpenBitSet;
 
 /**
- * A DocIdSet built as applying "AND" operation to a list of other DocIdSet.
+ * A DocIdSet built as applying "AND" operation to a list of other DocIdSet(s).
  * The DocIdSetIterator returned will return only document ids contained
- * in all DocIdSet handed to the constructor.
+ * in all DocIdSet(s) handed to the constructor.
  * 
  * @author Sanne Grinovero
  */
 public class AndDocIdSet extends DocIdSet {
 	
-	private DocIdBitSet docIdBitSet;
+	private OpenBitSet docIdBitSet;
 	private final List<DocIdSet> andedDocIdSets;
 	
-	public AndDocIdSet(List<DocIdSet> andedDocIdSets) {
+	private final int maxDocNumber;
+	
+	public AndDocIdSet(List<DocIdSet> andedDocIdSets, int maxDocs) {
 		if ( andedDocIdSets == null || andedDocIdSets.size() < 2 )
-			throw new IllegalArgumentException( "To \"and\" some DocIdSet they should be at least 2" );
+			throw new IllegalArgumentException( "To \"and\" some DocIdSet(s) they should be at least 2" );
 		this.andedDocIdSets = andedDocIdSets;
+		this.maxDocNumber = maxDocs;
 	}
 	
 	private synchronized void buildBitset() throws IOException {
@@ -33,71 +35,72 @@
 		//TODO if some andedDocIdSets are DocIdBitSet, merge them first.
 		int size = andedDocIdSets.size();
 		DocIdSetIterator[] iterators = new DocIdSetIterator[size];
-		int[] positions = new int[size];
 		boolean valuesExist = true;
-		int maxIndex = 0;
 		for (int i=0; i<size; i++) {
 			// build all iterators
-			DocIdSetIterator iterator = andedDocIdSets.get(i).iterator();
-			iterators[i] = iterator;
-			// and move to first position
-			boolean nextExists = iterator.next();
-			if ( ! nextExists ) {
-				valuesExist = false;
-				break;
-			}
-			int currentFilterValue = iterator.doc();
-			positions[i] = currentFilterValue;
-			// find the initial maximum position
-			maxIndex = max( maxIndex, currentFilterValue );
+			iterators[i] = andedDocIdSets.get(i).iterator();
 		}
-		BitSet bitSet = new BitSet();
+		OpenBitSet bitSet;
 		if ( valuesExist ) { // skip further processing if some idSet is empty
-			do {
-				if ( allSame( positions ) ) {
-					// enable a bit if all idSets agree on it:
-					bitSet.set( maxIndex );
-					maxIndex++;
-				}
-				maxIndex = advance( iterators, positions, maxIndex );
-			} while ( maxIndex != -1 ); // -1 means the end of some bitSet has been reached (end condition)
+			bitSet = new OpenBitSet( maxDocNumber );
+			markBitSetOnAgree( iterators, bitSet );
 		}
-		docIdBitSet = new DocIdBitSet( bitSet );
+		else {
+			bitSet = new OpenBitSet(); //TODO a less expensive "empty"
+		}
+		docIdBitSet = bitSet;
 	}
 
-	/**
-	 * Have all DocIdSetIterator having current doc id minor than currentMaxPosition
-	 * skip to at least this position.
-	 * @param iterators
-	 * @param positions
-	 * @return maximum position of all DocIdSetIterator after the operation, or -1 when at least one reached the end.
-	 * @throws IOException 
-	 */
-	private final int advance(final DocIdSetIterator[] iterators, final int[] positions, int currentMaxPosition) throws IOException {
-		for (int i=0; i<positions.length; i++) {
-			if ( positions[i] != currentMaxPosition ) {
-				boolean validPosition = iterators[i].skipTo( currentMaxPosition );
-				if ( ! validPosition )
-					return -1;
-				positions[i] = iterators[i].doc();
-				currentMaxPosition = max( currentMaxPosition, positions[i] );
+	private final void markBitSetOnAgree(final DocIdSetIterator[] iterators, final OpenBitSet result) throws IOException {
+		final int iteratorSize = iterators.length;
+		int targetPosition = Integer.MIN_VALUE;
+		int votes = 0;
+		// Each iterator can vote "ok" for the current target to
+		// be reached; when all agree the bit is set.
+		// if an iterator disagrees (it jumped longer), it's current position becomes the new targetPosition
+		// for the others and he is considered "first" in the voting round (every iterator votes for himself ;-)
+		int i = 0;
+		//iterator initialize, just one "next" for each DocIdSetIterator
+		for ( ;i<iteratorSize; i++ ) {
+			final DocIdSetIterator iterator = iterators[i];
+			if ( ! iterator.next() ) return; //current iterator has no values, so skip all
+			final int position = iterator.doc();
+			if ( targetPosition==position ) {
+				votes++; //stopped as same position of others
 			}
+			else {
+				targetPosition = max( targetPosition, position );
+				if (targetPosition==position) //means it changed
+					votes=1;
+			}
 		}
-		return currentMaxPosition;
-	}
-
-	/**
-	 * see if all DocIdSetIterator stopped at the same position.
-	 * @param positions the array of current positions.
-	 * @return true if all DocIdSetIterator agree on the current docId.
-	 */
-	private final boolean allSame(final int[] positions) {
-		int base = positions[0];
-		for (int i=1; i<positions.length; i++) {
-			if ( base != positions[i] )
-				return false;
+		// end iterator initialize
+		if (votes==iteratorSize) {
+			result.fastSet( targetPosition );
+			targetPosition++;
 		}
-		return true;
+		i=0;
+		votes=0; //could be smarted but would make the code even more complex for a minor optimization out of cycle.
+		// enter main loop:
+		while ( true ) {
+			final DocIdSetIterator iterator = iterators[i];
+			final boolean validPosition = iterator.skipTo( targetPosition );
+			if ( ! validPosition )
+				return; //exit condition
+			final int position = iterator.doc();
+			if ( position == targetPosition ) {
+				if ( ++votes == iteratorSize ) {
+					result.fastSet( position );
+					votes = 0;
+					targetPosition++;
+				}
+			}
+			else {
+				votes = 1;
+				targetPosition = position;
+			}
+			i = ++i % iteratorSize;
+		}
 	}
 
 	@Override

Modified: search/trunk/src/java/org/hibernate/search/filter/ChainedFilter.java
===================================================================
--- search/trunk/src/java/org/hibernate/search/filter/ChainedFilter.java	2008-11-09 06:34:45 UTC (rev 15535)
+++ search/trunk/src/java/org/hibernate/search/filter/ChainedFilter.java	2008-11-09 16:23:24 UTC (rev 15536)
@@ -52,7 +52,7 @@
 			for ( Filter f : chainedFilters ) {
 				subSets.add( f.getDocIdSet( reader ) );
 			}
-			return new AndDocIdSet( subSets );
+			return new AndDocIdSet( subSets, reader.maxDoc() );
 		}
 	}
 

Added: search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java
===================================================================
--- search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java	                        (rev 0)
+++ search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java	2008-11-09 16:23:24 UTC (rev 15536)
@@ -0,0 +1,246 @@
+package org.hibernate.search.test.filter;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.BitSet;
+import java.util.Collections;
+import java.util.List;
+import java.util.Random;
+
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.DocIdBitSet;
+import org.hibernate.search.filter.AndDocIdSet;
+
+import junit.framework.TestCase;
+
+/**
+ * Functionality testcase for org.hibernate.search.filter.AndDocIdSet.
+ * There is a main method to run some very approximate performance
+ * comparisons with the use of java.util.BitSet ands.
+ * The numbers show the AndDocIdSet should be used only when it's not
+ * possible to rely on a BitSet; in this class however we use BitSet
+ * as it's useful to test the implementation.
+ * 
+ * @see AndDocIdSet
+ * @see BitSet
+ * @author Sanne Grinovero
+ */
+public class AndDocIdSetsTest extends TestCase {
+	
+	static final List<Integer> testDataFrom0to9 =  toImmutableList( 0,1,2,3,4,5,6,7,8,9 );
+	static final List<Integer> testDataFrom1to10 = toImmutableList( 1,2,3,4,5,6,7,8,9,10 );
+	static final List<Integer> testDataFrom1to9 = toImmutableList( 1,2,3,4,5,6,7,8,9 );
+
+	private static final List<Integer> toImmutableList(int... is) {
+		List<Integer> l = new ArrayList<Integer>( is.length );
+		for ( int i=0; i<is.length; i++ ) {
+			l.add( Integer.valueOf( is[i] ) );
+		}
+		return Collections.unmodifiableList( l );
+	}
+	
+	@SuppressWarnings("unchecked")
+	List<Integer> andLists(List<Integer>... lists) {
+		if (lists.length==0) {
+			return Collections.EMPTY_LIST;
+		}
+		List<Integer> result = new ArrayList<Integer>( lists[0] );
+		for (int i=1; i<lists.length; i++) {
+			result.retainAll( lists[i] );
+		}
+		return result;
+	}
+	
+	// auto-testing of test utility methods for AND operations on test arrays
+	@SuppressWarnings("unchecked")
+	public void testAndingArrays() {
+		List<Integer> andLists = andLists( testDataFrom0to9, testDataFrom1to10 );
+		assertTrue( andLists.containsAll( testDataFrom1to9 ) );
+		assertFalse( andLists.contains( Integer.valueOf( 0 ) ) );
+		assertFalse( andLists.contains( Integer.valueOf( 10 ) ) );
+		assertTrue( andLists.equals( testDataFrom1to9 ) );
+		DocIdSet docIdSet0_9 = arrayToDocIdSet(testDataFrom0to9);
+		DocIdSet docIdSet1_10 = arrayToDocIdSet(testDataFrom1to10);
+		DocIdSet docIdSet1_9 = arrayToDocIdSet(testDataFrom1to9);
+		assertTrue( docIdSetsEqual( docIdSet0_9, docIdSet0_9 ) );
+		assertTrue( docIdSetsEqual( docIdSet1_10, docIdSet1_10 ) );
+		assertFalse( docIdSetsEqual( docIdSet1_10, docIdSet1_9 ) );
+		assertFalse( docIdSetsEqual( docIdSet0_9, docIdSet1_9 ) );
+	}
+	
+	// auto-testing of test utility methods for conversion in DocIdSetIterator
+	public void testIteratorMatchesTestArray() throws IOException {
+		DocIdSet docIdSet0_9 = arrayToDocIdSet(testDataFrom0to9);
+		DocIdSetIterator docIdSetIterator = docIdSet0_9.iterator();
+		assertTrue( docIdSetIterator.next() );
+		assertEquals( 0, docIdSetIterator.doc() );
+		assertTrue( docIdSetIterator.skipTo(9) );
+		assertFalse( docIdSetIterator.skipTo(10) );
+	}
+	
+	public void testAndDocIdSets() {
+		List<DocIdSet> filters = new ArrayList<DocIdSet>( 2 );
+		filters.add( arrayToDocIdSet( testDataFrom0to9 ) );
+		filters.add( arrayToDocIdSet( testDataFrom1to10 ) );
+		DocIdSet expected = arrayToDocIdSet( testDataFrom1to9 );
+		DocIdSet testedSet = new AndDocIdSet( filters, 10 );
+		assertTrue( docIdSetsEqual( expected, testedSet ) );
+	}
+	
+	public void testOnRandomBigArrays(){
+		onRandomBigArraysTest( 13L );
+		onRandomBigArraysTest( 9L );
+		onRandomBigArraysTest( 71L );
+	}
+	
+	public void onRandomBigArraysTest(long randomSeed) {
+		List<BitSet> filtersData = makeRandomBitSetList( randomSeed, 4, 1000000, 1500000 );
+		BitSet expectedBitset = applyANDOnBitSets( filtersData );
+		List<DocIdSet> filters = toDocIdSetList( filtersData );
+		DocIdBitSet expectedDocIdSet = new DocIdBitSet( expectedBitset );
+		DocIdSet testedSet = new AndDocIdSet( filters, 1500000 );
+		assertTrue( docIdSetsEqual(expectedDocIdSet, testedSet) );
+	}
+	
+	private static List<DocIdSet> toDocIdSetList(List<BitSet> filtersData) {
+		List<DocIdSet> docIdSets = new ArrayList<DocIdSet>( filtersData.size() );
+		for (BitSet bitSet : filtersData) {
+			docIdSets.add ( new DocIdBitSet(bitSet) );
+		}
+		return docIdSets;
+	}
+
+	public static void main(String[] args) throws IOException {
+		compareAndingPerformance( 8, 1000000, 1500000 );
+		compareAndingPerformance( 4, 1000000, 1500000 );
+		compareAndingPerformance( 2, 1000000, 1500000 );
+		compareAndingPerformance( 2, 100000000, 150000000 );
+		compareAndingPerformance( 4, 100000000, 150000000 );
+		compareAndingPerformance( 8, 100000000, 150000000 );
+	}
+	
+	private static void compareAndingPerformance(final int listSize,
+			final int minBitsSize, final int maxBitsSize) throws IOException {
+		List<BitSet> filtersData = makeRandomBitSetList( 13L, listSize, minBitsSize, maxBitsSize );
+		DocIdSet andedByBitsResult = null;
+		DocIdSet andedByIterationResult = null;
+		{
+			long startTime = System.currentTimeMillis();
+			for ( int i=0; i<1000; i++ ) {
+				BitSet expectedBitset = applyANDOnBitSets( filtersData );
+				andedByBitsResult = new DocIdBitSet( expectedBitset );
+				// iteration is needed to have a fair comparison with other impl:
+				iterateOnResults( andedByBitsResult );
+			}
+			long totalTimeMs = System.currentTimeMillis() - startTime;
+			System.out.println( "Time to \"AND " + listSize + 
+				" BitSets and iterate on results\" 1000 times: " +
+				totalTimeMs + "ms. (" +
+				minBitsSize +" minimum BitSet size)");
+		}
+		List<DocIdSet> docIdSetList = toDocIdSetList( filtersData );
+		{
+			long startTime = System.currentTimeMillis();
+			for ( int i=0; i<1000; i++ ) {
+				andedByIterationResult = new AndDocIdSet( docIdSetList, maxBitsSize );
+				// iteration is needed because the AND is been done lazily on iterator access:
+				iterateOnResults( andedByIterationResult );
+			}
+			long totalTimeMs = System.currentTimeMillis() - startTime;
+			System.out.println( "Time to \"use AndDocIdSet iterator on " + listSize + 
+				" Filters and iterate on results\" 1000 times: " +
+				totalTimeMs + "ms. (" +
+				minBitsSize +" minimum BitSet size)");
+		}
+		System.out.println(" Results are same: " + docIdSetsEqual( andedByBitsResult, andedByIterationResult ) );
+	}
+
+	private static int iterateOnResults(DocIdSet docIdBitSet) throws IOException {
+		DocIdSetIterator iterator = docIdBitSet.iterator();
+		int i = 0;
+		while ( iterator.next() ) {
+			i += iterator.doc();
+		}
+		return i;
+	}
+
+	private static final BitSet applyANDOnBitSets(final List<BitSet> filtersData) {
+		BitSet andedBitSet = null;
+		for (BitSet bits : filtersData) {
+			if ( andedBitSet==null ) {
+				andedBitSet = (BitSet) bits.clone();
+			}
+			else {
+				andedBitSet.and( bits );
+			}
+		}
+		return andedBitSet;
+	}
+
+	private static List<BitSet> makeRandomBitSetList(final long randomSeed, final int listSize,
+			final int minBitsSize, final int maxBitsSize) {
+		Random r = new Random( randomSeed ); //have a fixed Seed for repeatable tests
+		List<BitSet> restulList = new ArrayList<BitSet>( listSize );
+		for (int i=0; i<listSize; i++) {
+			int arraySize = minBitsSize + r.nextInt( maxBitsSize-minBitsSize );
+			restulList.add( makeRandomBitSet( r, arraySize) );
+		}
+		return restulList;
+	}
+	
+	private static BitSet makeRandomBitSet(final Random randomSource, final int maxSize){
+		BitSet bitSet = new BitSet();
+		for ( int datai=0; datai<maxSize; datai++ ) {
+			// each bit has 50% change to be set:
+			if ( randomSource.nextBoolean() ) bitSet.set( datai );
+		}
+		return bitSet;
+	}
+	
+	/**
+	 * converts a list of Integers representing Document ids
+	 * into a Lucene DocIdSet
+	 * @param docIdList
+	 * @return
+	 */
+	public DocIdSet arrayToDocIdSet(List<Integer> docIdList) {
+		BitSet bitset = new BitSet();
+		for (int i : docIdList) {
+			bitset.set(i);
+		}
+		return new DocIdBitSet(bitset);
+	}
+	
+	/**
+	 * @param a
+	 * @param b
+	 * @return true if the two DocIdSet are equal: contain the same number of ids, same order and all are equal
+	 */
+	public static final boolean docIdSetsEqual(DocIdSet expected, DocIdSet tested) {
+		DocIdSetIterator iterA = expected.iterator();
+		DocIdSetIterator iterB = tested.iterator();
+		boolean nextA = false;
+		boolean nextB = false;
+		try{
+			do {
+				nextA = iterA.next();
+				nextB = iterB.next();
+				if ( nextA!=nextB ) {
+					return false;
+				}
+				else if ( nextA==false) {
+					return true;
+				}
+				else if ( iterA.doc() != iterB.doc() ) {
+					return false;
+				}
+			} while ( nextA && nextB );
+		}
+		catch (IOException ioe) {
+			fail( "these DocIdSetIterator instances should not throw any exceptions" );
+		}
+		return true;
+	}
+
+}


Property changes on: search/trunk/src/test/org/hibernate/search/test/filter/AndDocIdSetsTest.java
___________________________________________________________________
Name: svn:executable
   + *




More information about the hibernate-commits mailing list