[
https://issues.jboss.org/browse/ISPN-9291?page=com.atlassian.jira.plugin....
]
Dan Berindei commented on ISPN-9291:
------------------------------------
I have modified {{Partition.installMergeView()}} to compute the digest explicitly and it
fixed {{CrashedNodeDuringConflictResolutionTest}}. Unfortunately it also caused
{{*TxPartitionAndMergeDuring*Test.testPrimaryOwnerIsolatedPartitionWithDiscard()}} to fail
every time:
{noformat}
18:52:04,747 TRACE (testng-test:[]) [OptimisticTxPartitionAndMergeDuringCommitTest] Local
tx=[], remote tx=[GlobalTx:test-NodeA-39763:3], for cache test-NodeC-19138
18:52:04,749 ERROR (testng-test:[]) [TestSuiteProgress] Test failed:
org.infinispan.partitionhandling.OptimisticTxPartitionAndMergeDuringCommitTest.testPrimaryOwnerIsolatedPartitionWithDiscard[DIST_SYNC]
java.lang.AssertionError: There are pending transactions!
at org.testng.AssertJUnit.fail(AssertJUnit.java:59)
at org.testng.AssertJUnit.assertTrue(AssertJUnit.java:24)
at
org.infinispan.test.AbstractInfinispanTest.eventually(AbstractInfinispanTest.java:253)
at
org.infinispan.test.AbstractInfinispanTest.eventually(AbstractInfinispanTest.java:393)
at
org.infinispan.test.MultipleCacheManagersTest.assertNoTransactions(MultipleCacheManagersTest.java:877)
at
org.infinispan.partitionhandling.BaseTxPartitionAndMergeTest.finalAsserts(BaseTxPartitionAndMergeTest.java:100)
at
org.infinispan.partitionhandling.BaseOptimisticTxPartitionAndMergeTest.doTest(BaseOptimisticTxPartitionAndMergeTest.java:84)
at
org.infinispan.partitionhandling.OptimisticTxPartitionAndMergeDuringCommitTest.testPrimaryOwnerIsolatedPartitionWithDiscard(OptimisticTxPartitionAndMergeDuringCommitTest.java:43)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at
org.testng.internal.MethodInvocationHelper.invokeMethod(MethodInvocationHelper.java:85)
at
org.testng.internal.MethodInvocationHelper$1.runTestMethod(MethodInvocationHelper.java:196)
at org.infinispan.commons.test.TestNGLongTestsHook.run(TestNGLongTestsHook.java:24)
at
org.testng.internal.MethodInvocationHelper.invokeHookable(MethodInvocationHelper.java:208)
at org.testng.internal.Invoker.invokeMethod(Invoker.java:635)
at org.testng.internal.Invoker.invokeTestMethod(Invoker.java:816)
at org.testng.internal.Invoker.invokeTestMethods(Invoker.java:1124)
at org.testng.internal.TestMethodWorker.invokeTestMethods(TestMethodWorker.java:125)
at org.testng.internal.TestMethodWorker.run(TestMethodWorker.java:108)
at org.testng.TestRunner.privateRun(TestRunner.java:774)
at org.testng.TestRunner.run(TestRunner.java:624)
at org.testng.SuiteRunner.runTest(SuiteRunner.java:359)
at org.testng.SuiteRunner.runSequentially(SuiteRunner.java:354)
at org.testng.SuiteRunner.privateRun(SuiteRunner.java:312)
at org.testng.SuiteRunner.run(SuiteRunner.java:261)
at org.testng.SuiteRunnerWorker.runSuite(SuiteRunnerWorker.java:52)
at org.testng.SuiteRunnerWorker.run(SuiteRunnerWorker.java:86)
at org.testng.TestNG.runSuitesSequentially(TestNG.java:1215)
at org.testng.TestNG.runSuitesLocally(TestNG.java:1140)
at org.testng.TestNG.run(TestNG.java:1048)
at org.testng.IDEARemoteTestNG.run(IDEARemoteTestNG.java:72)
at org.testng.RemoteTestNGStarter.main(RemoteTestNGStarter.java:123)
18:52:04,749 ERROR (testng-test) [TestSuiteProgress] Test failed:
org.infinispan.partitionhandling.OptimisticTxPartitionAndMergeDuringCommitTest.testPrimaryOwnerIsolatedPartitionWithDiscard[DIST_SYNC]
java.lang.AssertionError: There are pending transactions!
at org.testng.AssertJUnit.fail(AssertJUnit.java:59) ~[testng-6.9.9.jar:?]
at org.testng.AssertJUnit.assertTrue(AssertJUnit.java:24) ~[testng-6.9.9.jar:?]
at org.infinispan.test.AbstractInfinispanTest.eventually(AbstractInfinispanTest.java:253)
~[test-classes/:?]
at org.infinispan.test.AbstractInfinispanTest.eventually(AbstractInfinispanTest.java:393)
~[test-classes/:?]
at
org.infinispan.test.MultipleCacheManagersTest.assertNoTransactions(MultipleCacheManagersTest.java:877)
~[test-classes/:?]
at
org.infinispan.partitionhandling.BaseTxPartitionAndMergeTest.finalAsserts(BaseTxPartitionAndMergeTest.java:100)
~[test-classes/:?]
at
org.infinispan.partitionhandling.BaseOptimisticTxPartitionAndMergeTest.doTest(BaseOptimisticTxPartitionAndMergeTest.java:84)
~[test-classes/:?]
at
org.infinispan.partitionhandling.OptimisticTxPartitionAndMergeDuringCommitTest.testPrimaryOwnerIsolatedPartitionWithDiscard(OptimisticTxPartitionAndMergeDuringCommitTest.java:43)
~[test-classes/:?]
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) ~[?:1.8.0_171]
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
~[?:1.8.0_171]
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
~[?:1.8.0_171]
at java.lang.reflect.Method.invoke(Method.java:498) ~[?:1.8.0_171]
at
org.testng.internal.MethodInvocationHelper.invokeMethod(MethodInvocationHelper.java:85)
~[testng-6.9.9.jar:?]
at
org.testng.internal.MethodInvocationHelper$1.runTestMethod(MethodInvocationHelper.java:196)
~[testng-6.9.9.jar:?]
at org.infinispan.commons.test.TestNGLongTestsHook.run(TestNGLongTestsHook.java:24)
~[classes/:?]
at
org.testng.internal.MethodInvocationHelper.invokeHookable(MethodInvocationHelper.java:208)
~[testng-6.9.9.jar:?]
at org.testng.internal.Invoker.invokeMethod(Invoker.java:635) [testng-6.9.9.jar:?]
at org.testng.internal.Invoker.invokeTestMethod(Invoker.java:816) [testng-6.9.9.jar:?]
at org.testng.internal.Invoker.invokeTestMethods(Invoker.java:1124) [testng-6.9.9.jar:?]
at org.testng.internal.TestMethodWorker.invokeTestMethods(TestMethodWorker.java:125)
[testng-6.9.9.jar:?]
at org.testng.internal.TestMethodWorker.run(TestMethodWorker.java:108)
[testng-6.9.9.jar:?]
at org.testng.TestRunner.privateRun(TestRunner.java:774) [testng-6.9.9.jar:?]
at org.testng.TestRunner.run(TestRunner.java:624) [testng-6.9.9.jar:?]
at org.testng.SuiteRunner.runTest(SuiteRunner.java:359) [testng-6.9.9.jar:?]
at org.testng.SuiteRunner.runSequentially(SuiteRunner.java:354) [testng-6.9.9.jar:?]
at org.testng.SuiteRunner.privateRun(SuiteRunner.java:312) [testng-6.9.9.jar:?]
at org.testng.SuiteRunner.run(SuiteRunner.java:261) [testng-6.9.9.jar:?]
at org.testng.SuiteRunnerWorker.runSuite(SuiteRunnerWorker.java:52) [testng-6.9.9.jar:?]
at org.testng.SuiteRunnerWorker.run(SuiteRunnerWorker.java:86) [testng-6.9.9.jar:?]
at org.testng.TestNG.runSuitesSequentially(TestNG.java:1215) [testng-6.9.9.jar:?]
at org.testng.TestNG.runSuitesLocally(TestNG.java:1140) [testng-6.9.9.jar:?]
at org.testng.TestNG.run(TestNG.java:1048) [testng-6.9.9.jar:?]
at org.testng.IDEARemoteTestNG.run(IDEARemoteTestNG.java:72) [testng-plugin.jar:?]
at org.testng.RemoteTestNGStarter.main(RemoteTestNGStarter.java:123)
[testng-plugin.jar:?]
java.lang.AssertionError: There are pending transactions!
at org.testng.AssertJUnit.fail(AssertJUnit.java:59)
at org.testng.AssertJUnit.assertTrue(AssertJUnit.java:24)
at
org.infinispan.test.AbstractInfinispanTest.eventually(AbstractInfinispanTest.java:253)
at
org.infinispan.test.AbstractInfinispanTest.eventually(AbstractInfinispanTest.java:393)
at
org.infinispan.test.MultipleCacheManagersTest.assertNoTransactions(MultipleCacheManagersTest.java:877)
at
org.infinispan.partitionhandling.BaseTxPartitionAndMergeTest.finalAsserts(BaseTxPartitionAndMergeTest.java:100)
at
org.infinispan.partitionhandling.BaseOptimisticTxPartitionAndMergeTest.doTest(BaseOptimisticTxPartitionAndMergeTest.java:84)
at
org.infinispan.partitionhandling.OptimisticTxPartitionAndMergeDuringCommitTest.testPrimaryOwnerIsolatedPartitionWithDiscard(OptimisticTxPartitionAndMergeDuringCommitTest.java:43)
{noformat}
The problem seems to be that the majority partition installs a stable topology and
completes the transaction before the merge. The test is passing without the merge digest
because the {{TxCompletionNotificationCommand}} is retransmitted by the originator to the
minority partition, but with the digest set properly that no longer happens:
{noformat}
18:51:53,997 TRACE (transport-thread-test-NodeA-p4-t4:[Topology-opt-cache])
[PartitionHandlingManagerImpl] Check if topology CacheTopology{id=19, phase=NO_REBALANCE,
rebalanceId=5, currentCH=DefaultConsistentHash{ns=256, owners = (3)[test-NodeA-39763:
86+88, test-NodeB-22220: 90+98, test-NodeD-9579: 80+70]}, pendingCH=null, unionCH=null,
actualMembers=[test-NodeA-39763, test-NodeB-22220, test-NodeD-9579],
persistentUUIDs=[8122bb4d-95a4-49c0-8ce2-e3cc1eeeaa80,
53b50f34-0360-415c-ac9b-dcf01fffda4e, b29574c5-335d-4215-9e53-4f345443a0ee]} is stable.
Last stable topology is CacheTopology{id=19, phase=NO_REBALANCE, rebalanceId=5,
currentCH=DefaultConsistentHash{ns=256, owners = (3)[test-NodeA-39763: 86+88,
test-NodeB-22220: 90+98, test-NodeD-9579: 80+70]}, pendingCH=null, unionCH=null,
actualMembers=[test-NodeA-39763, test-NodeB-22220, test-NodeD-9579],
persistentUUIDs=[8122bb4d-95a4-49c0-8ce2-e3cc1eeeaa80,
53b50f34-0360-415c-ac9b-dcf01fffda4e, b29574c5-335d-4215-9e53-4f345443a0ee!
]}
18:51:53,997 TRACE (transport-thread-test-NodeA-p4-t4:[Topology-opt-cache])
[PartitionHandlingManagerImpl] On stable topology update. Pending txs: 1
18:51:53,997 TRACE (transport-thread-test-NodeA-p4-t4:[Topology-opt-cache])
[JGroupsTransport] test-NodeA-39763 sending request 31 to [test-NodeB-22220]:
VersionedCommitCommand{gtx=GlobalTx:test-NodeA-39763:3, cacheName='opt-cache',
topologyId=19,
updatedVersions={MagicKey#k1{0/90946C35/91@test-NodeB-22220}=SimpleClusteredVersion{topologyId=13,
version=2},
MagicKey#k2{1/BDAAE358/148@test-NodeC-19138}=SimpleClusteredVersion{topologyId=13,
version=2}}}
18:51:54,006 TRACE (jgroups-8,test-NodeA-39763:[]) [JGroupsTransport] test-NodeA-39763
received response for request 31 from test-NodeB-22220: SuccessfulResponse(null)
18:51:54,006 TRACE (jgroups-8,test-NodeA-39763:[]) [PartitionHandlingManagerImpl]
Performing cleanup for transaction GlobalTx:test-NodeA-39763:3
18:51:54,006 TRACE (jgroups-8,test-NodeA-39763:[]) [JGroupsTransport] test-NodeA-39763
sending command to all: TxCompletionNotificationCommand{ xid=null, internalId=0,
topologyId=19, gtx=GlobalTx:test-NodeA-39763:3, cacheName=opt-cache}
18:51:54,111 INFO (testng-test:[]) [CLUSTER] ISPN000093: Received new, MERGED cluster
view for channel ISPN: MergeView::[test-NodeC-19138|10] (4) [test-NodeC-19138,
test-NodeA-39763, test-NodeB-22220, test-NodeD-9579], 2 subgroups: [test-NodeC-19138|8]
(1) [test-NodeC-19138], [test-NodeA-39763|9] (3) [test-NodeA-39763, test-NodeB-22220,
test-NodeD-9579]
{noformat}
[~pruivo] I'm not sure whether we should consider this a sub-case of ISPN-8232 or if
it's worth its own issue.
BasePartitionHandlingTest.Partition.installMergeView() doesn't
compute the merge digest
---------------------------------------------------------------------------------------
Key: ISPN-9291
URL:
https://issues.jboss.org/browse/ISPN-9291
Project: Infinispan
Issue Type: Bug
Components: Test Suite - Core
Affects Versions: 9.3.0.CR1
Reporter: Dan Berindei
Assignee: Dan Berindei
Labels: testsuite_stability
Fix For: 9.3.0.Final
The partition handling tests use
{{BasePartitionHandlingTest.Partition.installMergeView(view1, view2)}} to install the
merge view without waiting for {{MERGE3}} to run, making them much faster. Unfortunately,
the implementation is incorrect: {{GMS.installView(view)}} only works for regular views,
merge views need to be installed with {{GMS.installView(mergeView, digest)}}.
The result is that the nodes that got isolated from the coordinator request the
retransmission of all the {{NAKACK2}} messages (including view updates) since the cluster
first started. The isolated nodes cannot install the merge view until they deliver all the
older messages (even without knowing whether they're OOB or not). But if {{STABLE}}
ran and cleared a range of messages already, the retransmission request cannot be
satisfied, so the view updates will never be delivered.
This is easily reproducible in {{CrashedNodeDuringConflictResolutionTest}} if we add a
delay before updating the topology in {{StateConsumerImpl}}. The test installs the merge
view manually, but then kills NodeC and expects the cluster to install the new view
automatically. NodeD can't install the new view because it's waiting for earlier
messages from NodeA:
{noformat}
18:27:13,054 INFO (testng-test:[]) [TestSuiteProgress] Test starting:
org.infinispan.conflict.impl.CrashedNodeDuringConflictResolutionTest.testPartitionMergePolicy[DIST_SYNC]
18:27:13,640 DEBUG (testng-test:[]) [GMS] test-NodeA-39513: installing view
MergeView::[test-NodeA-39513|10] (4) [test-NodeA-39513, test-NodeB-9439, test-NodeC-43706,
test-NodeD-59078], 2 subgroups: [test-NodeA-39513|8] (2) [test-NodeA-39513,
test-NodeB-9439], [test-NodeC-43706|9] (2) [test-NodeC-43706, test-NodeD-59078]
18:27:13,674 DEBUG (testng-test:[]) [GMS] test-NodeD-59078: installing view
MergeView::[test-NodeA-39513|10] (4) [test-NodeA-39513, test-NodeB-9439, test-NodeC-43706,
test-NodeD-59078], 2 subgroups: [test-NodeA-39513|8] (2) [test-NodeA-39513,
test-NodeB-9439], [test-NodeC-43706|9] (2) [test-NodeC-43706, test-NodeD-59078]
18:27:13,828 TRACE (jgroups-7,test-NodeD-59078:[]) [NAKACK2] test-NodeD-59078: sending
XMIT_REQ ((1): {50}) to test-NodeA-39513
18:27:13,966 TRACE (Timer runner-1,test-NodeD-59078:[]) [NAKACK2] test-NodeD-59078:
sending XMIT_REQ ((49): {1-49}) to test-NodeA-39513
18:27:14,067 TRACE (Timer runner-1,test-NodeD-59078:[]) [NAKACK2] test-NodeD-59078:
sending XMIT_REQ ((45): {1-45}) to test-NodeA-39513
18:27:14,504 DEBUG (testng-test:[]) [DefaultCacheManager] Stopping cache manager ISPN on
test-NodeC-43706
18:27:18,642 TRACE (VERIFY_SUSPECT.TimerThread-89,test-NodeA-39513:[]) [GMS]
test-NodeA-39513: joiners=[], suspected=[test-NodeC-43706], leaving=[], new view:
[test-NodeA-39513|11] (3) [test-NodeA-39513, test-NodeB-9439, test-NodeD-59078]
18:27:18,643 TRACE (VERIFY_SUSPECT.TimerThread-89,test-NodeA-39513:[]) [GMS]
test-NodeA-39513: mcasting view [test-NodeA-39513|11] (3) [test-NodeA-39513,
test-NodeB-9439, test-NodeD-59078]
18:27:18,646 DEBUG (VERIFY_SUSPECT.TimerThread-89,test-NodeA-39513:[]) [GMS]
test-NodeA-39513: installing view [test-NodeA-39513|11] (3) [test-NodeA-39513,
test-NodeB-9439, test-NodeD-59078]
18:27:18,652 TRACE (VERIFY_SUSPECT.TimerThread-89,test-NodeA-39513:[]) [TCP_NIO2]
test-NodeA-39513: sending msg to null, src=test-NodeA-39513, headers are GMS:
GmsHeader[VIEW], NAKACK2: [MSG, seqno=63], TP: [cluster_name=ISPN]
18:27:18,656 TRACE (jgroups-20,test-NodeA-39513:[]) [TCP_NIO2] test-NodeA-39513: received
[dst: test-NodeA-39513, src: test-NodeB-9439 (3 headers), size=0 bytes,
flags=OOB|INTERNAL], headers are GMS: GmsHeader[VIEW_ACK], UNICAST3: DATA, seqno=100, TP:
[cluster_name=ISPN]
18:27:20,554 TRACE (Timer runner-1,test-NodeD-59078:[]) [NAKACK2] test-NodeD-59078:
sending XMIT_REQ ((45): {1-45}) to test-NodeA-39513
18:27:20,653 WARN (VERIFY_SUSPECT.TimerThread-89,test-NodeA-39513:[]) [GMS]
test-NodeA-39513: failed to collect all ACKs (expected=2) for view [test-NodeA-39513|11]
after 2000ms, missing 1 ACKs from (1) test-NodeD-59078
18:27:20,656 TRACE (Timer runner-1,test-NodeD-59078:[]) [NAKACK2] test-NodeD-59078:
sending XMIT_REQ ((45): {1-45}) to test-NodeA-39513
18:27:20,756 TRACE (Timer runner-1,test-NodeD-59078:[]) [NAKACK2] test-NodeD-59078:
sending XMIT_REQ ((45): {1-45}) to test-NodeA-39513
...
18:28:14,412 TRACE (Timer runner-1,test-NodeD-59078:[]) [NAKACK2] test-NodeD-59078:
sending XMIT_REQ ((45): {1-45}) to test-NodeA-39513
18:28:14,513 TRACE (Timer runner-1,test-NodeD-59078:[]) [NAKACK2] test-NodeD-59078:
sending XMIT_REQ ((45): {1-45}) to test-NodeA-39513
18:28:14,589 ERROR (testng-test:[]) [TestSuiteProgress] Test failed:
org.infinispan.conflict.impl.CrashedNodeDuringConflictResolutionTest.testPartitionMergePolicy[DIST_SYNC]
java.lang.RuntimeException: Cache ___defaultcache timed out waiting for rebalancing to
complete on node test-NodeA-39513, current topology is CacheTopology{id=21,
phase=CONFLICT_RESOLUTION, rebalanceId=7,
currentCH=PartitionerConsistentHash:DefaultConsistentHash{ns=256, owners =
(3)[test-NodeD-59078: 256+0, test-NodeA-39513: 0+256, test-NodeB-9439: 0+256]},
pendingCH=null, unionCH=null, actualMembers=[test-NodeD-59078, test-NodeA-39513,
test-NodeB-9439], persistentUUIDs=[828108c4-4251-49fc-9481-ff6392bea9fb,
1d4b6f07-b71b-41a1-adfb-abbe68944a9f, 3a1ece05-c282-433e-9eb5-7b3e0f1932aa]}.
rebalanceInProgress=true, currentChIsBalanced=true
at org.infinispan.test.TestingUtil.waitForNoRebalance(TestingUtil.java:392)
~[test-classes/:?]
at
org.infinispan.conflict.impl.CrashedNodeDuringConflictResolutionTest.performMerge(CrashedNodeDuringConflictResolutionTest.java:113)
~[test-classes/:?]
at
org.infinispan.conflict.impl.BaseMergePolicyTest.testPartitionMergePolicy(BaseMergePolicyTest.java:137)
~[test-classes/:?]
{noformat}
--
This message was sent by Atlassian JIRA
(v7.5.0#75005)