[jboss-svn-commits] JBL Code SVN: r19727 - in labs/jbossrules/contrib/machinelearning/decisiontree/src/dt: builder and 2 other directories.

jboss-svn-commits at lists.jboss.org jboss-svn-commits at lists.jboss.org
Sat Apr 26 00:35:13 EDT 2008


Author: gizil
Date: 2008-04-26 00:35:11 -0400 (Sat, 26 Apr 2008)
New Revision: 19727

Added:
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeIterator.java
Modified:
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/DecisionTree.java
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/TreeNode.java
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeBuilder.java
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/DecisionTreeBuilderMT.java
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/Entropy.java
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/IDTreeBuilder.java
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/LiteralDomain.java
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/NumericDomain.java
   labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/tools/FactProcessor.java
Log:
re_training OK with huge data sets, except for the Java heap space error while trying to load a tree with more than 20000 facts -> start boosting

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/DecisionTree.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/DecisionTree.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/DecisionTree.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -104,11 +104,11 @@
 	public String toString(HashMap <TreeNode, ArrayList<Fact>> _facts) {
 		String out = "Facts scanned " + FACTS_READ + "\n";
 		
-		System.out.println("!!Printing tree: \n"+ Util.ntimes("\n", 3));
-		for (TreeNode obj_node : _facts.keySet())
-			System.out.println("* o.id:"+obj_node.getID()+ " o.d:"+obj_node.getDomain()+ " o.h:"+ obj_node.hashCode()+ " => "+_facts.get(obj_node) );
-		
-		System.out.println("!!Had print tree"+ Util.ntimes("\n", 3));
+//		System.out.println("!!Printing tree: \n"+ Util.ntimes("\n", 3));
+//		for (TreeNode obj_node : _facts.keySet())
+//			System.out.println("* o.id:"+obj_node.getID()+ " o.d:"+obj_node.getDomain()+ " o.h:"+ obj_node.hashCode()+ " => "+_facts.get(obj_node) );
+//		
+//		System.out.println("!!Had print tree"+ Util.ntimes("\n", 3));
 		return out + root.toString(_facts);
 	}
 

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/TreeNode.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/TreeNode.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/TreeNode.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -48,7 +48,7 @@
 	}
 	
 	
-	public void addNode(Object attributeValue, TreeNode node) {
+	public void putNode(Object attributeValue, TreeNode node) {
 		children.put(attributeValue, node);
 	}
 
@@ -135,10 +135,11 @@
 	}
 
 	public String toString(int tab, int depth, StringBuffer buf) {
-		if (depth > 0 && domain != null) {
+		//if (depth > 0 && domain != null) {
+		if (domain != null) {
 			buf.append(Util.ntimes("\t", tab));
 			buf.append(Util.ntimes("***",1));
-			buf.append( domain.getName() + " \n");
+			buf.append( domain.getName() + " n.h:"+this.hashCode()+ " \n");
 			for (Object attributeValue : children.keySet()) {
 				buf.append(Util.ntimes("\t", tab + 1));
 				buf.append("+" + attributeValue );

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeBuilder.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeBuilder.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeBuilder.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -32,7 +32,7 @@
 		@Override
 		public void run() {
 			result = builder.train(dt, facts, attributeNames);
-			currentNode.addNode(value, result);
+			currentNode.putNode(value, result);
 		}
 	}
 
@@ -329,7 +329,7 @@
 		/* choosing the attribute for the branching starts */
 //		String chosenAttribute = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
 //		List<?> categorization = dt.getPossibleValues(chosenAttribute);
-		Domain<?> choosenDomain = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
+		Domain<?> choosenDomain = Entropy.chooseBothAttribute(dt, facts, stats, attributeNames);
 		if (Util.RUN)	System.out.println(Util.ntimes("*", 20) + " 1st best attr: "+ choosenDomain.getName());
 
 		TreeNode currentNode = new TreeNode(choosenDomain);
@@ -355,10 +355,10 @@
 				LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
 				majorityNode.setRank(-1.0); // classifying nothing
 				majorityNode.setNumSupporter(filtered_facts.get(value).size());
-				currentNode.addNode(value, majorityNode);
+				currentNode.putNode(value, majorityNode);
 			} else {
 				TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
-				currentNode.addNode(value, newNode);
+				currentNode.putNode(value, newNode);
 			}
 		}
 
@@ -426,7 +426,7 @@
 		/* choosing the attribute for the branching starts */
 //		String chosenAttribute = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
 //		List<?> categorization = dt.getPossibleValues(chosenAttribute);
-		Domain<?> choosenDomain = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
+		Domain<?> choosenDomain = Entropy.chooseBothAttribute(dt, facts, stats, attributeNames);
 		if (Util.RUN)	System.out.println(Util.ntimes("*", 20) + " 1st best attr: "+ choosenDomain.getName());
 		else if (FUNC_CALL % 100 ==0){
 			System.out.print(".");
@@ -464,7 +464,7 @@
 						
 						childNode = majorityNode; // How to set this guy
 						if (childNode == null)
-							currentNode.addNode(value, childNode);
+							currentNode.putNode(value, childNode);
 					}
 					
 					else {
@@ -481,7 +481,7 @@
 					 
 					if (childNode == null) { // there was no node assigned for that object value
 						childNode = train(dt, filtered_facts.get(value), attributeNames_copy);
-						currentNode.addNode(value, childNode);
+						currentNode.putNode(value, childNode);
 					}
 					else {
 						TreeNode newNode = re_train(dt, childNode, filtered_facts.get(value), attributeNames_copy);
@@ -506,11 +506,11 @@
 					LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
 					majorityNode.setRank(-1.0); // classifying nothing
 					majorityNode.setNumSupporter(filtered_facts.get(value).size());
-					currentNode.addNode(value, majorityNode);
+					currentNode.putNode(value, majorityNode);
 				} else {
 					
 					TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
-					currentNode.addNode(value, newNode);
+					currentNode.putNode(value, newNode);
 				}
 			}
 		}

Added: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeIterator.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeIterator.java	                        (rev 0)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeIterator.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -0,0 +1,345 @@
+package dt.builder;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.List;
+
+import dt.DecisionTree;
+import dt.LeafNode;
+import dt.TreeNode;
+
+import dt.memory.Domain;
+import dt.memory.Fact;
+import dt.memory.FactDistribution;
+import dt.tools.FactProcessor;
+import dt.tools.Util;
+
+public class C45TreeIterator implements Serializable{
+	
+	/**
+	 * 
+	 */
+	private static final long serialVersionUID = 1L;
+	private C45TreeBuilder builder;
+	private HashMap <TreeNode, ArrayList<Fact>> matching_facts;
+	private int NUM_NODES;
+	
+	public C45TreeIterator(C45TreeBuilder my_builder) {
+		builder = my_builder;
+		matching_facts = new HashMap<TreeNode, ArrayList<Fact>>();
+		NUM_NODES = 0;
+	}
+	
+	/* building with the training set (some part of the facts) */
+	public DecisionTree build_to_iterate(Class<?> klass, ArrayList<Fact> first_facts) {
+		/* gets the facts which the decision tree is eligible */
+		//setKlass(klass);
+		
+		DecisionTree dt = new DecisionTree(klass.getName());
+		builder.init_dt(dt, builder.getTarget()); // initialize the decision tree with the target and all domains
+		
+		ArrayList<String> attrs = new ArrayList<String>(dt.getAttributes());
+		Collections.sort(attrs);
+		
+		builder.add_to_training(first_facts);	/* you must set this when the training called the first time */
+		dt.FACTS_READ += first_facts.size();
+
+		//while ()
+		TreeNode root = train(dt, first_facts, attrs);
+		dt.setRoot(root);
+		
+		
+		return dt;
+	}
+	
+	public TreeNode train(DecisionTree dt, ArrayList<Fact> facts, List<String> attributeNames) {
+
+		builder.FUNC_CALL++;
+		if (facts.size() == 0) {
+			throw new RuntimeException("Nothing to classify, factlist is empty");
+		}
+		/* let's get the statistics of the results */
+		// List<?> targetValues = dt.getPossibleValues(dt.getTarget());
+		//Hashtable<Object, Integer> stats_ = dt.getStatistics(facts, dt.getTarget());// targetValues
+		
+		//FactTargetDistribution stats = dt.getDistribution(facts);
+		
+		FactDistribution stats = new FactDistribution(dt.getDomain(dt.getTarget()));
+		stats.calculateDistribution(facts);
+		stats.evaluateMajority();
+
+		/* if all elements are classified to the same value */
+		if (stats.getNum_supported_target_classes() == 1) {
+
+			LeafNode classifiedNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
+			classifiedNode.setRank((double) facts.size()/(double) builder.getNum_fact_trained());
+			classifiedNode.setNumSupporter(facts.size());
+			this.NUM_NODES ++;
+			classifiedNode.setID(this.NUM_NODES);
+			matching_facts.put(classifiedNode, facts);
+			
+			return classifiedNode;
+		}
+
+		/* if there is no attribute left in order to continue */
+		if (attributeNames.size() == 0) {
+			/* a heuristic of the leaf classification */
+			Object winner = stats.getThe_winner_target_class();
+			LeafNode noAttributeLeftNode = new LeafNode(dt.getDomain(dt.getTarget()), winner);
+			noAttributeLeftNode.setRank((double) stats.getVoteFor(winner)/ (double) builder.getNum_fact_trained());
+			noAttributeLeftNode.setNumSupporter(stats.getVoteFor(winner));
+			this.NUM_NODES ++;
+			noAttributeLeftNode.setID(this.NUM_NODES);
+			matching_facts.put(noAttributeLeftNode, facts);
+			
+			/* we need to know how many guys cannot be classified and who these guys are */
+			FactProcessor.splitUnclassifiedFacts(builder.getUnClassifiedFacts(), stats);
+			
+			return noAttributeLeftNode;
+		}
+
+		/* choosing the attribute for the branching starts */
+//		String chosenAttribute = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
+//		List<?> categorization = dt.getPossibleValues(chosenAttribute);
+		Domain<?> choosenDomain = Entropy.chooseBothAttribute(dt, facts, stats, attributeNames);
+		if (Util.RUN)	System.out.println(Util.ntimes("*", 20) + " 1st best attr: "+ choosenDomain.getName());
+
+		TreeNode currentNode = new TreeNode(choosenDomain);
+		this.NUM_NODES ++;
+		currentNode.setID(this.NUM_NODES);
+		matching_facts.put(currentNode, facts);
+		
+		
+		Hashtable<Object, ArrayList<Fact>> filtered_facts = FactProcessor.splitFacts(facts, choosenDomain);
+
+		for (Object value : filtered_facts.keySet()) {
+			if (filtered_facts.get(value).isEmpty()){
+				@SuppressWarnings("unused")
+				boolean bok = true;
+			}
+		}
+		dt.FACTS_READ += facts.size();
+
+		for (Object value : filtered_facts.keySet()) {
+			/* split the last two class at the same time */
+			ArrayList<String> attributeNames_copy = new ArrayList<String>(
+					attributeNames);
+			attributeNames_copy.remove(choosenDomain.getName());
+
+			if (filtered_facts.get(value).isEmpty()) {
+				/* majority !!!! */
+				LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
+				majorityNode.setRank(-1.0); // classifying nothing
+				majorityNode.setNumSupporter(0);
+				this.NUM_NODES ++;
+				majorityNode.setID(this.NUM_NODES);
+				matching_facts.put(majorityNode, new ArrayList<Fact>());
+				currentNode.putNode(value, majorityNode);
+			} else {
+				TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
+//				this.NUM_NODES ++;
+//				newNode.setID(this.NUM_NODES);
+				currentNode.putNode(value, newNode);
+			}
+		}
+
+		return currentNode;
+	}
+	
+	/* building with the training set (some part of the facts) */
+	public DecisionTree re_build(DecisionTree dt, ArrayList<Fact> new_facts) {
+		
+		ArrayList<String> attrs = new ArrayList<String>(dt.getAttributes());
+		Collections.sort(attrs);
+		
+		builder.add_to_training(new_facts);
+		dt.FACTS_READ += new_facts.size();
+		
+		//System.out.println(Util.ntimes("\n", 10)+"How facts are u training? "+ training_facts.size());
+		//while ()
+		TreeNode root = re_train(dt, dt.getRoot(), new_facts, attrs);
+		dt.setRoot(root);
+		
+		return dt;
+	}
+	
+	public TreeNode re_train(DecisionTree dt, TreeNode currentNode, ArrayList<Fact> new_facts, List<String> attributeNames) {
+
+		builder.FUNC_CALL++;
+		if (new_facts.size() == 0) {
+			throw new RuntimeException("Nothing new to classify, new fact list is empty");
+		}
+		/* let's get the statistics of the results */
+		// List<?> targetValues = dt.getPossibleValues(dt.getTarget());
+		//Hashtable<Object, Integer> stats_ = dt.getStatistics(facts, dt.getTarget());// targetValues
+		
+		//FactTargetDistribution stats = dt.getDistribution(facts);
+		ArrayList<Fact> currentFacts = matching_facts.get(currentNode);
+		currentFacts.addAll(new_facts);
+		FactDistribution stats = new FactDistribution(dt.getDomain(dt.getTarget()));
+		stats.calculateDistribution(currentFacts);
+		stats.evaluateMajority();
+
+		/* if all elements are classified to the same value */
+		if (stats.getNum_supported_target_classes() == 1) {
+			LeafNode classifiedNode;
+			if (currentNode instanceof LeafNode) {
+				classifiedNode = (LeafNode)currentNode;
+			} else {
+				
+				classifiedNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
+				classifiedNode.setID(currentNode.getID());
+			}
+			classifiedNode.setRank((double) currentFacts.size()/(double) builder.getNum_fact_trained());
+			classifiedNode.setNumSupporter(currentFacts.size());
+			matching_facts.put(classifiedNode, currentFacts);//?
+			return classifiedNode;
+		}
+
+		/* if there is no attribute left in order to continue */
+		if (attributeNames.size() == 0) {
+			/* a heuristic of the leaf classification */
+			LeafNode noAttributeLeftNode;
+			Object winner = stats.getThe_winner_target_class();
+			if (currentNode instanceof LeafNode) {
+				noAttributeLeftNode = (LeafNode)currentNode;
+			} else {
+				noAttributeLeftNode = new LeafNode(dt.getDomain(dt.getTarget()), winner);
+				noAttributeLeftNode.setID(currentNode.getID());
+			}
+			noAttributeLeftNode.setRank((double) stats.getVoteFor(winner)/ (double) builder.num_fact_trained);
+			noAttributeLeftNode.setNumSupporter(stats.getVoteFor(winner));
+			
+			/* we need to know how many guys cannot be classified and who these guys are */
+			FactProcessor.splitUnclassifiedFacts(builder.getUnClassifiedFacts(), stats);
+			matching_facts.put(noAttributeLeftNode, currentFacts);
+			return noAttributeLeftNode;
+		}
+
+		/* choosing the attribute for the branching starts */
+		Domain<?> choosenDomain = Entropy.chooseBothAttribute(dt, currentFacts, stats, attributeNames);
+		if (Util.RUN)	System.out.println(Util.ntimes("*", 20) + " 1st best attr: "+ choosenDomain.getName());
+		else if (builder.FUNC_CALL % 100 ==0){
+			System.out.print(".");
+		}
+		
+		dt.FACTS_READ += new_facts.size();
+		System.out.println(Util.ntimes("\n", 2)+"RETRAINING_DOMAINS COMP: current: "+ currentNode.getDomain() + " and choosen "+ choosenDomain + " == " + (currentNode.getDomain().equals(choosenDomain)) );
+
+		if (currentNode.getDomain().equals(choosenDomain)) {
+			
+			Hashtable<Object, ArrayList<Fact>> filtered_facts = FactProcessor.splitNewFacts(new_facts, choosenDomain);
+			/* split the last two class at the same time */
+			for (Object value : filtered_facts.keySet()) {
+				
+				TreeNode childNode = currentNode.getChild(value);
+				List<Fact> matching_split = matching_facts.get(childNode);
+
+				ArrayList<String> attributeNames_copy = new ArrayList<String>(
+						attributeNames);
+				attributeNames_copy.remove(choosenDomain.getName());
+				if (childNode == null) {
+					System.out.println("the child node is null how come? ");
+					/* 
+					 * there was no node assigned for that object value
+					 * so you need to re_train for the filtered fact set
+					 */
+					/* ???????
+					childNode = train(dt, filtered_facts.get(value), attributeNames_copy);
+					currentNode.addNode(value, childNode); */
+					System.exit(1);
+				}
+					
+				if (filtered_facts.get(value).isEmpty()) {
+					/* there is no new matching guy to that branch 
+					 * everything should stay the same 
+					 * what can change???
+					 */	
+					System.out.println("there is no new matching guy to that branch? No change in "+childNode); 
+					
+				} else {					
+					childNode = re_train(dt, childNode, filtered_facts.get(value), attributeNames_copy);
+					currentNode.putNode(value, childNode);					
+				}				
+			}
+			
+		} else {
+			/* there are two ways 
+			 * 1. i can call the train function for that set of facts and 
+			 * 		add the root to the current place
+			 */
+			/*
+			ArrayList<String> attributeNames_ = new ArrayList<String>(attributeNames);
+			//attributeNames_copy.remove(choosenDomain.getName());
+			currentNode = train(dt, currentFacts, attributeNames_);
+			*/
+			
+			/* 2. i can split the all facts according to the places i found and continue
+			 * 
+			 */
+			
+			/* before re_training on this guy you have to remove the existing fact
+			 * lists from the matching fact lists hashmap 
+			 */
+			
+			
+			int old_id = currentNode.getID();
+			remove_matching_facts(currentNode);
+			Hashtable<Object, ArrayList<Fact>> filtered_facts = FactProcessor.splitFacts(currentFacts, choosenDomain);
+			currentNode = new TreeNode(choosenDomain);
+			currentNode.setID(old_id);
+			matching_facts.put(currentNode, currentFacts);
+			
+			for (Object value : filtered_facts.keySet()) {
+				/* split the last two class at the same time */
+
+				ArrayList<String> attributeNames_copy = new ArrayList<String>(attributeNames);
+				attributeNames_copy.remove(choosenDomain.getName());
+
+				if (filtered_facts.get(value).isEmpty()) {
+					/* majority !!!! */
+					LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
+					majorityNode.setRank(-1.0); // classifying nothing
+					majorityNode.setNumSupporter(0);
+					this.NUM_NODES ++;
+					majorityNode.setID(this.NUM_NODES);
+					matching_facts.put(majorityNode, new ArrayList<Fact>());
+					currentNode.putNode(value, majorityNode);
+				} else {
+					TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
+					currentNode.putNode(value, newNode);
+				}
+			}
+		}
+
+		return currentNode;
+	}
+	
+	private void remove_matching_facts(TreeNode current) {
+		matching_facts.remove(current);
+		//this.NUM_NODES --; /* SHOULD I */
+		for (Object childKey: current.getChildrenKeys()) {
+			remove_matching_facts(current.getChild(childKey));
+			
+		}
+	}
+
+	public int getNum_fact_trained() {
+		// TODO Auto-generated method stub
+		return builder.getNum_fact_trained();
+	}
+
+
+	public List<Integer> test(DecisionTree tree, List<Fact> facts) {
+		return builder.test( tree, facts);
+	}
+
+	public HashMap<TreeNode, ArrayList<Fact>> getMatchingFacts() {		
+		return matching_facts;
+		// TODO hide this information better
+	}
+
+}

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/DecisionTreeBuilderMT.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/DecisionTreeBuilderMT.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/DecisionTreeBuilderMT.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -34,7 +34,7 @@
 		@Override
 		public void run() {
 			result = builder.id3(dt, facts, attributeNames);
-			currentNode.addNode(value, result);
+			currentNode.putNode(value, result);
 		}
 	}
 
@@ -252,13 +252,13 @@
 				/* majority !!!! */
 				LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
 				majorityNode.setRank(0.0);
-				currentNode.addNode(value, majorityNode);
+				currentNode.putNode(value, majorityNode);
 			} else {
 //				TreeNode newNode = id3(dt, filtered_facts.get(value), attributeNames_copy);
 //				currentNode.addNode(value, newNode);
 				if (helper.isAlive()) {
 					TreeNode newNode = id3(dt, filtered_facts.get(value), attributeNames_copy);
-					currentNode.addNode(value, newNode);
+					currentNode.putNode(value, newNode);
 				}
 				else {
 					helper.attributeNames = attributeNames_copy;

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/Entropy.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/Entropy.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/Entropy.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -18,7 +18,7 @@
 public class Entropy implements InformationMeasure {
 	
 	
-	public static Domain<?> chooseContAttribute(DecisionTree dt, List<Fact> facts,
+	public static Domain<?> chooseBothAttribute(DecisionTree dt, List<Fact> facts,
 			FactDistribution facts_in_class, List<String> attrs) {
 
 		double dt_info = calc_info(facts_in_class);

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/IDTreeBuilder.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/IDTreeBuilder.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/IDTreeBuilder.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -35,7 +35,7 @@
 		@Override
 		public void run() {
 			result = builder.train(dt, facts, attributeNames);
-			currentNode.addNode(value, result);
+			currentNode.putNode(value, result);
 		}
 	}
 	
@@ -200,10 +200,10 @@
 				LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
 				majorityNode.setRank(-1.0);
 				majorityNode.setNumSupporter(filtered_facts.get(value).size());
-				currentNode.addNode(value, majorityNode);
+				currentNode.putNode(value, majorityNode);
 			} else {
 				TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
-				currentNode.addNode(value, newNode);
+				currentNode.putNode(value, newNode);
 			}
 		}
 

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/LiteralDomain.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/LiteralDomain.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/LiteralDomain.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -2,6 +2,7 @@
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.Comparator;
 import java.util.List;
 
@@ -88,7 +89,7 @@
 		} else {
 			String str_value = (String)value;
 			
-			
+			int insertion_point = Collections.binarySearch(fValues, str_value);
 			/*
 			 * index of the search key, if it is contained in the list; otherwise, (-(insertion point) - 1). 
 			 * The insertion point is defined as the point at which the key would be inserted into the list: 
@@ -96,15 +97,16 @@
 			 * list are less than the specified key. Note that this guarantees that the return value will be >= 0 
 			 * if and only if the key is found.
 			 */
-			/*
-			int insertion_point = Collections.binarySearch(fValues, str_value, sComparator);
 			if (insertion_point >= 0) {
 				return fValues.get(insertion_point);
 			} else {
-				return fValues.get(-(insertion_point));
+				int unfound_insertion_point = -(insertion_point) -1;
+				if (unfound_insertion_point >= fValues.size()) {
+					//System.out.println("insertion point is the size domain "+this);
+					unfound_insertion_point = fValues.size() -1;  
+				}
+				return fValues.get(unfound_insertion_point);
 			}
-			*/
-			return str_value;
 		}
 		
 	}

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/NumericDomain.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/NumericDomain.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/NumericDomain.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -100,7 +100,12 @@
 			if (insertion_point >= 0) {
 				return fValues.get(insertion_point);
 			} else {
-				return fValues.get(-(insertion_point) -1);
+				int unfound_insertion_point = -(insertion_point) -1;
+				if (unfound_insertion_point >= fValues.size()) {
+					//System.out.println("insertion point is the size domain "+this);
+					unfound_insertion_point = fValues.size() -1;  
+				}
+				return fValues.get(unfound_insertion_point);
 			}
 		}
 		
@@ -187,7 +192,10 @@
 	}
 	
 	public String toString() {
-		String out = fName;
+		String out = fName + "";
+		for (Object v: fValues) {
+			out += "-" + v;
+		}
 		return out;
 	}
 	

Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/tools/FactProcessor.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/tools/FactProcessor.java	2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/tools/FactProcessor.java	2008-04-26 04:35:11 UTC (rev 19727)
@@ -2,6 +2,7 @@
 
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.Comparator;
 import java.util.Hashtable;
 import java.util.Iterator;
 import java.util.List;
@@ -11,7 +12,8 @@
 import dt.memory.FactDistribution;
 
 public class FactProcessor {
-
+	
+	/* splitting during the training for C45TreeIterator */
 	public static Hashtable<Object, ArrayList<Fact>> splitFacts(ArrayList<Fact> facts, Domain<?> choosenDomain) {
 		if (choosenDomain.isDiscrete()) {	
 			return FactProcessor.splitFacts_disc(facts, choosenDomain);
@@ -108,6 +110,8 @@
 		
 		return factLists;
 	}
+	
+	/* splitting during the training for C45TreeBuilder */
 	public static Hashtable<Object, List<Fact>> splitFacts(List<Fact> facts, Domain<?> choosenDomain) {
 		if (choosenDomain.isDiscrete()) {	
 			return FactProcessor.splitFacts_disc(facts, choosenDomain);
@@ -116,8 +120,94 @@
 			return FactProcessor.splitFacts_cont_opt(facts, choosenDomain);
 		}
 	}
+	/* splitting during the re_training (only new facts) for C45TreeIterator */
+	public static Hashtable<Object, ArrayList<Fact>> splitNewFacts(ArrayList<Fact> new_facts, Domain<?> choosenDomain) {
+		if (choosenDomain.isDiscrete()) {	
+			return FactProcessor.splitFacts_disc(new_facts, choosenDomain);
+		} else {
+			Collections.sort(new_facts, choosenDomain.factComparator()); /* hack*/
+			return FactProcessor.splitNewFacts_cont_opt(new_facts, choosenDomain);
+		}
+	}
 	
-	
+	/* it must work */
+	private static Hashtable<Object, ArrayList<Fact>> splitNewFacts_cont_opt(ArrayList<Fact> facts, Domain<?> attributeDomain) {
+		
+		String attributeName = attributeDomain.getName();
+		
+		if (Util.DEBUG) System.out.println("FactProcessor.splitFacts_cont() attr_split "+ attributeName);
+		
+		List<?> splitValues = attributeDomain.getValues();
+		if (Util.DEBUG) {
+			List<Integer> splitIndices = attributeDomain.getIndices();
+			System.out.println("FactProcessor.splitFacts_cont() haniymis benim repsentativelerim: "+ splitValues.size() + " and the split points "+ splitIndices.size());
+			
+			System.out.println("FactProcessor.splitFacts_cont() before splitting "+ facts.size());
+			
+			int index = 0;
+			int split_index = 0;
+			Object attr_key = splitValues.get(split_index);
+			for (Fact f : facts) {
+				
+				if (index == splitIndices.get(split_index).intValue()+1 ) {
+					System.out.print("PRINT* (");
+					attr_key = splitValues.get(split_index+1);
+					split_index++;	
+				} else {
+					System.out.print("PRINT (");
+				}
+				System.out.println(split_index+"): fact "+f);
+				index++;
+			}
+		
+		}
+		
+		Hashtable<Object, ArrayList<Fact>> factLists = new Hashtable<Object, ArrayList<Fact>>(splitValues.size());
+		for (Object v: splitValues) {
+			factLists.put(v, new ArrayList<Fact>());
+		}
+		int begin_index = 0;
+//		Fact fact_ = facts.get(begin_index);
+		Comparator<Fact> attrComp_ = attributeDomain.factComparator();
+		int split_index = 0, last_index = 0 ;
+		Object attr_key = splitValues.get(split_index);
+		Fact pseudo = new Fact();
+		try {
+			pseudo.add(attributeDomain, attr_key);
+			for (Fact f : facts) {
+				
+				if ( attrComp_.compare(f, pseudo) <= 0) {
+					System.out.print("PRINT (");
+				} else {
+					// attrComp_.compare(f, pseudo) > 0
+					System.out.print("PRINT* (");
+					if (Util.DEBUG) {
+						System.out.println("FactProcessor.splitFacts_cont() new category: "+ attr_key );
+						System.out.println(" ("+begin_index+","+last_index+")");
+					}
+					
+					ArrayList<Fact> temp = new ArrayList<Fact>(last_index+1-begin_index+1);
+					temp.addAll(facts.subList(begin_index, last_index+1));
+					factLists.put(attr_key, temp);
+					begin_index = last_index+1;
+					
+					split_index++;
+					attr_key = splitValues.get(split_index);
+					pseudo = new Fact();
+					pseudo.add(attributeDomain, attr_key);
+					
+				}
+					
+				last_index++;
+			
+			}
+		} catch (Exception e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+		
+		return factLists;
+	}
 	public static Hashtable<Object, List<Fact>> splitFacts_disc(List<Fact> facts, Domain<?> choosenDomain) {
 		String attributeName = choosenDomain.getName();
 		List<?> attributeValues = choosenDomain.getValues();




More information about the jboss-svn-commits mailing list