[jboss-svn-commits] JBL Code SVN: r19727 - in labs/jbossrules/contrib/machinelearning/decisiontree/src/dt: builder and 2 other directories.
jboss-svn-commits at lists.jboss.org
jboss-svn-commits at lists.jboss.org
Sat Apr 26 00:35:13 EDT 2008
Author: gizil
Date: 2008-04-26 00:35:11 -0400 (Sat, 26 Apr 2008)
New Revision: 19727
Added:
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeIterator.java
Modified:
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/DecisionTree.java
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/TreeNode.java
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeBuilder.java
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/DecisionTreeBuilderMT.java
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/Entropy.java
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/IDTreeBuilder.java
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/LiteralDomain.java
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/NumericDomain.java
labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/tools/FactProcessor.java
Log:
re_training OK with huge data sets, except for the Java heap space error while trying to load a tree with more than 20000 facts -> start boosting
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/DecisionTree.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/DecisionTree.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/DecisionTree.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -104,11 +104,11 @@
public String toString(HashMap <TreeNode, ArrayList<Fact>> _facts) {
String out = "Facts scanned " + FACTS_READ + "\n";
- System.out.println("!!Printing tree: \n"+ Util.ntimes("\n", 3));
- for (TreeNode obj_node : _facts.keySet())
- System.out.println("* o.id:"+obj_node.getID()+ " o.d:"+obj_node.getDomain()+ " o.h:"+ obj_node.hashCode()+ " => "+_facts.get(obj_node) );
-
- System.out.println("!!Had print tree"+ Util.ntimes("\n", 3));
+// System.out.println("!!Printing tree: \n"+ Util.ntimes("\n", 3));
+// for (TreeNode obj_node : _facts.keySet())
+// System.out.println("* o.id:"+obj_node.getID()+ " o.d:"+obj_node.getDomain()+ " o.h:"+ obj_node.hashCode()+ " => "+_facts.get(obj_node) );
+//
+// System.out.println("!!Had print tree"+ Util.ntimes("\n", 3));
return out + root.toString(_facts);
}
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/TreeNode.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/TreeNode.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/TreeNode.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -48,7 +48,7 @@
}
- public void addNode(Object attributeValue, TreeNode node) {
+ public void putNode(Object attributeValue, TreeNode node) {
children.put(attributeValue, node);
}
@@ -135,10 +135,11 @@
}
public String toString(int tab, int depth, StringBuffer buf) {
- if (depth > 0 && domain != null) {
+ //if (depth > 0 && domain != null) {
+ if (domain != null) {
buf.append(Util.ntimes("\t", tab));
buf.append(Util.ntimes("***",1));
- buf.append( domain.getName() + " \n");
+ buf.append( domain.getName() + " n.h:"+this.hashCode()+ " \n");
for (Object attributeValue : children.keySet()) {
buf.append(Util.ntimes("\t", tab + 1));
buf.append("+" + attributeValue );
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeBuilder.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeBuilder.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeBuilder.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -32,7 +32,7 @@
@Override
public void run() {
result = builder.train(dt, facts, attributeNames);
- currentNode.addNode(value, result);
+ currentNode.putNode(value, result);
}
}
@@ -329,7 +329,7 @@
/* choosing the attribute for the branching starts */
// String chosenAttribute = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
// List<?> categorization = dt.getPossibleValues(chosenAttribute);
- Domain<?> choosenDomain = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
+ Domain<?> choosenDomain = Entropy.chooseBothAttribute(dt, facts, stats, attributeNames);
if (Util.RUN) System.out.println(Util.ntimes("*", 20) + " 1st best attr: "+ choosenDomain.getName());
TreeNode currentNode = new TreeNode(choosenDomain);
@@ -355,10 +355,10 @@
LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
majorityNode.setRank(-1.0); // classifying nothing
majorityNode.setNumSupporter(filtered_facts.get(value).size());
- currentNode.addNode(value, majorityNode);
+ currentNode.putNode(value, majorityNode);
} else {
TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
- currentNode.addNode(value, newNode);
+ currentNode.putNode(value, newNode);
}
}
@@ -426,7 +426,7 @@
/* choosing the attribute for the branching starts */
// String chosenAttribute = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
// List<?> categorization = dt.getPossibleValues(chosenAttribute);
- Domain<?> choosenDomain = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
+ Domain<?> choosenDomain = Entropy.chooseBothAttribute(dt, facts, stats, attributeNames);
if (Util.RUN) System.out.println(Util.ntimes("*", 20) + " 1st best attr: "+ choosenDomain.getName());
else if (FUNC_CALL % 100 ==0){
System.out.print(".");
@@ -464,7 +464,7 @@
childNode = majorityNode; // How to set this guy
if (childNode == null)
- currentNode.addNode(value, childNode);
+ currentNode.putNode(value, childNode);
}
else {
@@ -481,7 +481,7 @@
if (childNode == null) { // there was no node assigned for that object value
childNode = train(dt, filtered_facts.get(value), attributeNames_copy);
- currentNode.addNode(value, childNode);
+ currentNode.putNode(value, childNode);
}
else {
TreeNode newNode = re_train(dt, childNode, filtered_facts.get(value), attributeNames_copy);
@@ -506,11 +506,11 @@
LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
majorityNode.setRank(-1.0); // classifying nothing
majorityNode.setNumSupporter(filtered_facts.get(value).size());
- currentNode.addNode(value, majorityNode);
+ currentNode.putNode(value, majorityNode);
} else {
TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
- currentNode.addNode(value, newNode);
+ currentNode.putNode(value, newNode);
}
}
}
Added: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeIterator.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeIterator.java (rev 0)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/C45TreeIterator.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -0,0 +1,345 @@
+package dt.builder;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Hashtable;
+import java.util.List;
+
+import dt.DecisionTree;
+import dt.LeafNode;
+import dt.TreeNode;
+
+import dt.memory.Domain;
+import dt.memory.Fact;
+import dt.memory.FactDistribution;
+import dt.tools.FactProcessor;
+import dt.tools.Util;
+
+public class C45TreeIterator implements Serializable{
+
+ /**
+ *
+ */
+ private static final long serialVersionUID = 1L;
+ private C45TreeBuilder builder;
+ private HashMap <TreeNode, ArrayList<Fact>> matching_facts;
+ private int NUM_NODES;
+
+ public C45TreeIterator(C45TreeBuilder my_builder) {
+ builder = my_builder;
+ matching_facts = new HashMap<TreeNode, ArrayList<Fact>>();
+ NUM_NODES = 0;
+ }
+
+ /* building with the training set (some part of the facts) */
+ public DecisionTree build_to_iterate(Class<?> klass, ArrayList<Fact> first_facts) {
+ /* gets the facts which the decision tree is eligible */
+ //setKlass(klass);
+
+ DecisionTree dt = new DecisionTree(klass.getName());
+ builder.init_dt(dt, builder.getTarget()); // initialize the decision tree with the target and all domains
+
+ ArrayList<String> attrs = new ArrayList<String>(dt.getAttributes());
+ Collections.sort(attrs);
+
+ builder.add_to_training(first_facts); /* you must set this when the training called the first time */
+ dt.FACTS_READ += first_facts.size();
+
+ //while ()
+ TreeNode root = train(dt, first_facts, attrs);
+ dt.setRoot(root);
+
+
+ return dt;
+ }
+
+ public TreeNode train(DecisionTree dt, ArrayList<Fact> facts, List<String> attributeNames) {
+
+ builder.FUNC_CALL++;
+ if (facts.size() == 0) {
+ throw new RuntimeException("Nothing to classify, factlist is empty");
+ }
+ /* let's get the statistics of the results */
+ // List<?> targetValues = dt.getPossibleValues(dt.getTarget());
+ //Hashtable<Object, Integer> stats_ = dt.getStatistics(facts, dt.getTarget());// targetValues
+
+ //FactTargetDistribution stats = dt.getDistribution(facts);
+
+ FactDistribution stats = new FactDistribution(dt.getDomain(dt.getTarget()));
+ stats.calculateDistribution(facts);
+ stats.evaluateMajority();
+
+ /* if all elements are classified to the same value */
+ if (stats.getNum_supported_target_classes() == 1) {
+
+ LeafNode classifiedNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
+ classifiedNode.setRank((double) facts.size()/(double) builder.getNum_fact_trained());
+ classifiedNode.setNumSupporter(facts.size());
+ this.NUM_NODES ++;
+ classifiedNode.setID(this.NUM_NODES);
+ matching_facts.put(classifiedNode, facts);
+
+ return classifiedNode;
+ }
+
+ /* if there is no attribute left in order to continue */
+ if (attributeNames.size() == 0) {
+ /* an heuristic of the leaf classification */
+ Object winner = stats.getThe_winner_target_class();
+ LeafNode noAttributeLeftNode = new LeafNode(dt.getDomain(dt.getTarget()), winner);
+ noAttributeLeftNode.setRank((double) stats.getVoteFor(winner)/ (double) builder.getNum_fact_trained());
+ noAttributeLeftNode.setNumSupporter(stats.getVoteFor(winner));
+ this.NUM_NODES ++;
+ noAttributeLeftNode.setID(this.NUM_NODES);
+ matching_facts.put(noAttributeLeftNode, facts);
+
+ /* we need to know how many guys cannot be classified and who these guys are */
+ FactProcessor.splitUnclassifiedFacts(builder.getUnClassifiedFacts(), stats);
+
+ return noAttributeLeftNode;
+ }
+
+ /* choosing the attribute for the branching starts */
+// String chosenAttribute = Entropy.chooseContAttribute(dt, facts, stats, attributeNames);
+// List<?> categorization = dt.getPossibleValues(chosenAttribute);
+ Domain<?> choosenDomain = Entropy.chooseBothAttribute(dt, facts, stats, attributeNames);
+ if (Util.RUN) System.out.println(Util.ntimes("*", 20) + " 1st best attr: "+ choosenDomain.getName());
+
+ TreeNode currentNode = new TreeNode(choosenDomain);
+ this.NUM_NODES ++;
+ currentNode.setID(this.NUM_NODES);
+ matching_facts.put(currentNode, facts);
+
+
+ Hashtable<Object, ArrayList<Fact>> filtered_facts = FactProcessor.splitFacts(facts, choosenDomain);
+
+ for (Object value : filtered_facts.keySet()) {
+ if (filtered_facts.get(value).isEmpty()){
+ @SuppressWarnings("unused")
+ boolean bok = true;
+ }
+ }
+ dt.FACTS_READ += facts.size();
+
+ for (Object value : filtered_facts.keySet()) {
+ /* split the last two class at the same time */
+ ArrayList<String> attributeNames_copy = new ArrayList<String>(
+ attributeNames);
+ attributeNames_copy.remove(choosenDomain.getName());
+
+ if (filtered_facts.get(value).isEmpty()) {
+ /* majority !!!! */
+ LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
+ majorityNode.setRank(-1.0); // classifying nothing
+ majorityNode.setNumSupporter(0);
+ this.NUM_NODES ++;
+ majorityNode.setID(this.NUM_NODES);
+ matching_facts.put(majorityNode, new ArrayList<Fact>());
+ currentNode.putNode(value, majorityNode);
+ } else {
+ TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
+// this.NUM_NODES ++;
+// newNode.setID(this.NUM_NODES);
+ currentNode.putNode(value, newNode);
+ }
+ }
+
+ return currentNode;
+ }
+
+ /* building with the training set (some part of the facts) */
+ public DecisionTree re_build(DecisionTree dt, ArrayList<Fact> new_facts) {
+
+ ArrayList<String> attrs = new ArrayList<String>(dt.getAttributes());
+ Collections.sort(attrs);
+
+ builder.add_to_training(new_facts);
+ dt.FACTS_READ += new_facts.size();
+
+ //System.out.println(Util.ntimes("\n", 10)+"How facts are u training? "+ training_facts.size());
+ //while ()
+ TreeNode root = re_train(dt, dt.getRoot(), new_facts, attrs);
+ dt.setRoot(root);
+
+ return dt;
+ }
+
+ public TreeNode re_train(DecisionTree dt, TreeNode currentNode, ArrayList<Fact> new_facts, List<String> attributeNames) {
+
+ builder.FUNC_CALL++;
+ if (new_facts.size() == 0) {
+ throw new RuntimeException("Nothing new to classify, new fact list is empty");
+ }
+ /* let's get the statistics of the results */
+ // List<?> targetValues = dt.getPossibleValues(dt.getTarget());
+ //Hashtable<Object, Integer> stats_ = dt.getStatistics(facts, dt.getTarget());// targetValues
+
+ //FactTargetDistribution stats = dt.getDistribution(facts);
+ ArrayList<Fact> currentFacts = matching_facts.get(currentNode);
+ currentFacts.addAll(new_facts);
+ FactDistribution stats = new FactDistribution(dt.getDomain(dt.getTarget()));
+ stats.calculateDistribution(currentFacts);
+ stats.evaluateMajority();
+
+ /* if all elements are classified to the same value */
+ if (stats.getNum_supported_target_classes() == 1) {
+ LeafNode classifiedNode;
+ if (currentNode instanceof LeafNode) {
+ classifiedNode = (LeafNode)currentNode;
+ } else {
+
+ classifiedNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
+ classifiedNode.setID(currentNode.getID());
+ }
+ classifiedNode.setRank((double) currentFacts.size()/(double) builder.getNum_fact_trained());
+ classifiedNode.setNumSupporter(currentFacts.size());
+ matching_facts.put(classifiedNode, currentFacts);//?
+ return classifiedNode;
+ }
+
+ /* if there is no attribute left in order to continue */
+ if (attributeNames.size() == 0) {
+ /* an heuristic of the leaf classification */
+ LeafNode noAttributeLeftNode;
+ Object winner = stats.getThe_winner_target_class();
+ if (currentNode instanceof LeafNode) {
+ noAttributeLeftNode = (LeafNode)currentNode;
+ } else {
+ noAttributeLeftNode = new LeafNode(dt.getDomain(dt.getTarget()), winner);
+ noAttributeLeftNode.setID(currentNode.getID());
+ }
+ noAttributeLeftNode.setRank((double) stats.getVoteFor(winner)/ (double) builder.num_fact_trained);
+ noAttributeLeftNode.setNumSupporter(stats.getVoteFor(winner));
+
+ /* we need to know how many guys cannot be classified and who these guys are */
+ FactProcessor.splitUnclassifiedFacts(builder.getUnClassifiedFacts(), stats);
+ matching_facts.put(noAttributeLeftNode, currentFacts);
+ return noAttributeLeftNode;
+ }
+
+ /* choosing the attribute for the branching starts */
+ Domain<?> choosenDomain = Entropy.chooseBothAttribute(dt, currentFacts, stats, attributeNames);
+ if (Util.RUN) System.out.println(Util.ntimes("*", 20) + " 1st best attr: "+ choosenDomain.getName());
+ else if (builder.FUNC_CALL % 100 ==0){
+ System.out.print(".");
+ }
+
+ dt.FACTS_READ += new_facts.size();
+ System.out.println(Util.ntimes("\n", 2)+"RETRAINING_DOMAINS COMP: current: "+ currentNode.getDomain() + " and choosen "+ choosenDomain + " == " + (currentNode.getDomain().equals(choosenDomain)) );
+
+ if (currentNode.getDomain().equals(choosenDomain)) {
+
+ Hashtable<Object, ArrayList<Fact>> filtered_facts = FactProcessor.splitNewFacts(new_facts, choosenDomain);
+ /* split the last two class at the same time */
+ for (Object value : filtered_facts.keySet()) {
+
+ TreeNode childNode = currentNode.getChild(value);
+ List<Fact> matching_split = matching_facts.get(childNode);
+
+ ArrayList<String> attributeNames_copy = new ArrayList<String>(
+ attributeNames);
+ attributeNames_copy.remove(choosenDomain.getName());
+ if (childNode == null) {
+ System.out.println("the child node is null how come? ");
+ /*
+ * there was no node assigned for that object value
+ * so you need to re_train for the filtered fact set
+ */
+ /* ???????
+ childNode = train(dt, filtered_facts.get(value), attributeNames_copy);
+ currentNode.addNode(value, childNode); */
+ System.exit(1);
+ }
+
+ if (filtered_facts.get(value).isEmpty()) {
+ /* there is no new matching guy to that branch
+ * everything should stay the same
+ * what can change???
+ */
+ System.out.println("there is no new matching guy to that branch? No change in "+childNode);
+
+ } else {
+ childNode = re_train(dt, childNode, filtered_facts.get(value), attributeNames_copy);
+ currentNode.putNode(value, childNode);
+ }
+ }
+
+ } else {
+ /* there are two ways
+ * 1. i can call the train function for that set of facts and
+ * add the root to the current place
+ */
+ /*
+ ArrayList<String> attributeNames_ = new ArrayList<String>(attributeNames);
+ //attributeNames_copy.remove(choosenDomain.getName());
+ currentNode = train(dt, currentFacts, attributeNames_);
+ */
+
+ /* 2. i can split the all facts according to the places i found and continue
+ *
+ */
+
+ /* before re_training on this guy you have to remove the existing fact
+ * lists from the matching fact lists hashmap
+ */
+
+
+ int old_id = currentNode.getID();
+ remove_matching_facts(currentNode);
+ Hashtable<Object, ArrayList<Fact>> filtered_facts = FactProcessor.splitFacts(currentFacts, choosenDomain);
+ currentNode = new TreeNode(choosenDomain);
+ currentNode.setID(old_id);
+ matching_facts.put(currentNode, currentFacts);
+
+ for (Object value : filtered_facts.keySet()) {
+ /* split the last two class at the same time */
+
+ ArrayList<String> attributeNames_copy = new ArrayList<String>(attributeNames);
+ attributeNames_copy.remove(choosenDomain.getName());
+
+ if (filtered_facts.get(value).isEmpty()) {
+ /* majority !!!! */
+ LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
+ majorityNode.setRank(-1.0); // classifying nothing
+ majorityNode.setNumSupporter(0);
+ this.NUM_NODES ++;
+ majorityNode.setID(this.NUM_NODES);
+ matching_facts.put(majorityNode, new ArrayList<Fact>());
+ currentNode.putNode(value, majorityNode);
+ } else {
+ TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
+ currentNode.putNode(value, newNode);
+ }
+ }
+ }
+
+ return currentNode;
+ }
+
+ private void remove_matching_facts(TreeNode current) {
+ matching_facts.remove(current);
+ //this.NUM_NODES --; /* SHOULD I */
+ for (Object childKey: current.getChildrenKeys()) {
+ remove_matching_facts(current.getChild(childKey));
+
+ }
+ }
+
+ public int getNum_fact_trained() {
+ // TODO Auto-generated method stub
+ return builder.getNum_fact_trained();
+ }
+
+
+ public List<Integer> test(DecisionTree tree, List<Fact> facts) {
+ return builder.test( tree, facts);
+ }
+
+ public HashMap<TreeNode, ArrayList<Fact>> getMatchingFacts() {
+ return matching_facts;
+ // TODO hide this information better
+ }
+
+}
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/DecisionTreeBuilderMT.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/DecisionTreeBuilderMT.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/DecisionTreeBuilderMT.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -34,7 +34,7 @@
@Override
public void run() {
result = builder.id3(dt, facts, attributeNames);
- currentNode.addNode(value, result);
+ currentNode.putNode(value, result);
}
}
@@ -252,13 +252,13 @@
/* majority !!!! */
LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
majorityNode.setRank(0.0);
- currentNode.addNode(value, majorityNode);
+ currentNode.putNode(value, majorityNode);
} else {
// TreeNode newNode = id3(dt, filtered_facts.get(value), attributeNames_copy);
// currentNode.addNode(value, newNode);
if (helper.isAlive()) {
TreeNode newNode = id3(dt, filtered_facts.get(value), attributeNames_copy);
- currentNode.addNode(value, newNode);
+ currentNode.putNode(value, newNode);
}
else {
helper.attributeNames = attributeNames_copy;
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/Entropy.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/Entropy.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/Entropy.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -18,7 +18,7 @@
public class Entropy implements InformationMeasure {
- public static Domain<?> chooseContAttribute(DecisionTree dt, List<Fact> facts,
+ public static Domain<?> chooseBothAttribute(DecisionTree dt, List<Fact> facts,
FactDistribution facts_in_class, List<String> attrs) {
double dt_info = calc_info(facts_in_class);
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/IDTreeBuilder.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/IDTreeBuilder.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/builder/IDTreeBuilder.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -35,7 +35,7 @@
@Override
public void run() {
result = builder.train(dt, facts, attributeNames);
- currentNode.addNode(value, result);
+ currentNode.putNode(value, result);
}
}
@@ -200,10 +200,10 @@
LeafNode majorityNode = new LeafNode(dt.getDomain(dt.getTarget()), stats.getThe_winner_target_class());
majorityNode.setRank(-1.0);
majorityNode.setNumSupporter(filtered_facts.get(value).size());
- currentNode.addNode(value, majorityNode);
+ currentNode.putNode(value, majorityNode);
} else {
TreeNode newNode = train(dt, filtered_facts.get(value), attributeNames_copy);
- currentNode.addNode(value, newNode);
+ currentNode.putNode(value, newNode);
}
}
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/LiteralDomain.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/LiteralDomain.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/LiteralDomain.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -2,6 +2,7 @@
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.Comparator;
import java.util.List;
@@ -88,7 +89,7 @@
} else {
String str_value = (String)value;
-
+ int insertion_point = Collections.binarySearch(fValues, str_value);
/*
* index of the search key, if it is contained in the list; otherwise, (-(insertion point) - 1).
* The insertion point is defined as the point at which the key would be inserted into the list:
@@ -96,15 +97,16 @@
* list are less than the specified key. Note that this guarantees that the return value will be >= 0
* if and only if the key is found.
*/
- /*
- int insertion_point = Collections.binarySearch(fValues, str_value, sComparator);
if (insertion_point >= 0) {
return fValues.get(insertion_point);
} else {
- return fValues.get(-(insertion_point));
+ int unfound_insertion_point = -(insertion_point) -1;
+ if (unfound_insertion_point >= fValues.size()) {
+ //System.out.println("insestion point is the size domain "+this);
+ unfound_insertion_point = fValues.size() -1;
+ }
+ return fValues.get(unfound_insertion_point);
}
- */
- return str_value;
}
}
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/NumericDomain.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/NumericDomain.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/memory/NumericDomain.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -100,7 +100,12 @@
if (insertion_point >= 0) {
return fValues.get(insertion_point);
} else {
- return fValues.get(-(insertion_point) -1);
+ int unfound_insertion_point = -(insertion_point) -1;
+ if (unfound_insertion_point >= fValues.size()) {
+ //System.out.println("insestion point is the size domain "+this);
+ unfound_insertion_point = fValues.size() -1;
+ }
+ return fValues.get(unfound_insertion_point);
}
}
@@ -187,7 +192,10 @@
}
public String toString() {
- String out = fName;
+ String out = fName + "";
+ for (Object v: fValues) {
+ out += "-" + v;
+ }
return out;
}
Modified: labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/tools/FactProcessor.java
===================================================================
--- labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/tools/FactProcessor.java 2008-04-26 02:02:44 UTC (rev 19726)
+++ labs/jbossrules/contrib/machinelearning/decisiontree/src/dt/tools/FactProcessor.java 2008-04-26 04:35:11 UTC (rev 19727)
@@ -2,6 +2,7 @@
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Comparator;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
@@ -11,7 +12,8 @@
import dt.memory.FactDistribution;
public class FactProcessor {
-
+
+ /* spliting during the training for C45TreeIterator */
public static Hashtable<Object, ArrayList<Fact>> splitFacts(ArrayList<Fact> facts, Domain<?> choosenDomain) {
if (choosenDomain.isDiscrete()) {
return FactProcessor.splitFacts_disc(facts, choosenDomain);
@@ -108,6 +110,8 @@
return factLists;
}
+
+ /* spliting during the training for C45TreeBuilder */
public static Hashtable<Object, List<Fact>> splitFacts(List<Fact> facts, Domain<?> choosenDomain) {
if (choosenDomain.isDiscrete()) {
return FactProcessor.splitFacts_disc(facts, choosenDomain);
@@ -116,8 +120,94 @@
return FactProcessor.splitFacts_cont_opt(facts, choosenDomain);
}
}
+ /* spliting during the re_training (only new facts) for C45TreeIterator */
+ public static Hashtable<Object, ArrayList<Fact>> splitNewFacts(ArrayList<Fact> new_facts, Domain<?> choosenDomain) {
+ if (choosenDomain.isDiscrete()) {
+ return FactProcessor.splitFacts_disc(new_facts, choosenDomain);
+ } else {
+ Collections.sort(new_facts, choosenDomain.factComparator()); /* hack*/
+ return FactProcessor.splitNewFacts_cont_opt(new_facts, choosenDomain);
+ }
+ }
-
+ /* it must work */
+ private static Hashtable<Object, ArrayList<Fact>> splitNewFacts_cont_opt(ArrayList<Fact> facts, Domain<?> attributeDomain) {
+
+ String attributeName = attributeDomain.getName();
+
+ if (Util.DEBUG) System.out.println("FactProcessor.splitFacts_cont() attr_split "+ attributeName);
+
+ List<?> splitValues = attributeDomain.getValues();
+ if (Util.DEBUG) {
+ List<Integer> splitIndices = attributeDomain.getIndices();
+ System.out.println("FactProcessor.splitFacts_cont() haniymis benim repsentativelerim: "+ splitValues.size() + " and the split points "+ splitIndices.size());
+
+ System.out.println("FactProcessor.splitFacts_cont() before splitting "+ facts.size());
+
+ int index = 0;
+ int split_index = 0;
+ Object attr_key = splitValues.get(split_index);
+ for (Fact f : facts) {
+
+ if (index == splitIndices.get(split_index).intValue()+1 ) {
+ System.out.print("PRINT* (");
+ attr_key = splitValues.get(split_index+1);
+ split_index++;
+ } else {
+ System.out.print("PRINT (");
+ }
+ System.out.println(split_index+"): fact "+f);
+ index++;
+ }
+
+ }
+
+ Hashtable<Object, ArrayList<Fact>> factLists = new Hashtable<Object, ArrayList<Fact>>(splitValues.size());
+ for (Object v: splitValues) {
+ factLists.put(v, new ArrayList<Fact>());
+ }
+ int begin_index = 0;
+// Fact fact_ = facts.get(begin_index);
+ Comparator<Fact> attrComp_ = attributeDomain.factComparator();
+ int split_index = 0, last_index = 0 ;
+ Object attr_key = splitValues.get(split_index);
+ Fact pseudo = new Fact();
+ try {
+ pseudo.add(attributeDomain, attr_key);
+ for (Fact f : facts) {
+
+ if ( attrComp_.compare(f, pseudo) <= 0) {
+ System.out.print("PRINT (");
+ } else {
+ // attrComp_.compare(f, pseudo) > 0
+ System.out.print("PRINT* (");
+ if (Util.DEBUG) {
+ System.out.println("FactProcessor.splitFacts_cont() new category: "+ attr_key );
+ System.out.println(" ("+begin_index+","+last_index+")");
+ }
+
+ ArrayList<Fact> temp = new ArrayList<Fact>(last_index+1-begin_index+1);
+ temp.addAll(facts.subList(begin_index, last_index+1));
+ factLists.put(attr_key, temp);
+ begin_index = last_index+1;
+
+ split_index++;
+ attr_key = splitValues.get(split_index);
+ pseudo = new Fact();
+ pseudo.add(attributeDomain, attr_key);
+
+ }
+
+ last_index++;
+
+ }
+ } catch (Exception e) {
+ // TODO Auto-generated catch block
+ e.printStackTrace();
+ }
+
+ return factLists;
+ }
public static Hashtable<Object, List<Fact>> splitFacts_disc(List<Fact> facts, Domain<?> choosenDomain) {
String attributeName = choosenDomain.getName();
List<?> attributeValues = choosenDomain.getValues();
More information about the jboss-svn-commits
mailing list