Decision Tree

assert np.inf == Node(1, 0.9, [1,2,3]).split_score

test_x = np.array(
    [[23.2, 44.4], #0
     [ 2. ,  2. ], #1
     [34.3, 77.3], #2
     [-1.5, -0.5], #3
     [ 1.5,  1.5], #4
     [ 1.5,  9.2], #5
     [ 2. , -2. ]])#6
test_y = np.array([0.0, 1.1, 2.2, 3.3, 4.4, 5.5, 6.6])
test_data = DataWrapper.from_pandas(pd.DataFrame(test_x), pd.Series(test_y))

test_node = Node(0, 0, np.arange(7))
best_split_for_col(test_data, test_node, 0, 3)
le_split, gt_split = test_node.split_idxs
assert np.array_equal(test_y[le_split], [3.3, 4.4, 5.5])
assert np.array_equal(test_y[gt_split], [1.1, 6.6, 0. , 2.2])

test_node = Node(0, 0, np.arange(7))
best_split_for_col(test_data, test_node, 1)
le_split, gt_split = test_node.split_idxs
assert np.array_equal(test_y[le_split], [6.6])
assert np.array_equal(test_y[gt_split], [3.3, 4.4, 1.1, 5.5, 0. , 2.2])

# not enough data to split with at least 4 values in each leaf
test_node = Node(0, 0, np.arange(7))
best_split_for_col(test_data, test_node, 1, 4)
assert test_node.split_score == np.inf

test_node = Node(0, 0, np.arange(7)[4:])
test_split = best_split_for_col(test_data, test_node, 0, 1)
le_split, gt_split = test_node.split_idxs
assert np.array_equal(test_y[le_split], [4.4, 5.5])
assert np.array_equal(test_y[gt_split], [6.6])
assert test_node.split_values == (1.5, 2.0)

test_node = Node(0, 0, np.arange(7))
best_split(test_data, test_node, [0,1]); test_split
assert test_node.split_col_idx == 1
assert test_node.split_preds == (6.6, 2.75)

print(test_y)
print_tree(DecisionTree(test_data, min_leaf_samples=2).fit())

[0.  1.1 2.2 3.3 4.4 5.5 6.6]
dTree(data=DataWrapper(x:[0 1] y:y, len:7) max_depth=None min_leaf_samples=2)
col_idxs_fn default
Node(1, 3.3, [0 1 2 3 4 5 6], 11.663, 0, (2.0, 23.2), (array([3, 4, 5, 1, 6]), array([0, 2])), (4.18, 1.1))
Node(2, 4.18, [3 4 5 1 6], 8.194, 0, (1.5, 2.0), (array([3, 4, 5]), array([1, 6])), (4.4, 3.85))
Node(2, 1.1, [0 2])
Node(3, 4.4, [3 4 5])
Node(3, 3.85, [1 6])

print_tree(DecisionTree(test_data, min_leaf_samples=1).fit())

dTree(data=DataWrapper(x:[0 1] y:y, len:7) max_depth=None min_leaf_samples=1)
col_idxs_fn default
Node(1, 3.3, [0 1 2 3 4 5 6], 11.272, 1, (-2.0, -0.5), (array([6]), array([3, 4, 1, 5, 0, 2])), (6.6, 2.75))
Node(2, 6.6, [6])
Node(2, 2.75, [3 4 1 5 0 2], 5.389, 0, (1.5, 2.0), (array([3, 4, 5]), array([1, 0, 2])), (4.4, 1.1))
Node(3, 4.4, [3 4 5], 1.1, 1, (1.5, 9.2), (array([3, 4]), array([5])), (3.85, 5.5))
Node(3, 1.1, [1 0 2], 1.1, 0, (23.2, 34.3), (array([1, 0]), array([2])), (0.55, 2.2))
Node(4, 3.85, [3 4], 0, 0, (-1.5, 1.5), (array([3]), array([4])), (3.3, 4.4))
Node(4, 5.5, [5])
Node(4, 0.55, [1 0], 0, 0, (2.0, 23.2), (array([1]), array([0])), (1.1, 0.0))
Node(4, 2.2, [2])
Node(5, 3.3, [3])
Node(5, 4.4, [4])
Node(5, 1.1, [1])
Node(5, 0.0, [0])

TODO: xxxx clean-up

Might we be able to generalize better by ;

adding a little randomness by using a split value that lies somewhere between the lower and upper boundary of the split. See np.random.uniform ...
use the average of the lower and upper boundary values

both of these could be done at prediction time

When we make a prediction for a row that was used in training and we have only 1 sample in each leaf, the tree should predict exactly the right answer

grab a single row of data
make a prediction for this row
assert that the prediction we made matches the actual for this row

test_tree = DecisionTree(test_data, min_leaf_samples=1).fit()
for i in range(test_data.x_rows):
    test_sample = test_data.get_sample(i)
    assert predict_row(test_sample[0], test_tree.node) == test_sample[1]

Set-up some data for testing. This data is copied from the final model used in https://github.com/fastai/fastai/tree/master/courses/ml1/lesson2-rf_interpretation.ipynb

bulldozers_data = np.load('test/data/bulldozers.npy', allow_pickle=True)
train_data = DataWrapper(*bulldozers_data[:4])
valid_data = DataWrapper(*bulldozers_data[4:])
train_data, valid_data

(DataWrapper(x:['YearMade' 'Coupler_System' 'ProductSize' 'fiProductClassDesc' 'ModelID'
  'saleElapsed' 'fiSecondaryDesc' 'fiModelDesc' 'Enclosure'
  'fiModelDescriptor' 'Hydraulics_Flow' 'Drive_System' 'ProductGroup'
  'Track_Type' 'state' 'saleDay' 'ProductGroupDesc' 'age'] y:SalePrice, len:389125),
 DataWrapper(x:['YearMade' 'Coupler_System' 'ProductSize' 'fiProductClassDesc' 'ModelID'
  'saleElapsed' 'fiSecondaryDesc' 'fiModelDesc' 'Enclosure'
  'fiModelDescriptor' 'Hydraulics_Flow' 'Drive_System' 'ProductGroup'
  'Track_Type' 'state' 'saleDay' 'ProductGroupDesc' 'age'] y:SalePrice, len:12000))

Use a very small amount of data to train a decision tree, then print the root node so we can see how the data has been split.

It's interesting that the depth of this tree is greater than the expected np.log2(test_tree.data.x_rows) - because it's unbalanced.

test_tree = DecisionTree(train_data.tail(10), min_leaf_samples=1).fit()
print_tree(test_tree)

dTree(data=DataWrapper(x:['YearMade' 'Coupler_System' 'ProductSize' 'fiProductClassDesc' 'ModelID'
 'saleElapsed' 'fiSecondaryDesc' 'fiModelDesc' 'Enclosure'
 'fiModelDescriptor' 'Hydraulics_Flow' 'Drive_System' 'ProductGroup'
 'Track_Type' 'state' 'saleDay' 'ProductGroupDesc' 'age'] y:SalePrice, len:10) max_depth=None min_leaf_samples=1)
col_idxs_fn default
Node(1, 10.1, [0 1 2 3 4 5 6 7 8 9], 2.348, 6, (48, 107), (array([8, 9, 0]), array([2, 3, 7, 1, 4, 5, 6])), (9.263, 10.459))
Node(2, 9.263, [8 9 0], 0.105, 13, (0, 1), (array([0]), array([9, 8])), (9.473, 9.158))
Node(2, 10.459, [2 3 7 1 4 5 6], 1.013, 9, (0, 9), (array([2, 3, 7, 4, 5]), array([6, 1])), (10.316, 10.815))
Node(3, 9.473, [0])
Node(3, 9.158, [9 8], 0, 5, (1271894400, 1278979200), (array([9]), array([8])), (9.105, 9.21))
Node(3, 10.316, [2 3 7 4 5], 0.394, 4, (209, 216), (array([2]), array([3, 7, 5, 4])), (10.043, 10.384))
Node(3, 10.815, [6 1], 0, 0, (2003, 2007), (array([6]), array([1])), (10.714, 10.915))
Node(4, 9.105, [9])
Node(4, 9.21, [8])
Node(4, 10.043, [2])
Node(4, 10.384, [3 7 5 4], 0.105, 4, (11980, 28657), (array([3, 7, 5]), array([4])), (10.438, 10.222))
Node(4, 10.714, [6])
Node(4, 10.915, [1])
Node(5, 10.438, [3 7 5], 0.0, 2, (0, 6), (array([3, 7]), array([5])), (10.463, 10.389))
Node(5, 10.222, [4])
Node(6, 10.463, [3 7], 0, 5, (1313452800, 1313539200), (array([7]), array([3])), (10.463, 10.463))
Node(6, 10.389, [5])
Node(7, 10.463, [7])
Node(7, 10.463, [3])

Get predictions for all of the data we trained on.

Although we set min_leaf_samples=1, not every sample has it's own leaf. If 2 or more samples have;

the same values for all independent variables and
different values for the dependent variable,
they will end up in the same leaf (because we can't find a value to split on) that will predict the mean of the dependent variables for all samples in the leaf

So we expect preds to be nearly 100% correct;

loss to be nearly zero
predictions vs actual plots a strait line with just a little variation

test_tree = DecisionTree(train_data.tail(2000), min_leaf_samples=1).fit()
test_preds = test_tree.predict(test_tree.data.x)
loss = rmse(test_preds, test_tree.data.y); print('loss', loss)
import matplotlib.pyplot as plt
plt.scatter(test_preds, test_tree.data.y, alpha=.1);

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-86d72a994897> in <module>
----> 1 test_tree = DecisionTree(train_data.tail(2000), min_leaf_samples=1).fit()
      2 test_preds = test_tree.predict(test_tree.data.x)
      3 loss = rmse(test_preds, test_tree.data.y); print('loss', loss)
      4 import matplotlib.pyplot as plt
      5 plt.scatter(test_preds, test_tree.data.y, alpha=.1);

NameError: name 'train_data' is not defined

Get predictions for all of the data we trained on again - but allow a minimum of 5 items in each leaf. So we see;

a non-zero loss
some variance in the predictions vs actual plot

test_tree = DecisionTree(train_data.tail(2000), min_leaf_samples=5).fit()
test_preds = test_tree.predict(test_tree.data.x)
loss = rmse(test_preds, test_tree.data.y); print('loss', loss)
plt.scatter(test_preds, test_tree.data.y, alpha=.1);

loss 0.20120633092246643

Get predictions for the validation data - we don't expect a single tree to be very good

test_preds = test_tree.predict(valid_data.x)
loss = rmse(test_preds, valid_data.y); print('loss', loss)
plt.scatter(test_preds, valid_data.y, alpha=.1);

loss 0.5382252356612661

Tree Ensemble (AKA Random Forest)

Create a tree ensemble and check that it has initialized correctly

test_ensemble = TreeEnsemble(train_data.tail(2000), sample_size=750, min_leaf_samples=5)
assert test_ensemble.sample_size == 750 == test_ensemble.trees[0].data.x_rows == len(test_ensemble.trees[0].data.y)
assert len(test_ensemble.col_idxs_fn(test_ensemble.data.all_x_col_idxs)) == 9
assert len(test_ensemble.trees) == 10

Fit the ensemble and get predictions for all of the data we trained on - TODO: I'd expect this to be better than a single tree but the loss is the same.

test_ensemble.fit()
test_preds = test_ensemble.predict(test_ensemble.data.x)
loss = rmse(test_preds, test_ensemble.data.y); print('loss', loss)
plt.scatter(test_preds, test_ensemble.data.y, alpha=.1);

loss 0.25215256357954263

Get predictions for the validation data - expect this to be better than a single tree.

test_preds = test_ensemble.predict(valid_data.x)
loss = rmse(test_preds, valid_data.y); print('loss', loss)
plt.scatter(test_preds, valid_data.y, alpha=.1);

loss 0.5288661288172435

TODO

add classification capability

confidence based on tree pred variance
feature importance
- jumble single column -> create preds - which column makes preds the worst when jumbled
- WHAT are you forgetting?
  - is it which split/feature contributes the biggest change from "bias" to "pred"
- avg depth of feature in tree <- i just made this up
show dendogram of rank correlation
partial dependence
- ggplot if monotonic relationship
- do the "what if" preds - i.e. change year of sale to 1960 and see what things would have sold for
- pdp plot
tree interpret
- for single row pred: print contribution of each split (feature) to final result
- waterfall chart

Models

Decision Tree

`class` `Node`[source]

`best_split_for_col`[source]

`best_split`[source]

`class` `DecisionTree`[source]

`print_tree`[source]

`predict_row`[source]

Tree Ensemble (AKA Random Forest)

`class` `TreeEnsemble`[source]

Models

Decision Tree

class Node[source]

best_split_for_col[source]

best_split[source]

class DecisionTree[source]

print_tree[source]

predict_row[source]

Tree Ensemble (AKA Random Forest)

class TreeEnsemble[source]

`class` `Node`[source]

`best_split_for_col`[source]

`best_split`[source]

`class` `DecisionTree`[source]

`print_tree`[source]

`predict_row`[source]

`class` `TreeEnsemble`[source]