Functions for pre-processing data frames before feeding them into a decision tree or a similar model.
DataFrame pre-processing functions
This is all "borrowed" from https://github.com/fastai/fastai/blob/master/old/fastai/structured.py
# Build a tiny three-row fixture frame: an integer column, a string column,
# a daily date column, a float column with gaps, and an all-missing column.
dates = pd.date_range('2000-12-14', periods=3, freq='D')
df = pd.DataFrame(
    {
        'col1': [1, 2, 3],
        'col2': ['a', 'b', 'a'],
        'col3date': dates,
        'col4': [1.1, np.nan, None],
        'col5': [None, np.nan, None],
    }
)
# Optional numeric coercion of col1/col4, currently disabled:
# for i in [1,4]: df[f'col{i}'] = pd.to_numeric(df[f'col{i}'])
print(df)

# Run the fixture through proc_df ('col1' is presumably the target column --
# confirm against proc_df's signature) and keep the three returned values.
test_x, test_y, test_na_dict = proc_df(df, 'col1')
# Bare expression: no effect when run as a script; presumably left over from
# interactive use, where it displays the three results.
(test_x, test_y, test_na_dict)
# Alternative invocations, currently disabled:
# proced_df, y, na_dict = proc_df(df, 'col1')
# proc_df(df, 'col2', na_dict)
# Wrap the proc_df outputs in a DataWrapper and check its row/column indexes.
test_data = DataWrapper.from_pandas(test_x, test_y)
assert np.array_equal([0, 1, 2], test_data.all_x_row_idxs)
assert np.array_equal(list(range(16)), test_data.all_x_col_idxs)
assert np.array_equal(
    ([2, 1], [2, 3]),
    test_data.get_sample([1, 2], 0),
)

# Pass an array into sample_idxs to get a 2d array back - i.e. multiple rows of data.
assert test_data.x.shape == test_data.get_sample([0, 1, 2], None)[0].shape
# Pass an int into sample_idxs to get a 1d array back - i.e. one row of data.
assert test_data.x.shape[1] == test_data.get_sample(1, None)[0].shape[0]

# Row count of the full wrapper, then of its two-row head and tail.
assert test_data.x_rows == 3
test_head = test_data.head(2)
assert test_head.x_rows == 2
test_tail = test_data.tail(2)
assert test_tail.x_rows == 2

# Rebuild the wrapper from rows 0 and 2 of the previous one; the asserts below
# expect the new wrapper's row indexes to be [0, 1].
test_data = DataWrapper.from_data_wrapper(test_data, [0, 2])
assert test_data.x_rows == 2
assert np.array_equal([0, 1], test_data.all_x_row_idxs)
assert np.array_equal(
    ([1, 1], [1, 3]),
    test_data.get_sample([0, 1], 0),
)