diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index fb076c38..00000000 Binary files a/.DS_Store and /dev/null differ diff --git a/README.md b/README.md index 986c863e..5e9469a5 100644 --- a/README.md +++ b/README.md @@ -1,37 +1,39 @@ -# DOI: https://zenodo.org/record/2579240 + https://zenodo.org/record/2579240 +# TreeSurgeon - Visualisation of Random Forest Regressor models +**TreeSurgeon** contains routines to visualise Random Forest Regressor models. The module takes model output files made by [`sklearn`](https://scikit-learn.org/)'s RandomForestRegressor implementation of the random forest regressor algorithm. The raw output files from [`sklearn`](https://scikit-learn.org/) models (`*.pkl`) first need to be converted to the input `.csv` files required by **TreeSurgeon** using the +`extract_models4TreeSurgeon.py` script in the +[`sparse2spatial`](https://github.com/tsherwen/sparse2spatial) module. -# Written for usage in: +# Quick Start -## "A machine learning based global sea-surface iodide distribution" +## Running -#### Authors: -Tomás Sherwen (1,2), Rosie J. Chance (2), Liselotte Tinel (2), Daniel Ellis (2), Mat J. Evans (1,2), and Lucy J. Carpenter (2) +- Process the saved Random Forest Regressor model `*.pkl` files into the `.csv` files that **TreeSurgeon** expects using the script in the [`sparse2spatial`](https://github.com/tsherwen/sparse2spatial) module. You will need to update some lines in the script as described there. +`python extract_models4TreeSurgeon.py` -(1) National Centre for Atmospheric Science, University of York, York, YO10 5DD, UK -(2) Wolfson Atmospheric Chemistry Laboratories, University of York, York, YO10 5DD, UK +- Place files in the [`csv`](https://github.com/wolfiex/TreeSurgeon/tree/master/csv) folder. -#### Citation: -Sherwen, T., Chance, R. J., Tinel, L., Ellis, D., Evans, M. J., and Carpenter, L. J.: A machine learning based global sea-surface iodide distribution, Earth Syst. Sci. 
Data Discuss., https://doi.org/10.5194/essd-2019-40, in review, 2019. +for composite files: + +`python start.py $NCPUS` -# Running -Place files in csv folder. +or for single dot files -`python start.py $NCPUS` -for composite files -`python start.py $NCPUS 1 ` -for single dot files +`python start.py $NCPUS 1` -This then runs in the background (no screen). To change edit 'show' option in main.js +- This then runs in the background (no screen). To change this, edit the `show` option in `main.js`. -# Set colours -see colours.json file +## Set colours +The colours are set in the `colours.json` file. -# Output -This is in the pdf folder. +## Output +This is in the [`pdfs`](https://github.com/wolfiex/TreeSurgeon/tree/master/pdfs) folder. -# Install +## Install ``` conda install nodejs npm install @@ -40,13 +42,17 @@ sudo npm install -g --save electron --unsafe-perm=true --allow-root - for merge - have imagemagick and ghostscript installed - -# Montage setup +## Montage setup python montage.py - - ## Example Output for Composite Graph +# Usage + +This package was initially written for use with the [`sparse2spatial`](https://github.com/tsherwen/sparse2spatial) package for work to predict sea-surface iodide concentrations [[*Sherwen et al.* 2019](https://doi.org/10.5194/essd-2019-40)]. However, it can be used for any Random Forest Regressor models made by [`sklearn`](https://scikit-learn.org/) and post-processed to **TreeSurgeon** input by [`sparse2spatial`](https://github.com/tsherwen/sparse2spatial). + + +## Reference +Sherwen, T., Chance, R. J., Tinel, L., Ellis, D., Evans, M. J., and Carpenter, L. J.: A machine learning based global sea-surface iodide distribution, Earth Syst. Sci. Data Discuss., https://doi.org/10.5194/essd-2019-40, in review, 2019. 
diff --git a/process_sklearn_models2csv_files.py b/process_sklearn_models2csv_files.py deleted file mode 100644 index 99c73aa3..00000000 --- a/process_sklearn_models2csv_files.py +++ /dev/null @@ -1,365 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- -""" -Function to process sklearn saved RandomForestRegressors to csv files, -which can then be read in by forrester's nope.js plotter functions. - -NOTE: - - The function get_RFR_dictionary is just pseudo code. It will need to updated by the user to provide a dictionary of values/models etc required by the other models provided here. The dictionary values required are stated in get_RFR_dictionary - -""" -from __future__ import print_function -import os -import glob -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt - - -def main(): - """ - Driver to make summary csv files from sklearn RandomForestRegressor models - """ - - # Get dictionaries of feature variables, model names etc... - RFR_dict = get_RFR_dictionary() - - # Extract the pickled sklearn RandomForestRegressor models to .dot files - extract_trees_to_dot_files() - - # Analyse the nodes in the models - # (This calls the main worker function "get_decision_point_and_values_for_tree") - analyse_nodes_in_models( RFR_dict=RFR_dict ) - - -def get_RFR_dictionary(): - """ - Read in RandomForestRegressor variables - - Returns - ------- - (dict) - - Notes - ------- - - This is just pseudo code listing the vaiables that are required to be in the - dictionary - """ - # Setup a dictionary object - RFR_dict = {} - # Add model names and models - # RFR_dict['models_dict'] = {'name of model': model, ...} - # Add testing features for models - # RFR_dict['testing_features_dict'] = {'name of model': testing features of model, ...} - # Add list of the topmodels (models to analyse) - # RFR_dict['topmodels'] = [...] 
- - return RFR_dict - - -def extract_trees_to_dot_files(folder=None, plot_tree=False, - Name_of_model='Example_model', testing_features=None, max_depth=None): - """ - Extract model trees to .dot files to be plotted in d3 - - Parameters - ------- - folder (str): the folder that the model output *.pkl files are - testing_features (list): list of the testing features in a given model - Name_of_model (str): Name of model in filename, used in read and saving - plot_tree (boolean): plot up the extracted tree - max_depth (int): depth up to which to extract - - Returns - ------- - (None) - """ - from sklearn.externals import joblib - from sklearn import tree - import os - # Get the location of the saved model - model_filename = "my_model_{}.pkl".format( extr_str ) - # open as random forst object ("rf") - rf = joblib.load(folder+model_filename) - # loop trees in forest and save to disk - for n, rf_unit in enumerate( rf ): - out_file='tree_{}_{}.dot'.format( Name_of_model, n ) - tree.export_graphviz(rf_unit, out_file=out_file, max_depth=max_depth, - feature_names=testing_features ) - # Also plot up? - if plot_tree: - os.system('dot -Tpng tree.dot -o tree.png') - - -def analyse_nodes_in_models( RFR_dict=None, depth2investigate=5 ): - """ - Analyse the nodes in a RFR model - - Parameters - ------- - RFR_dict (dictionary): diction of models, model names, features etc - (see get_RFR_dictionary function) - depth2investigate (int): depth up to which to build statistics on - - Returns - ------- - (None) - """ - import glob - # models to analyse? 
- models2compare = [ ] - topmodels = RFR_dict['topmodels'] - models2compare = topmodels - # Loop and analyse models2compare - for model_name in models2compare: - print( model_name ) - get_decision_point_and_values_for_tree( model_name=model_name, - RFR_dict=RFR_dict, depth2investigate=depth2investigate ) - # Loop and update the variable names - for model_name in models2compare: - print( model_name ) - # Now rename variables in columns - filestr = 'Oi_prj_features_of*{}*{}*.csv' - filestr = filestr.format( model_name, depth2investigate ) - csv_files = glob.glob(filestr) - for csv_file in csv_files: - df = pd.read_csv( csv_file ) - # save the .csv - df.to_csv( csv_file ) - - -def get_decision_point_and_values_for_tree( depth2investigate=3, - model_name='RFR(TEMP+DEPTH+SAL)', RFR_dict=None, verbose=True, - debug=False ): - """ - Get the variables driving decisions at each point - - NOTE: - link: http://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html -# The decision estimator has an attribute called tree_ which stores the entire -# tree structure and allows access to low level attributes. The binary tree -# tree_ is represented as a number of parallel arrays. The i-th element of each -# array holds information about the node `i`. Node 0 is the tree's root. NOTE: -# Some of the arrays only apply to either leaves or split nodes, resp. In this -# case the values of nodes of the other type are arbitrary! 
-# -# Among those arrays, we have: -# - left_child, id of the left child of the node -# - right_child, id of the right child of the node -# - feature, feature used for splitting the node -# - threshold, threshold value at the node -# - """ - from sklearn.externals import joblib - from sklearn import tree - import os - # extra variables needed from RFR_dict - models_dict = RFR_dict['models_dict'] - testing_features_dict = RFR_dict['testing_features_dict'] - # Extract model from dictionary - model = models_dict[ model_name ] - # Get training_features - training_features = testing_features_dict[ model_name ].split('+') - # Core string for saving data to. - filename_str = 'Oi_prj_features_of_{}_for_depth_{}{}.{}' - # Intialise a DataFrame to store values in - df = pd.DataFrame() - # Loop by estimator in model - for n_estimator, estimator in enumerate( model ): - # Extract core variables of interest - n_nodes = estimator.tree_.node_count - children_left = estimator.tree_.children_left - children_right = estimator.tree_.children_right - feature = estimator.tree_.feature - threshold = estimator.tree_.threshold - n_node_samples = estimator.tree_.n_node_samples - # The tree structure can be traversed to compute various properties such - # as the depth of each node and whether or not it is a leaf. - node_depth = np.zeros(shape=n_nodes, dtype=np.int64) - is_leaves = np.zeros(shape=n_nodes, dtype=bool) - stack = [(0, -1)] # seed is the root node id and its parent depth - # Now extract data - while len(stack) > 0: - node_id, parent_depth = stack.pop() - node_depth[node_id] = parent_depth + 1 - # If we have a test node - if (children_left[node_id] != children_right[node_id]): - stack.append((children_left[node_id], parent_depth + 1)) - stack.append((children_right[node_id], parent_depth + 1)) - else: - is_leaves[node_id] = True - # - work out which nodes are required. 
- # NOTE: numbering is 1=># of nodes (zero is the first node) - # add the initial node to a dictionary - nodes2save = {} - depth = 0 - n_node = 0 - nodes2save[ depth ] = { n_node: [children_left[0], children_right[0]] } - num2node = {0:0} - # For depth in depths - for depth in range( depth2investigate )[:-1]: - nodes4depth = {} - new_n_node = max( nodes2save[ depth ].keys() )+1 - for n_node in nodes2save[ depth ].keys(): - # Get nodes from the children of each node (LH + RH) - for ChildNum in nodes2save[ depth ][ n_node ]: - # Get the children of this node - LHnew = children_left[ ChildNum ] - RHnew = children_right[ ChildNum ] - # save to temp. dict - nodes4depth[ new_n_node ] = [ LHnew, RHnew ] - # increment the counter and - new_n_node += 1 - # Save the new nodes for depth with assigned number - nodes2save[ depth+1 ] = nodes4depth - # Get node numbers to save as a dict - for d in range( depth2investigate )[1:]: - if debug: print ( d, nodes2save[d] ) - for n in nodes2save[d-1].keys(): - if debug: print( n, nodes2save[d-1][n] ) - for nn in nodes2save[d-1][n] : - newnum = max( num2node.keys() ) +1 - num2node[ newnum ] = nn - # Make a series of values for estimators - s = pd.Series() - for node_num in sorted( num2node.keys() ): - # get index of node of interest - idx = num2node[node_num] - # save threadhold value - var_ = 'N{:0>4}: threshold '.format( node_num ) - s[var_] = threshold[ idx ] - # save feature (and convert index to variable name) - var_ = 'N{:0>4}: feature '.format( node_num ) - s[var_] = training_features[ feature[ idx ] ] - # save feature (and convert index to variable name) - var_ = 'N{:0>4}: n_node_samples '.format( node_num ) - s[var_] = n_node_samples[ idx ] - # save right hand children - var_ = 'N{:0>4}: RH child '.format( node_num ) - s[var_] = children_right[ idx ] - # save the left hand children - var_ = 'N{:0>4}: LH child '.format( node_num ) - s[var_] = children_left[ idx ] - # Also add general details for estimator - s['n_nodes'] = n_nodes 
- # now save to main DataFrame - df[n_estimator] = s.copy() - # Set index to be the estimator number - df = df.T - # Save the core data on the estimators - filename = filename_str.format( model_name, depth2investigate, '_ALL', '') - df.to_csv( filename+'csv' ) - # --- Print a summary to a file screen - dfs = {} - for node_num in sorted( num2node.keys() ): - # get index of node of interest - idx = num2node[node_num] - vars_ = [i for i in df.columns if 'N{:0>4}'.format(node_num) in i ] - # get values of inteest for nodes - FEATvar = [i for i in vars_ if 'feature' in i][0] - THRESvar = [i for i in vars_ if 'threshold' in i][0] - SAMPLEvar = [i for i in vars_ if 'n_node_samples' in i][0] -# RHChildvar = [i for i in vars_ if 'RH child' in i][0] -# LHChildvar = [i for i in vars_ if 'LH child' in i][0] -# print FEATvar, THRESvar - # Get value counts - val_cnts = df[FEATvar].value_counts() - df_tmp = pd.DataFrame( val_cnts ) - # Store the features and rename the # of tress column - df_tmp['feature'] = df_tmp.index - df_tmp.rename( columns={FEATvar:'# of trees'}, inplace=True ) - # Calc percent - df_tmp['%'] = val_cnts.values / float(val_cnts.sum()) *100. 
- # Save the children for node -# df_tmp['RH child'] = df[RHChildvar][idx] -# df_tmp['LH child'] = df[LHChildvar][idx] - # intialise series objects to store stats - s_mean = pd.Series() - s_median = pd.Series() - s_std = pd.Series() - node_feats = list(df_tmp.index) - s_samples_mean = pd.Series() - s_samples_median = pd.Series() - # Now loop and get values for features - for feat_ in node_feats: - # - Get threshold value for node + stats on this - thres_val4node = df[THRESvar].loc[ df[FEATvar]==feat_] - # make sure the value is a float - thres_val4node = thres_val4node.astype(np.float) - # convert Kelvin to degrees for readability - if feat_ == 'WOA_TEMP_K': - thres_val4node = thres_val4node -273.15 - # exact stats of interest - stats_ = thres_val4node.describe().T - s_mean[feat_] = stats_['mean'] - s_median[feat_] = stats_['50%'] - s_std[feat_] = stats_['std'] - # - also get avg. samples - sample_val4node = df[SAMPLEvar].loc[ df[FEATvar]==feat_] - # make sure the value is a float - sample_val4node = sample_val4node.astype(np.float) - stats_ = sample_val4node.describe().T - s_samples_mean = stats_['mean'] - s_samples_median = stats_['50%'] - # Add stats to tmp DataFrame - df_tmp['std'] = s_std - df_tmp['median'] = s_median - df_tmp['mean'] = s_mean - # set the depth value for each node_num - if node_num == 0: - depth = node_num - elif node_num in range(1,3): - depth = 1 - elif node_num in range(3,3+(2**2) ): - depth = 2 - elif node_num in range(7,7+(3**2) ): - depth = 3 - elif node_num in range(16,16+(4**2)): - depth = 4 - elif node_num in range(32,32+(5**2)): - depth = 5 - elif node_num in range(57,57+(6**2)): - depth = 6 - elif node_num in range(93,93+(7**2)): - depth = 7 - elif node_num in range(129,129+(8**2)): - depth = 8 - else: - print( 'Depth not setup for > n+8' ) - sys.exit() - df_tmp['depth'] = depth - df_tmp['node #'] = node_num - df_tmp['# samples (mean)'] = s_samples_mean - df_tmp['# samples (median)'] = s_samples_median - # Set the index to just a 
range - df_tmp.index = range( len(df_tmp.index) ) - # Save to main DataFrame - dfs[node_num] = df_tmp.copy() - # loop and save info to files - filename = filename_str.format( model_name, depth2investigate, '', 'txt') - a = open( filename, 'w' ) - for depth in range(depth2investigate): - # print summary - header = '--- At depth {:0>3}:'.format( depth ) - if verbose: - print( header ) - print( dfs[depth] ) - # save - print( header, file=a) - print( dfs[depth], file=a) - # close file to save data - a.close() - # --- Build a DataFrame with details on a node by node basis - # combine by node - keys = sorted( dfs.keys() ) - dfn = dfs[ keys[0] ].append( [dfs[i] for i in keys[1:] ] ) - # re index and order by - dfn.index = range( len(dfn.index ) ) - dfn.sort_values(by=['node #'], ascending=True, inplace=True) - filename = filename_str.format( model_name, depth2investigate, '', 'csv') - dfn.to_csv( filename ) - - -if __name__ == "__main__": - main() -