NVIDIA · yidong72 · Jun 20, 2019 · Jun 18, 2019 · Jun 18, 2019 · Jun 19, 2019
diff --git a/gquant/dataframe_flow/node.py b/gquant/dataframe_flow/node.py
@@ -7,7 +7,29 @@
 import dask
 
 
-__all__ = ['Node']
+__all__ = ['Node', 'TaskSpecSchema']
+
+
+class TaskSpecSchema(object):
+    '''Outline fields expected in a dictionary specifying a task node.
+
+    :ivar id: unique id or name for the node
+    :ivar plugin_type: Plugin class i.e. subclass of Node. Specified as string
+        or subclass of Node
+    :ivar conf: Configuration for the plugin i.e. parameterization. This is a
+        dictionary.
+    :ivar modulepath: Path to python module for custom plugin types.
+    :ivar inputs: List of ids of other tasks or an empty list.
+    '''
+
+    uid = 'id'
+    plugin_type = 'type'
+    conf = 'conf'
+    modulepath = 'filepath'
+    inputs = 'inputs'
+
+    # load = 'load'
+    # save = 'save'
 
 
 class Node(object):

diff --git a/gquant/dataframe_flow/workflow.py b/gquant/dataframe_flow/workflow.py
@@ -6,7 +6,7 @@
 
 from .node import Node
 
-__all__ = ['run', 'save_workflow', 'load_workflow', 'viz_graph', 'get_graph']
+__all__ = ['run', 'save_workflow', 'load_workflow', 'viz_graph', 'build_workflow']
 
 
 DEFAULT_MODULE = "gquant.plugin_nodes"
@@ -109,72 +109,82 @@ def __find_roots(node, inputs, consider_load=True):
         __find_roots(i, inputs, consider_load)
 
 
-def get_graph(obj, replace):
+def build_workflow(task_list, replace=None):
     """
     compute the graph structure of the nodes. It will set the input and output
     nodes for each of the node
 
     Arguments
     -------
-    obj: list
-        a list of Python object that defines the nodes
+    task_list: list
+        A list of Python dicts. Each dict is a task node spec.
     replace: dict
         conf parameters replacement
+
     Returns
     -----
     dict
         keys are Node unique ids
-        values are Node objects
+        values are instances of Node subclasses i.e. plugins.
 
     """
-
-    obj_dict = {}
-    conf_dict = {}
+    replace = dict() if replace is None else replace
+    task_dict = {}
+    task_spec_dict = {}
     # instantiate objects
-    for o in obj:
-        if o['id'] in replace:
-            o = copy.deepcopy(o)
-            o.update(replace[o['id']])
-        if isinstance(o['type'], str):
-            if 'filepath' in o:
-                spec = importlib.util.spec_from_file_location(o['id'],
-                                                              o['filepath'])
+    for task_spec in task_list:
+        if task_spec['id'] in replace:
+            task_spec = copy.deepcopy(task_spec)
+            task_spec.update(replace[task_spec['id']])
+
+        if isinstance(task_spec['type'], str):
+            if 'filepath' in task_spec:
+                spec = importlib.util.spec_from_file_location(task_spec['id'],
+                                                              task_spec['filepath'])
                 mod = importlib.util.module_from_spec(spec)
                 spec.loader.exec_module(mod)
-                NodeClass = getattr(mod, o['type'])
+                NodeClass = getattr(mod, task_spec['type'])
             else:
-                NodeClass = getattr(mod_lib, o['type'])
-        elif issubclass(o['type'], Node):
-            NodeClass = o['type']
+                NodeClass = getattr(mod_lib, task_spec['type'])
+        elif issubclass(task_spec['type'], Node):
+            NodeClass = task_spec['type']
         else:
             raise "Not supported"
+
         load = False
         save = False
-        if 'load' in o:
-            load = o['load']
-        if 'save' in o:
-            save = o['save']
-        instance = NodeClass(o['id'], o['conf'], load, save)
-        obj_dict[o['id']] = instance
-        conf_dict[o['id']] = o
+
+        if 'load' in task_spec:
+            load = task_spec['load']
+
+        if 'save' in task_spec:
+            save = task_spec['save']
+
+        instance = NodeClass(task_spec['id'], task_spec['conf'], load, save)
+        task_dict[task_spec['id']] = instance
+        task_spec_dict[task_spec['id']] = task_spec
+
     # build the graph
-    for key in obj_dict:
-        instance = obj_dict[key]
-        for input_id in conf_dict[key]['inputs']:
-            input_instance = obj_dict[input_id]
+    for task_id in task_dict:
+        instance = task_dict[task_id]
+        for input_id in task_spec_dict[task_id]['inputs']:
+            input_instance = task_dict[input_id]
             instance.inputs.append(input_instance)
             input_instance.outputs.append(instance)
 
     # this part is to do static type checks
     raw_inputs = []
-    for k in obj_dict.keys():
-        __find_roots(obj_dict[k], raw_inputs, consider_load=False)
+    for k in task_dict.keys():
+        __find_roots(task_dict[k], raw_inputs, consider_load=False)
+
     for i in raw_inputs:
         i.columns_flow()
+
     # clean up the visited status for run computations
-    for key in obj_dict:
-        obj_dict[key].visited = False
-    return obj_dict
+    for task_id in task_dict:
+        task_dict[task_id].visited = False
+
+    return task_dict
 
 
 def viz_graph(obj):
@@ -197,7 +207,7 @@ def viz_graph(obj):
     return G
 
 
-def run(obj, outputs, replace={}):
+def run(obj, outputs, replace=None):
     """
     Flow the dataframes in the graph to do the data science computations.
 
@@ -206,40 +216,44 @@ def run(obj, outputs, replace={}):
     obj: list
         a list of Python object that defines the nodes
     outputs: list
-        a list of the leaf nodes that we need the final results
+        a list of the leaf node IDs for which to return the final results
     replace: list
         a dict that defines the conf parameters replacement
+
     Returns
     -----
     tuple
         the results corresponding to the outputs list
     """
-
-    obj_dict = get_graph(obj, replace)
+    replace = dict() if replace is None else replace
+    task_dict = build_workflow(obj, replace)
     output_node = Node('unique_output', {})
     # want to save the intermediate results
     output_node.clear_input = False
     results = []
     results_obj = []
     for o in outputs:
-        o_obj = obj_dict[o]
+        o_obj = task_dict[o]
         results_obj.append(o_obj)
         output_node.inputs.append(o_obj)
         o_obj.outputs.append(output_node)
 
     inputs = []
     __find_roots(output_node, inputs, consider_load=True)
     # now clean up the graph, removed the node that is not used for computation
-    for key in obj_dict:
-        current_obj = obj_dict[key]
+    for key in task_dict:
+        current_obj = task_dict[key]
         if not current_obj.visited:
             for i in current_obj.inputs:
                 i.outputs.remove(current_obj)
             current_obj.inputs = []
+
     for i in inputs:
         i.flow()
+
     for r_obj in results_obj:
         results.append(output_node.input_df[r_obj])
+
     # clean the results afterwards
     output_node.input_df = {}
     return tuple(results)